In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as sqlf 
#col, lit, udf,sum,avg,max,min,mean,count, udf 
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType

In [2]:
spark = SparkSession.builder.appName('Stage - Ingest').getOrCreate()
conf = SparkConf().setAppName('Stage - Ingest')
sc = SparkContext.getOrCreate(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/30 22:05:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [116]:
import json
import logging
from pandas import DataFrame
import os
import pendulum
from datetime import datetime

def get_subdirectories(path: str) -> list:
    """
    Function that searchs for sub-directories inside a directory 
    then returns a list of all the directories names.
    """
    if path.split('/')[7] in ['customer', 'transaction']:
        subdirectories = [f.path for f in os.scandir(path) if f.is_dir()]
        subdirectories = [x.split('/')[8] for x in subdirectories]
        csv_directories = [directory.split('_')[1] for directory in subdirectories]
        return csv_directories
    else:
        subdirectories = [f.path for f in os.scandir(path) if f.is_dir()]
        subdirectories = [x.split('/')[7] for x in subdirectories]
        return [datetime.strptime(directory, '%Y-%m-%d').date() for directory in subdirectories] 
        

def get_latest_folder(folders: list) -> str:
    """
    Returns the latest date from the list.
    """
    return str(max(folders))


def loadJsonData(json_path: str) -> DataFrame:
    """
    Function that returns a dataframe from a valid directory that contains jsonlines files.
    """
    latest = get_latest_folder(get_subdirectories(json_path))
    if os.path.exists(os.path.dirname(os.path.join(json_path, latest))):
        df = spark.read.json(os.path.join(json_path, latest))
        return df
    else:
        return logging.critical('Path to jsonl files doesn\'t exist')


def loadParquetData(parquet_path: str) -> DataFrame:
    """
    Function that returns a dataframe from a valid directory that contains parquet files.
    """
    latest = get_latest_folder(get_subdirectories(parquet_path))
    if os.path.exists(os.path.dirname(parquet_path)):
        df = spark.read.option('recursiveFileLookup', 'true').option('header', 'true').parquet(parquet_path)
        return df
    else:
        return logging.critical('Path to parquet file deosn\'t exist')


def loadCSVData(csv_path: str) -> DataFrame:
    """
    Function that returns a dataframe from a valid directory that contains csv files.
    """
    latest = get_latest_folder(get_subdirectories(csv_path))
    if os.path.exists(os.path.dirname(csv_path)):
        df = spark.read.option('recursiveFileLookup', 'true').option('header', 'true').csv(csv_path)
        return df
    else:
        return logging.critical('Path to csv file doesn\'t exist')

In [120]:
# Ingesting jsonlines data
df = loadJsonData(json_path='/Users/gonzo/Desktop/capstone_project/data_storage/json_storage/')
df = df.select('id', 'ts', 'customer_first_name', 'customer_last_name', 'amount', 'type')
df.show()

+---+-------------------+-------------------+------------------+----------+----+
| id|                 ts|customer_first_name|customer_last_name|    amount|type|
+---+-------------------+-------------------+------------------+----------+----+
|237|2022-09-30T21:58:34|             Thomas|            Booker|    $12.67|   1|
|501|2022-09-30T20:29:21|              Karen|            Vaughn|     $2.85|   0|
|886|2022-10-02T06:15:44|             Donald|             Mason|     $0.76|   0|
|215|2022-10-01T18:21:19|            Jessica|           Hammond|$99,832.19|   0|
|715|2022-09-30T13:42:05|              Brent|           Walters|$84,840.26|   0|
|149|2022-10-01T08:08:07|            Stephen|           Daniels|   $895.46|   1|
|411|2022-10-02T00:40:34|               Jeff|            Murphy|    $66.89|   1|
|618|2022-10-01T18:39:25|              Wyatt|            Glover|$61,080.48|   1|
|401|2022-10-01T03:00:26|            Anthony|            Murray|$77,473.81|   0|
|692|2022-10-01T01:24:53|   

In [121]:
# Ingesting parquet data
df2 = loadParquetData(parquet_path='/Users/gonzo/Desktop/capstone_project/data_storage/parquet_storage/')
df2.show()

+---+----------+---------+----------+-------------------+--------+
| Id|First_name|Last_name|    Amount|          timestamp|Store_id|
+---+----------+---------+----------+-------------------+--------+
| 82|     Larry|Fernandez|    $42.60|2022-10-01T12:51:42|       3|
|817|     Tonya|   Krause|$47,909.91|2022-09-30T18:31:50|      20|
|507|  Danielle|   Garcia|$45,244.89|2022-09-30T08:39:06|       7|
|620|     Kevin|    Ortiz| $3,677.43|2022-09-29T15:52:24|      17|
|818|     Chloe|    Brown|    $51.67|2022-09-29T19:58:04|       8|
|849|      Cory|    Price|   $595.48|2022-09-29T16:26:44|      11|
|369|     David|    Green|   $454.53|2022-09-29T21:35:23|      10|
|764|    Shelby|  Miranda|   $527.20|2022-09-29T14:27:13|       5|
|431|      Tina|   Chavez|$58,514.39|2022-09-30T18:19:56|      14|
|155|   Natalie|Zimmerman|$91,667.09|2022-10-01T08:03:20|       3|
|656|   Kristen| Mcdonald|    $90.58|2022-09-30T03:48:51|       2|
|590|   Michael| Hamilton|   $834.01|2022-10-01T07:57:12|     

In [117]:
# Ingesting RDBMS (PostgreSQL) data
df3 = loadCSVData(csv_path='/Users/gonzo/Desktop/capstone_project/data_storage/pgdata/customer/')
df3.show()

+---+-----------+---------+------------+-----------------+
| id| first_name|last_name|phone_number|          address|
+---+-----------+---------+------------+-----------------+
| 21|     Alicia|   Glover|  6521283839|         Allenton|
|239|   Jennifer|  Herrera|  1300415237|         Allenton|
|781|      Jared|    Dixon|  2104822627|New Marthaborough|
|886|     Donald|   Wagner|  1620409568|    Port Loriview|
| 20|       Ruth|    Fritz|  5322972176|    Michelleburgh|
|220|    Gregory|    Hanna|  6268379314|      Melissafurt|
|291|     Yvonne|    Miles|  1944254819|     Brownchester|
|113|    Vincent|    Green|  1241766638|         Clayview|
|319|    Valerie|   Harper|  1601056191|      Veronicaton|
|651|    Jeffrey|Hernandez|  5212825535|      Melissafurt|
|890|      Jason|  Bentley|  5644645140|         Allenton|
|372|      Kevin|    Parks|  0137713202|       Grahamstad|
|202|Christopher|  Acevedo|  0821339386|New Marthaborough|
|810|      Jason|     Pena|  6767859520|Port Benjaminfur

In [118]:
# Ingesting RDBMS (PostgreSQL) data
df4 = loadCSVData(csv_path='/Users/gonzo/Desktop/capstone_project/data_storage/pgdata/transaction/')
df4.show()

+---+-----------+-------------------+----------+
| id|customer_id|     transaction_ts|    amount|
+---+-----------+-------------------+----------+
|329|        834|2022-10-03 07:04:41| $7,946.15|
|593|        511|2022-09-30 15:52:16|     $0.14|
|399|        106|2022-10-02 16:50:15|     $8.80|
|496|         15|2022-10-02 11:56:02|    $94.76|
| 17|        865|2022-10-03 06:36:50|   $834.31|
|677|        791|2022-10-01 21:54:38| $5,885.03|
|127|         53|2022-10-01 05:31:38| $9,117.86|
|629|        441|2022-10-02 17:50:04|   $379.96|
|142|        386|2022-10-03 04:25:02|     $3.95|
|752|        530|2022-10-02 21:02:32|$86,170.73|
|117|        123|2022-09-30 16:09:27|    $67.56|
|188|        134|2022-10-03 06:58:07|$33,515.66|
| 32|        779|2022-10-03 11:51:13|   $657.09|
|574|        326|2022-10-03 06:52:25|   $870.84|
|122|        313|2022-10-01 12:51:34|    $12.51|
|858|        597|2022-10-01 15:30:48|   $214.21|
|512|        401|2022-10-02 12:01:20|   $404.07|
|732|         70|202

In [147]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- ts: string (nullable = true)
 |-- customer_first_name: string (nullable = true)
 |-- customer_last_name: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- type: long (nullable = true)



In [143]:
df2.printSchema()

root
 |-- Id: long (nullable = true)
 |-- First_name: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- Amount: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- Store_id: long (nullable = true)



In [144]:
df3.printSchema()

root
 |-- id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- address: string (nullable = true)



In [145]:
df4.printSchema()

root
 |-- id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- transaction_ts: string (nullable = true)
 |-- amount: string (nullable = true)



In [169]:
df2 = df2.withColumnRenamed('First_name', 'customer_first_name')
df2 = df2.withColumnRenamed('Last_name', 'customer_last_name')
df2 = df2.withColumnRenamed('Amount', 'amount')
df = df.withColumnRenamed('ts', 'timestamp')
df = df.withColumnRenamed('Store_id', 'store_id')
df2.show()

+---+-------------------+------------------+----------+-------------------+--------+
| Id|customer_first_name|customer_last_name|    amount|          timestamp|Store_id|
+---+-------------------+------------------+----------+-------------------+--------+
| 82|              Larry|         Fernandez|    $42.60|2022-10-01T12:51:42|       3|
|817|              Tonya|            Krause|$47,909.91|2022-09-30T18:31:50|      20|
|507|           Danielle|            Garcia|$45,244.89|2022-09-30T08:39:06|       7|
|620|              Kevin|             Ortiz| $3,677.43|2022-09-29T15:52:24|      17|
|818|              Chloe|             Brown|    $51.67|2022-09-29T19:58:04|       8|
|849|               Cory|             Price|   $595.48|2022-09-29T16:26:44|      11|
|369|              David|             Green|   $454.53|2022-09-29T21:35:23|      10|
|764|             Shelby|           Miranda|   $527.20|2022-09-29T14:27:13|       5|
|431|               Tina|            Chavez|$58,514.39|2022-09-30

In [173]:
# Parquet & Json
unified = df.join(df2, ['id', 'customer_first_name', 'customer_last_name', 'timestamp', 'amount'], "fullouter")
unified = unified.select('id', 'type', 'store_id', 'amount', 'customer_first_name', 'customer_last_name', 'timestamp')
unified.show()

+---+----+--------+----------+-------------------+------------------+-------------------+
| id|type|store_id|    amount|customer_first_name|customer_last_name|          timestamp|
+---+----+--------+----------+-------------------+------------------+-------------------+
|  2|null|       6| $8,668.73|               Kurt|         Rodriguez|2022-09-30T08:55:53|
|  4|   0|    null|   $790.01|             Samuel|          Hamilton|2022-10-02T11:31:44|
|  7|   1|    null| $1,010.61|             Jeremy|              Buck|2022-10-01T16:27:35|
|  9|null|       7|    $13.65|              Nancy|              Hunt|2022-09-29T08:39:24|
|  9|   1|    null|   $561.87|          Stephanie|              Vega|2022-10-01T08:10:29|
| 11|null|       4|    $89.23|            Annette|             Zhang|2022-10-02T16:03:50|
| 12|null|       1| $9,676.99|            Michael|             Lewis|2022-10-01T11:33:34|
| 13|   0|    null|    $82.42|              Robin|            Santos|2022-10-03T02:35:39|
| 16|null|

In [175]:
# Both dataframes from postgresql
unified2 = df3.join(df4, ['id'], "fullouter")
unified2 = unified2.select('id', 'customer_id', 'amount', 'first_name', 'last_name', 'phone_number', 'address', 'transaction_ts')
unified2.show()

+---+-----------+---------+----------+---------+------------+---------------+-------------------+
| id|customer_id|   amount|first_name|last_name|phone_number|        address|     transaction_ts|
+---+-----------+---------+----------+---------+------------+---------------+-------------------+
| 10|       null|     null|    Edward| Williams|  4964165436|      Erikville|               null|
| 10|       null|     null|    Edward| Williams|  4964165436|      Erikville|               null|
|100|        360|  $309.26|      Sean|   Romero|  2613248986|      Hicksview|2022-10-01 07:54:12|
|100|        360|  $309.26|     James|  Johnson|  4648983930|    Melissafurt|2022-10-01 07:54:12|
|100|        360|  $309.26|      Sean|   Romero|  2613248986|      Hicksview|2022-10-01 07:54:12|
|100|        360|  $309.26|     James|  Johnson|  4648983930|    Melissafurt|2022-10-01 07:54:12|
|101|       null|     null|   Kathryn|    Burns|  5644166550|      Hicksview|               null|
|101|       null|   

In [176]:
unified.printSchema()

root
 |-- id: long (nullable = true)
 |-- type: long (nullable = true)
 |-- store_id: long (nullable = true)
 |-- amount: string (nullable = true)
 |-- customer_first_name: string (nullable = true)
 |-- customer_last_name: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [179]:
unified2 = unified2.withColumnRenamed('first_name', 'customer_first_name')
unified2 = unified2.withColumnRenamed('last_name', 'customer_last_name')
unified2 = unified2.withColumnRenamed('transaction_ts', 'timestamp')
unified2.printSchema()

root
 |-- id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- customer_first_name: string (nullable = true)
 |-- customer_last_name: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- address: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [187]:
unified_model = unified.join(unified2, ['id', 'amount', 'customer_first_name', 'customer_last_name', 'timestamp'], 'fullouter')
unified_model = unified_model.select('id', 'customer_id', 'store_id', 'type', 'amount', 'customer_first_name', 'customer_last_name', 'phone_number', 'address', 'timestamp')
unified_model.show(2134)

+---+-----------+--------+----+----------+-------------------+------------------+------------+-----------------+-------------------+
| id|customer_id|store_id|type|    amount|customer_first_name|customer_last_name|phone_number|          address|          timestamp|
+---+-----------+--------+----+----------+-------------------+------------------+------------+-----------------+-------------------+
|  2|       null|    null|null|      null|             George|           Whitney|  3460610183|     West Nichole|               null|
|  2|       null|    null|null|      null|             George|           Whitney|  3460610183|     West Nichole|               null|
|  3|       null|    null|null|      null|              Scott|         Mcfarland|  3836366169|         Clayview|               null|
|  3|       null|    null|null|      null|              Scott|         Mcfarland|  3836366169|         Clayview|               null|
|  5|       null|    null|null|      null|           Jennifer|       

In [204]:
unified_model.write.csv('/Users/gonzo/Desktop/capstone_project/data_storage/storage/test1')