In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as sqlf 
#col, lit, udf,sum,avg,max,min,mean,count, udf 
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType

In [2]:
# Build the configuration first and hand it to the session builder, so the
# application name is declared once and is actually applied to the session.
conf = SparkConf().setAppName('Stage - Ingest')
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Reuse the SparkContext owned by the session instead of asking for another
# one: once a context exists, a conf passed to getOrCreate is not applied.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/30 22:05:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [41]:
import json
import logging
from pandas import DataFrame
import os
import pendulum
from datetime import datetime

def get_jsonl_folders(path: str) -> list:
    """
    List the date-named sub-directories found directly inside ``path``.

    Every immediate sub-directory whose name parses as ``YYYY-MM-DD`` is
    converted to a ``datetime.date``. Plain files, and sub-directories whose
    names are not valid dates, are skipped.

    Args:
        path: Directory to scan. Only its direct children are inspected.

    Returns:
        A list of ``datetime.date`` objects, one per date-named
        sub-directory, in the (arbitrary) order ``os.scandir`` yields them.
        The list is empty when no such sub-directory exists.

    Raises:
        OSError: Propagated from ``os.scandir`` (e.g. ``FileNotFoundError``
            when ``path`` does not exist).
    """
    folder_dates = []
    # The context manager closes the scandir iterator as soon as we are done.
    with os.scandir(path) as entries:
        for entry in entries:
            if not entry.is_dir():
                continue
            try:
                # entry.name is the last path component, so the result does
                # not depend on how deeply ``path`` is nested on disk.
                parsed = datetime.strptime(entry.name, '%Y-%m-%d')
            except ValueError:
                # Not a date-named folder; ignore it rather than abort.
                continue
            folder_dates.append(parsed.date())
    return folder_dates
    

def get_latest_folder(folders: list) -> str:
    """
    Pick the greatest element of ``folders`` and return it as a string.

    The greatest element is chosen with ``max`` and converted with ``str``.
    """
    newest = max(folders)
    return str(newest)


def loadJsonData(json_path: str) -> 'pyspark.sql.DataFrame':
    """
    Load the most recent dated folder of JSON Lines files into a Spark DataFrame.

    The sub-directories of ``json_path`` are listed with
    ``get_jsonl_folders``, the latest one is selected with
    ``get_latest_folder``, and that folder is read with ``spark.read.json``
    using the module-level ``spark`` session.

    Args:
        json_path: Directory whose date-named sub-directories hold the
            JSON Lines files.

    Returns:
        The Spark DataFrame read from the latest sub-directory.

    Raises:
        FileNotFoundError: If ``json_path`` is not an existing directory.
            A critical log record is emitted before raising.
        ValueError: If no dated sub-directory is available (raised by
            ``max`` on an empty list inside ``get_latest_folder``).
    """
    # Validate the storage directory itself before scanning it, so a missing
    # path is reported through the log instead of surfacing as a raw
    # os.scandir error (or, worse, as a None result the caller calls .show() on).
    if not os.path.isdir(json_path):
        logging.critical('Path to jsonl files doesn\'t exist: %s', json_path)
        raise FileNotFoundError(f'Path to jsonl files doesn\'t exist: {json_path}')

    latest = get_latest_folder(get_jsonl_folders(json_path))
    return spark.read.json(os.path.join(json_path, latest))


def loadParquetData(parquet_path: str) -> 'pyspark.sql.DataFrame':
    """
    Load every Parquet file found under ``parquet_path`` into a Spark DataFrame.

    The directory is read recursively (``recursiveFileLookup``) with the
    module-level ``spark`` session, so files in all sub-directories are
    included.

    Args:
        parquet_path: Directory that contains the Parquet files, directly or
            in nested sub-directories.

    Returns:
        The Spark DataFrame read from ``parquet_path``.

    Raises:
        FileNotFoundError: If ``parquet_path`` is not an existing directory.
            A critical log record is emitted before raising.
    """
    # Check the directory itself; os.path.dirname() would test the parent
    # whenever the path is given without a trailing slash.
    if not os.path.isdir(parquet_path):
        logging.critical('Path to parquet file doesn\'t exist: %s', parquet_path)
        raise FileNotFoundError(f'Path to parquet file doesn\'t exist: {parquet_path}')

    # NOTE(review): the whole tree is read, not only the latest dated folder
    # as loadJsonData does -- confirm this is the intended scope.
    # 'header' is a CSV option and has no effect on Parquet, so it is omitted.
    return spark.read.option('recursiveFileLookup', 'true').parquet(parquet_path)


def loadCSVData(csv_path: str) -> 'pyspark.sql.DataFrame':
    """
    Load every CSV file found under ``csv_path`` into a Spark DataFrame.

    The directory is read recursively (``recursiveFileLookup``) with the
    module-level ``spark`` session, and the first line of each file is
    treated as the header row.

    Args:
        csv_path: Directory that contains the CSV files, directly or in
            nested sub-directories.

    Returns:
        The Spark DataFrame read from ``csv_path``.

    Raises:
        FileNotFoundError: If ``csv_path`` is not an existing directory.
            A critical log record is emitted before raising.
    """
    # Check the directory itself; os.path.dirname() would test the parent
    # whenever the path is given without a trailing slash.
    if not os.path.isdir(csv_path):
        logging.critical('Path to csv file doesn\'t exist: %s', csv_path)
        raise FileNotFoundError(f'Path to csv file doesn\'t exist: {csv_path}')

    # NOTE(review): the whole tree is read, not only the latest dated folder
    # as loadJsonData does -- confirm this is the intended scope.
    reader = spark.read.option('recursiveFileLookup', 'true').option('header', 'true')
    return reader.csv(csv_path)

In [42]:
# Ingest the JSON Lines storage folder through loadJsonData and preview the rows.
json_storage_path = '/Users/gonzo/Desktop/capstone_project/data_storage/json_storage/'
df = loadJsonData(json_path=json_storage_path)
df.show()

+-------+-------------------+------------------+---+-------------------+----+
| amount|customer_first_name|customer_last_name| id|                 ts|type|
+-------+-------------------+------------------+---+-------------------+----+
|$293.92|             Isabel|               Kim| 10|2022-09-29T16:15:36|   0|
+-------+-------------------+------------------+---+-------------------+----+



In [43]:
# Ingest the Parquet storage folder through loadParquetData and preview the rows.
parquet_storage_path = '/Users/gonzo/Desktop/capstone_project/data_storage/parquet_storage/'
df2 = loadParquetData(parquet_path=parquet_storage_path)
df2.show()

[Stage 10:>                                                         (0 + 1) / 1]

+---+----------+---------+----------+-------------------+--------+
| Id|First_name|Last_name|    Amount|          timestamp|Store_id|
+---+----------+---------+----------+-------------------+--------+
|408|     Julia|   Garcia|     $5.78|2022-09-29T05:10:10|      14|
| 45|   Jeffrey|    Wells| $5,706.18|2022-09-28T01:16:37|      15|
|788|   Richard|   Larsen|    $19.08|2022-09-28T12:26:47|      14|
|719|    Daniel|   Taylor|   $762.91|2022-09-29T10:51:51|       9|
|249|     James|    Smith|$26,894.90|2022-09-28T00:14:01|       5|
|466|   Deborah|     Wade|    $79.90|2022-09-28T21:14:59|       4|
|793|   William|    Smith|$27,596.28|2022-09-28T11:45:09|       7|
| 29|   Richard|     Ford|$48,422.07|2022-09-30T18:54:09|       7|
|824|     Tonya|   Farmer|     $0.60|2022-09-30T16:20:15|       5|
|151|   Michael|   Obrien|    $56.83|2022-09-27T23:12:57|       7|
|729| Elizabeth|    Scott|    $97.40|2022-09-29T13:30:30|      10|
|650|     Jason|     Ward|     $3.36|2022-09-28T02:55:47|     

                                                                                

In [44]:
# Ingest the CSV storage folder through loadCSVData and preview the rows.
# NOTE(review): originally labelled "RDBMS (PostgreSQL) data"; presumably these
# CSV files are an export of that database -- confirm.
csv_storage_path = '/Users/gonzo/Desktop/capstone_project/data_storage/csv_storage'
df3 = loadCSVData(csv_path=csv_storage_path)
df3.show()

+---+----------+---------+------------+-------------------+
| id|first_name|last_name|phone_number|            address|
+---+----------+---------+------------+-------------------+
|101|   Matthew|    Brown|  5269187503|       East Russell|
|102|   Anthony|    Patel|  0565170581|    New Angelaville|
|103|    Hannah|Fernandez|  6556807051|         Port James|
|104| Alejandra|    Flynn|  9902275041|         Moraleston|
|105|      Sean|   Lester|  8516957937|     North Courtney|
|106|   Maxwell|    Lewis|  7881815482|         West Kelly|
|107|   Kaitlin|   Warren|  6668531685|      North Kaitlyn|
|108|    Leslie| Ferguson|  9427440597|     Frederickmouth|
|109|  Samantha|     Mann|  8748732685|        Matthewside|
|110|    Joseph|   Morgan|  7850146994|           Adamside|
|111|     Scott| Trujillo|  2281479263|   West Samuelmouth|
|112|     James|  Simpson|  5820604215|          Brownstad|
|113|  Michelle|   Rogers|  2928540921|    Port Josephbury|
|114|   William|   Meyers|  1688619356| 