## Importing basic libraries for loading the big datasets which will be used for incremental loading: REVIEW and TIP

These small samples will be uploaded to the Data Lake (S3 bucket) and then loaded into the Data Warehouse once they are detected by the S3 sensors in Airflow.

In [None]:
#### LIBRARIES
# Notebook setup: imports, Spark environment variables, a local Spark session
# and display/logging settings used by the cells below.

#### BASIC
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os 
import json

# Point the process at the Java and Spark installations. These are set before
# findspark.init() is called (with no arguments) further down.
os.environ["JAVA_HOME"] = "/usr"
os.environ["SPARK_HOME"] = "/opt/spark"
#### SPARK
import findspark
findspark.init()
from pyspark.sql import SparkSession
import pyspark.pandas as ps
import databricks.koalas as ks
# Get or create a Spark session whose master is "local[*]".
spark = SparkSession.builder.master("local[*]").getOrCreate()

#### SETTINGS
# IPython magic: only valid when this cell runs inside a notebook/IPython kernel.
%matplotlib inline
# Turn Spark logging off for this session.
spark.sparkContext.setLogLevel("OFF")
# Use matplotlib as the plotting backend for pyspark.pandas.
ps.options.plotting.backend = 'matplotlib'

In [10]:
# Read the three raw datasets (one JSON document per line) into Koalas dataframes.
review = ks.read_json(
    path="./data/review.json",
    lines=True,
)
tip = ks.read_json(
    path="./data/tip.json",
    lines=True,
)
checkin = ks.read_json(
    path="./data/checkin.json",
    lines=True,
)

In [8]:
def transform_dates(dataframe, column, format):
    """
    Convert a timestamp column into formatted date strings, filling missing
    values with the most frequent date.

    The column is parsed with ``ks.to_datetime(..., errors='coerce')``, so values
    that cannot be parsed become missing. Each missing value is replaced by the
    first mode of the parsed column, and every value is rendered as a string
    using ``format``.

    Parameters:
    - dataframe: a Koalas dataframe
    - column: the name of the column containing timestamp values
    - format: the ``strftime`` format to which the column will be transformed

    Returns:
    - The transformed column as a series of formatted strings. The dataframe
      itself is not modified; the caller assigns the result back.

    NOTE(review): if no value in the column can be parsed there is no mode to
    fall back on, and the ``.iloc[0]`` lookup is expected to fail - confirm
    whether that case needs explicit handling.
    """
    parsed = ks.to_datetime(dataframe[column], errors='coerce')
    # Replacement text for rows whose timestamp could not be parsed.
    fallback = parsed.mode().iloc[0].strftime(format)
    # pd.isna() recognises every missing-value marker (NaT, None, NaN), whereas
    # an identity check against pd.NaT only matches that one singleton and would
    # let any other marker reach .strftime() and raise.
    return parsed.apply(
        lambda x: fallback if pd.isna(x) else x.strftime(format)
    )

In [None]:
# Normalise the review dates to 'YYYY-MM-DD' strings.
review['date'] = transform_dates(
    dataframe=review,
    column='date',
    format='%Y-%m-%d',
)

In [None]:
# Normalise the tip dates to 'YYYY-MM-DD' strings.
tip['date'] = transform_dates(
    dataframe=tip,
    column='date',
    format='%Y-%m-%d',
)

In [16]:
# Split the reviews at the 2021-10-01 cutoff. The two filters have to be
# complementary: with a strict comparison on both sides, every review dated
# exactly 2021-10-01 ended up in neither file.
# Small sample: the cutoff date and everything after it.
review_small = review[review['date'] >= '2021-10-01']
review_small.to_json('./data/review_small.json')
# Big sample: everything strictly before the cutoff date.
review_big = review[review['date'] < '2021-10-01']
review_big.to_json('./data/review_big.json')

                                                                                

In [19]:
# Split the tips at the 2021-10-01 cutoff. The two filters have to be
# complementary: with a strict comparison on both sides, every tip dated
# exactly 2021-10-01 ended up in neither file.
# Small sample: the cutoff date and everything after it.
tip_small = tip[tip['date'] >= '2021-10-01']
tip_small.to_json('./data/tip_small.json')
# Big sample: everything strictly before the cutoff date.
tip_big = tip[tip['date'] < '2021-10-01']
tip_big.to_json('./data/tip_big.json')

                                                                                

## Making a tiny sample to try with S3 BUCKET

TEST = used for testing
REST = uploaded on the day of the DEMO

In [4]:
import pandas as pd

# Load the tiny review sample (one JSON document per line).
review_small = pd.read_json('../AIRFLOW-SPARK/data/initial_load/reviews_tiny.json', lines=True)

# Split at 2022-01-01. The test set uses >= so that rows falling exactly on the
# cutoff are kept; with a strict comparison on both sides they were dropped
# from both subsets.
review_test = review_small[review_small['date'] >= '2022-01-01']
review_rest = review_small[review_small['date'] < '2022-01-01']

In [6]:
# Persist both review subsets next to the initial-load data.
# NOTE(review): to_json is called with its default options here, while the
# input was read with lines=True - confirm the consumer expects this layout.
review_test.to_json(
    path_or_buf='../AIRFLOW-SPARK/data/initial_load/review_test.json',
)
review_rest.to_json(
    path_or_buf='../AIRFLOW-SPARK/data/initial_load/review_rest.json',
)

In [1]:
import pandas as pd

# Load the tiny tip sample (one JSON document per line).
tip_small = pd.read_json('../AIRFLOW-SPARK/data/initial_load/tip_tiny.json', lines=True)

# Split at 2022-01-01. The test set uses >= so that rows falling exactly on the
# cutoff are kept; with a strict comparison on both sides they were dropped
# from both subsets.
tip_test = tip_small[tip_small['date'] >= '2022-01-01']
tip_rest = tip_small[tip_small['date'] < '2022-01-01']

In [2]:
# Persist both tip subsets next to the initial-load data.
# NOTE(review): to_json is called with its default options here, while the
# input was read with lines=True - confirm the consumer expects this layout.
tip_test.to_json(
    path_or_buf='../AIRFLOW-SPARK/data/initial_load/tip_test.json',
)
tip_rest.to_json(
    path_or_buf='../AIRFLOW-SPARK/data/initial_load/tip_rest.json',
)