In [42]:
from pyspark.sql import DataFrame, SparkSession

# Build the notebook-wide Spark session, or reuse one that is already running.
# NOTE(review): "spark.some.config.option" / "valor" looks like a placeholder
# setting -- confirm it is intentional.
spark = (
    SparkSession.builder
    .appName("edwin_app")
    .config("spark.some.config.option", "valor")
    .getOrCreate()
)

In [49]:

def read_data(path: str) -> DataFrame:
    """Load one comma-delimited CSV source with a header row.

    Uses the module-level ``spark`` session, so that session must exist
    before this function is called.

    Args:
        path: Location of the CSV data to read.

    Returns:
        A DataFrame whose column names come from the file's header row.
    """
    # Only the header and delimiter options are set; everything else is left
    # at the reader's defaults.
    csv_reader = spark.read.options(header=True, delimiter=',')
    return csv_reader.csv(path)


def merging_all_tables(df1: DataFrame,
                       df2: DataFrame,
                       df3: DataFrame,
                       df4: DataFrame) -> DataFrame:
    """Consolidate four tables into one wide table using left joins.

    Starting from ``df1``, each remaining table is left-joined onto the
    running result, in this order and on these keys:

    * ``df2`` on ``policy_type_id``
    * ``df3`` on ``policy_lvl_id``
    * ``df4`` on ``state_id``

    Args:
        df1: Base table; the left side of the first join.
        df2: Table joined on ``policy_type_id``.
        df3: Table joined on ``policy_lvl_id``.
        df4: Table joined on ``state_id``.

    Returns:
        The result of the three chained left joins.
    """
    # (table, join key) pairs, applied in order.
    lookups = (
        (df2, 'policy_type_id'),
        (df3, 'policy_lvl_id'),
        (df4, 'state_id'),
    )

    merged = df1
    for lookup_df, key in lookups:
        merged = merged.join(lookup_df, on=key, how='left')

    return merged


def drop_unnecessary_columns(df: DataFrame) -> DataFrame:
    """Remove the id columns that the raw-stage output does not keep.

    The dropped columns are ``policy_type_id``, ``policy_lvl_id`` and
    ``state_id``, the same names ``merging_all_tables`` joins on.

    Args:
        df: Table to strip the id columns from.

    Returns:
        The result of dropping the three id columns from ``df``.
    """
    id_columns = ('policy_type_id', 'policy_lvl_id', 'state_id')
    return df.drop(*id_columns)



def save_raw_data(df: DataFrame,
                  path: str) -> None:
    """Write the processed table to the raw stage as CSV.

    The data is coalesced to a single partition before writing, a header
    row is included, and the write uses ``overwrite`` mode.

    Args:
        df: Table to persist.
        path: Output location handed to the CSV writer.
    """
    single_partition = df.coalesce(1)
    writer = single_partition.write.option('header', True).mode('overwrite')
    writer.csv(path)

FUNCTION ORCHESTRATION

In [44]:
# Directory that holds the source CSV tables.
basepath = './data/edwin/tables'

# Read each table in turn; the unpacking order matches the tuple of names.
transactions_df, policies_levels_df, policies_types_df, states_df = (
    read_data(f'{basepath}/{table_name}.csv')
    for table_name in ('transactions',
                       'policies_levels',
                       'policies_types',
                       'states')
)

In [45]:
# Argument order matters: the 2nd, 3rd and 4th tables are joined on
# policy_type_id, policy_lvl_id and state_id respectively.
all_tables_df = merging_all_tables(
    transactions_df,
    policies_types_df,
    policies_levels_df,
    states_df,
)

In [46]:
# The joins are done, so strip the id columns before saving.
final_df = drop_unnecessary_columns(df=all_tables_df)

In [52]:
# Destination for the consolidated raw-stage CSV output.
raw_stage_path = './outputs/edwin/raw_stage/all_policies_table'

save_raw_data(df=final_df, path=raw_stage_path)

In [51]:
# Preview the first 5 rows without truncating long cell values.
final_df.show(n=5, truncate=False)

+--------+-----------------------+--------+--------+---------+-----------------+----------------+------+------+-------------+--------------+--------------------+-----------------------+-----------------------------+-------------------------+------------------+----------------+-------------+------------------+-------------+------------+--------------+------------+----------+
|Customer|Customer Lifetime Value|Response|Coverage|Education|Effective To Date|EmploymentStatus|Gender|Income|Location Code|Marital Status|Monthly Premium Auto|Months Since Last Claim|Months Since Policy Inception|Number of Open Complaints|Number of Policies|Renew Offer Type|Sales Channel|Total Claim Amount|Vehicle Class|Vehicle Size|Policy Type   |Policy      |State     |
+--------+-----------------------+--------+--------+---------+-----------------+----------------+------+------+-------------+--------------+--------------------+-----------------------+-----------------------------+-------------------------+-----