In [None]:
from datetime import date

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, lit

# Create (or reuse) the notebook-wide Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("edwin_app")
    .config("spark.some.config.option", "valor")
    .getOrCreate()
)

In [32]:

def read_data(path: str,
              header: bool = True,
              delimiter: str = ',',
              infer_schema: bool = False) -> DataFrame:
    """Read one delimited text file into a Spark DataFrame.

    Uses the notebook-level ``spark`` session. With the default arguments
    the first line is treated as the header and fields are split on commas,
    exactly as before.

    Args:
        path: Location of the file to load.
        header: Whether the first line holds the column names.
        delimiter: Field separator used in the file.
        infer_schema: When True, ask Spark to infer column types from the
            data; when False (default) no inference is requested.

    Returns:
        The loaded DataFrame.
    """
    return (spark.read
            .option('header', header)
            .option('delimiter', delimiter)
            .option('inferSchema', infer_schema)
            .csv(path))


def merging_all_tables(df1: DataFrame,
                       df2: DataFrame,
                       df3: DataFrame,
                       df4: DataFrame) -> DataFrame:
    """Left-join three lookup tables onto ``df1`` to build one wide table.

    The joins are applied in this order:
      1. ``df2`` on ``policy_type_id``
      2. ``df3`` on ``policy_lvl_id``
      3. ``df4`` on ``state_id``

    Every join is a left join, so all rows of ``df1`` are kept.

    Returns:
        The result of the three chained joins.
    """
    lookups = (
        (df2, 'policy_type_id'),
        (df3, 'policy_lvl_id'),
        (df4, 'state_id'),
    )

    merged = df1
    for lookup_df, join_key in lookups:
        merged = merged.join(lookup_df, on=join_key, how='left')

    return merged


def drop_unnecessary_columns(df: DataFrame) -> DataFrame:
    """Remove the three id columns that are not kept in the raw stage.

    The dropped columns are ``policy_type_id``, ``policy_lvl_id`` and
    ``state_id``.

    Returns:
        The result of ``df.drop(...)`` for those three columns.
    """
    id_columns = ('policy_type_id', 'policy_lvl_id', 'state_id')
    return df.drop(*id_columns)



def save_raw_data(df: DataFrame,
                  path: str) -> None:
    """Stamp the data with today's date and write it to the raw stage.

    A ``load_date`` column holding ``date.today()`` is appended to all
    existing columns, the frame is coalesced to a single partition, and the
    result is written as Parquet to ``path`` in ``overwrite`` mode.
    The date used is also printed.

    Args:
        df: Data to persist.
        path: Output location for the Parquet data.
    """
    load_date = date.today()
    print(load_date)

    # Keep every existing column and append the load date as a literal.
    stamped_df = df.select("*", lit(load_date).alias("load_date"))

    # One partition, replacing whatever is already stored at `path`.
    writer = stamped_df.coalesce(1).write.mode('overwrite')
    writer.parquet(path)

FUNCTION ORCHESTRATION

In [33]:
basepath = './data/edwin/tables'

# Load the four source CSV files; the order of names matches the order of
# the variables on the left-hand side.
(transactions_df,
 policies_levels_df,
 policies_types_df,
 states_df) = (
    read_data(f'{basepath}/{table_name}.csv')
    for table_name in ('transactions',
                       'policies_levels',
                       'policies_types',
                       'states')
)

In [34]:
# Preview the first rows of each source table without truncating values.
# `show()` prints the table itself and returns None, so it must not be
# wrapped in `print()` (that would emit a stray "None" after each table).
policies_types_df.show(3, truncate=False)
policies_levels_df.show(3, truncate=False)
states_df.show(3, truncate=False)
transactions_df.show(3, truncate=False)

+--------------+--------------+
|policy_type_id|Policy Type   |
+--------------+--------------+
|1             |Corporate Auto|
|2             |Personal Auto |
|3             |Special Auto  |
+--------------+--------------+

None
+-------------+------------+
|policy_lvl_id|Policy      |
+-------------+------------+
|1            |Corporate L3|
|2            |Personal L3 |
|3            |Corporate L2|
+-------------+------------+
only showing top 3 rows

None
+--------+----------+
|state_id|State     |
+--------+----------+
|1       |Washington|
|2       |Arizona   |
|3       |Nevada    |
+--------+----------+
only showing top 3 rows

None
+--------+-----------------------+--------+--------+---------+-----------------+----------------+------+------+-------------+--------------+--------------------+-----------------------+-----------------------------+-------------------------+------------------+----------------+-------------+------------------+-------------+------------+--------------+-

In [35]:
# Enrich the transactions with policy type, policy level and state.
all_tables_df = merging_all_tables(
    df1=transactions_df,
    df2=policies_types_df,
    df3=policies_levels_df,
    df4=states_df,
)

In [36]:
# Remove the id columns used as join keys now that the lookups are merged.
final_df = drop_unnecessary_columns(df=all_tables_df)

In [38]:
raw_stage_path = './outputs/edwin/raw_stage/all_policies_table'

# Persist the merged table (with its load date) to the raw stage.
save_raw_data(df=final_df, path=raw_stage_path)

2025-05-27


In [39]:
today_date = date.today()

# Read the raw stage back and display the rows stamped with today's date.
(spark.read
      .parquet(raw_stage_path)
      .where(col("load_date") == lit(today_date))
      .show(5, truncate=False))

+--------+-----------------------+--------+--------+---------+-----------------+----------------+------+------+-------------+--------------+--------------------+-----------------------+-----------------------------+-------------------------+------------------+----------------+-------------+------------------+-------------+------------+--------------+------------+----------+----------+
|Customer|Customer Lifetime Value|Response|Coverage|Education|Effective To Date|EmploymentStatus|Gender|Income|Location Code|Marital Status|Monthly Premium Auto|Months Since Last Claim|Months Since Policy Inception|Number of Open Complaints|Number of Policies|Renew Offer Type|Sales Channel|Total Claim Amount|Vehicle Class|Vehicle Size|Policy Type   |Policy      |State     |load_date |
+--------+-----------------------+--------+--------+---------+-----------------+----------------+------+------+-------------+--------------+--------------------+-----------------------+-----------------------------+---------

In [31]:
# Debug echo of the notebook-level `today_date` variable.
# NOTE(review): this cell's execution count (31) is lower than the cells
# above it, so it was run out of order — confirm it is still needed.
print(today_date)

2025-05-27


In [13]:
# Display the first five rows of the merged table without truncating values.
final_df.show(n=5, truncate=False)

+--------+-----------------------+--------+--------+---------+-----------------+----------------+------+------+-------------+--------------+--------------------+-----------------------+-----------------------------+-------------------------+------------------+----------------+-------------+------------------+-------------+------------+--------------+------------+----------+
|Customer|Customer Lifetime Value|Response|Coverage|Education|Effective To Date|EmploymentStatus|Gender|Income|Location Code|Marital Status|Monthly Premium Auto|Months Since Last Claim|Months Since Policy Inception|Number of Open Complaints|Number of Policies|Renew Offer Type|Sales Channel|Total Claim Amount|Vehicle Class|Vehicle Size|Policy Type   |Policy      |State     |
+--------+-----------------------+--------+--------+---------+-----------------+----------------+------+------+-------------+--------------+--------------------+-----------------------+-----------------------------+-------------------------+-----