In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
from pyspark.conf import SparkConf
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from functools import reduce
import json
import glob
import tempfile


# Obtain a Spark session bound to the "local" master, reusing an existing
# session if one is already active.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [2]:
# Read the asset table stored as Parquet under the bronze output directory.
bronze_df = spark.read.format("parquet").load("../data/output/bronze/asset_bronze.parquet")

In [6]:
# Section name -> AS field codes whose number falls in that section's
# half-open range, restricted to codes that exist as columns of bronze_df.
assets_columns = {
    section: [
        f"AS{number}"
        for number in range(first, stop)
        if f"AS{number}" in bronze_df.columns
    ]
    for section, (first, stop) in {
        "general": (1, 15),
        "obligor_info": (15, 50),
        "loan_info": (50, 80),
        "interest_rate": (80, 100),
        "financial_info": (100, 115),
        "performance_info": (115, 146),
    }.items()
}

In [9]:
# Preview the first five rows of the bronze table.
bronze_df.show(5)

+------+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+-----+-----+----------+----+----+-----+----+----+-----+----+----+----+----+----+----+----+----+----------+----------+-----+--------+---------+---------+----+----+--------+------+----+----------+----+----------+----+----+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------+-----+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|   AS3|AS15|AS16| AS17|AS18|AS21|AS22|AS24|AS25|AS26|AS32|AS33|AS34|AS35|AS36| AS42|AS43|AS45|AS52|AS57|AS58|AS59|AS62|AS65|AS68|AS83|AS84|AS89|AS92|AS94|AS111|AS123|AS129|       AS1|AS19|AS20| AS23|AS27|AS28| AS29|AS30|AS31|AS37|AS38|AS39|AS40|AS41|AS44|      AS50|      AS51| AS53|    AS54|     AS55|     AS56|AS60|AS61|    AS63|  AS64|AS66|      AS67|AS69|     

In [11]:
# Type label -> list of AS field codes assigned to that label.
# Each code is built as "AS<number>" from the number tuples below; the order
# of the numbers is the order of the resulting list.
asset_col_type = {
    type_label: [f"AS{number}" for number in field_numbers]
    for type_label, field_numbers in {
        "date": (
            1, 19, 20, 31, 50, 51, 67, 70, 71, 87,
            91, 112, 124, 127, 130, 133, 134, 137,
        ),
        "string": (
            2, 3, 4, 5, 6, 7, 8, 15, 16, 17,
            18, 21, 22, 24, 25, 26, 32, 33, 34, 35,
            36, 42, 43, 45, 52, 57, 58, 59, 62, 65,
            68, 83, 84, 89, 92, 94, 111, 123, 129,
        ),
        "boolean": (
            23, 27, 28, 29, 30, 37, 38, 39, 40, 41,
            44, 53, 54, 55, 56, 60, 61, 63, 64, 66,
            69, 80, 81, 82, 85, 86, 88, 90, 93, 100,
            101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
            115, 116, 117, 118, 119, 120, 121, 122, 125, 126,
            128, 131, 132, 135, 136, 138,
        ),
    }.items()
}

In [16]:
# Create datetime dimension table
def process_dates(df, col_types_dict):
    """Build a table of distinct date values with calendar attributes.

    Args:
        df: Spark DataFrame to read date columns from.
        col_types_dict: Mapping with a "date" key listing candidate date
            column names; names not present in ``df.columns`` are ignored.

    Returns:
        A DataFrame with one row per distinct value found across the date
        columns, with columns ``date_col``, ``unix_date``, ``year``,
        ``month``, ``quarter``, ``WoY`` (week of year) and ``day`` (day of
        month), in that order.
    """
    present_date_cols = [name for name in col_types_dict["date"] if name in df.columns]

    # Stack every date column into a single "date_col" column, then keep each
    # distinct value once.
    # NOTE(review): a null in any source column presumably survives as one
    # all-null row here — confirm whether that row is wanted downstream.
    stacked = df.select(F.explode(F.array(present_date_cols)).alias("date_col"))
    result = stacked.dropDuplicates()

    date_value = F.col("date_col")
    derived_columns = [
        ("unix_date", F.unix_timestamp(date_value)),
        ("year", F.year(date_value)),
        ("month", F.month(date_value)),
        ("quarter", F.quarter(date_value)),
        ("WoY", F.weekofyear(date_value)),
        ("day", F.dayofmonth(date_value)),
    ]
    # Append the derived attributes one by one so the column order is fixed.
    for column_name, expression in derived_columns:
        result = result.withColumn(column_name, expression)
    return result

+----------+----------+----+-----+-------+---+---+
|  date_col| unix_date|year|month|quarter|WoY|day|
+----------+----------+----+-----+-------+---+---+
|2016-03-01|1456790400|2016|    3|      1|  9|  1|
|2045-04-01|2374617600|2045|    4|      2| 13|  1|
|2018-08-10|1533859200|2018|    8|      3| 32| 10|
|2019-11-01|1572566400|2019|   11|      4| 44|  1|
|2030-10-01|1917043200|2030|   10|      4| 40|  1|
|2043-10-01|2327270400|2043|   10|      4| 40|  1|
|2014-08-01|1406851200|2014|    8|      3| 31|  1|
|2025-02-01|1738368000|2025|    2|      1|  5|  1|
|2018-09-01|1535760000|2018|    9|      3| 35|  1|
|2006-12-01|1164931200|2006|   12|      4| 48|  1|
|2016-08-15|1471219200|2016|    8|      3| 33| 15|
|2016-08-31|1472601600|2016|    8|      3| 35| 31|
|2008-02-01|1201824000|2008|    2|      1|  5|  1|
|2017-02-26|1488067200|2017|    2|      1|  8| 26|
|2018-06-30|1530316800|2018|    6|      2| 26| 30|
|2018-11-01|1541030400|2018|   11|      4| 44|  1|
|2026-02-01|1769904000|2026|   

In [21]:
# Date-typed fields that belong to the "general" or "obligor_info" sections.
[
    field
    for field in asset_col_type["date"]
    if field in assets_columns["general"] or field in assets_columns["obligor_info"]
]


['AS1', 'AS19', 'AS20', 'AS31']

In [18]:
# Create Obligor Info dimension table
#
# Takes the "general" and "obligor_info" field groups from bronze_df and
# replaces every date-typed field among them with its Unix timestamp
# (F.unix_timestamp), keeping the field's original name.
obligor_cols = assets_columns["general"] + assets_columns["obligor_info"]

# Derive the date fields instead of hard-coding AS1/AS19/AS20/AS31: the
# section lists only contain fields that exist in bronze_df, so a hard-coded
# name would raise if that field were absent from the input.
obligor_date_cols = [c for c in asset_col_type["date"] if c in obligor_cols]

(
    bronze_df.select(
        # Non-date fields first, in their original relative order ...
        *[F.col(c) for c in obligor_cols if c not in obligor_date_cols],
        # ... then the converted date fields. This is the same column layout
        # the earlier withColumn / drop / withColumnRenamed chain produced.
        *[F.unix_timestamp(F.col(c)).alias(c) for c in obligor_date_cols],
    )
)

['AS19', 'AS20', 'AS31']