In [18]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
from pyspark.conf import SparkConf
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from functools import reduce
import json
import glob
import tempfile


# Build (or reuse, if one is already active) the notebook's Spark session
# against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [19]:
# Load the bronze bond table from parquet; the cells below read from this frame.
bronze_df = spark.read.format("parquet").load("../data/output/bronze/bond_info_bronze.parquet")

In [20]:
# Preview at most 100 rows of the bronze table.
bronze_df.show(100)

+----------+--------------------+----------+----------+-----+---+----+-----+----+--------------------+--------------------+-------+------------+----------+----------+----+------------+------------+----+----------+-----------+----------+----+----+----------+----------+
|       BS1|                 BS2|       BS3|       BS4|  BS5|BS6|BS11| BS12|BS13|                BS19|                BS20|   BS25|        BS26|      BS27|      BS28|BS29|        BS30|        BS31|BS32|      BS33|       BS34|      BS35|BS36|BS37|      BS38|      BS39|
+----------+--------------------+----------+----------+-----+---+----+-----+----+--------------------+--------------------+-------+------------+----------+----------+----+------------+------------+----+----------+-----------+----------+----+----+----------+----------+
|2020-02-23|LANTERNA FINANCE ...|5200000.00|5200000.00|False|EUR|null|False|0.00|0039 10 5794204 c...|0039 10 5794204 c...|CLASS B|IT0005333817|2019-10-28|2019-10-28| EUR|153000000.00|153000000

In [21]:
# Map each target table to its BS-numbered columns:
#   bond_info       -> BS1  .. BS10
#   collateral_info -> BS11 .. BS18
#   contact_info    -> BS19 .. BS24
# Only names that actually exist in bronze_df are kept.
bond_columns = {
    table: [name for name in (f'BS{idx}' for idx in span) if name in bronze_df.columns]
    for table, span in (
        ('bond_info', range(1, 11)),
        ('collateral_info', range(11, 19)),
        ('contact_info', range(19, 25)),
    )
}

In [26]:
# Create Bond Info table
def process_bond_info(df, cols_dict):
    """Build the bond-info table from the given DataFrame.

    Keeps only the columns listed under ``cols_dict["bond_info"]`` and replaces
    ``BS1`` (parsed with the ``yyyy-MM-dd`` pattern) by its Unix timestamp.

    Args:
        df: Source Spark DataFrame; it must contain every selected column.
        cols_dict: Mapping that holds the column names under the key
            ``"bond_info"``. The list has to include ``"BS1"``, otherwise the
            conversion cannot resolve the column.

    Returns:
        A new DataFrame with the selected columns. The converted ``BS1`` is
        appended under a temporary name before the original is dropped, so it
        ends up as the last column.
    """
    selected = df.select(cols_dict["bond_info"])
    # Parse BS1 as a timestamp, then express it via unix_timestamp.
    bs1_epoch = F.unix_timestamp(F.to_timestamp(F.col("BS1"), "yyyy-MM-dd"))
    with_epoch = selected.withColumn("tmp_BS1", bs1_epoch)
    # Swap the temporary column in for the original BS1.
    return with_epoch.drop("BS1").withColumnRenamed("tmp_BS1", "BS1")

In [27]:
# Create Collateral Info table
def process_collateral_info(df, cols_dict):
    """Build the collateral-info table from the given DataFrame.

    Selects ``BS1`` and ``BS2`` followed by the columns listed under
    ``cols_dict["collateral_info"]``, and replaces ``BS1`` (parsed with the
    ``yyyy-MM-dd`` pattern) by its Unix timestamp.

    Args:
        df: Source Spark DataFrame; it must contain every selected column.
        cols_dict: Mapping that holds a list of column names under the key
            ``"collateral_info"``.

    Returns:
        A new DataFrame with the selected columns. The converted ``BS1`` is
        appended under a temporary name before the original is dropped, so it
        ends up as the last column.
    """
    wanted = ["BS1", "BS2"] + cols_dict["collateral_info"]
    selected = df.select(wanted)
    # Parse BS1 as a timestamp, then express it via unix_timestamp.
    bs1_epoch = F.unix_timestamp(F.to_timestamp(F.col("BS1"), "yyyy-MM-dd"))
    with_epoch = selected.withColumn("tmp_BS1", bs1_epoch)
    # Swap the temporary column in for the original BS1.
    return with_epoch.drop("BS1").withColumnRenamed("tmp_BS1", "BS1")

In [28]:
# Create Contact Info table
def process_contact_info(df, cols_dict):
    """Build the contact-info table from the given DataFrame.

    Selects ``BS1`` and ``BS2`` followed by the columns listed under
    ``cols_dict["contact_info"]``, and replaces ``BS1`` (parsed with the
    ``yyyy-MM-dd`` pattern) by its Unix timestamp.

    Args:
        df: Source Spark DataFrame; it must contain every selected column.
        cols_dict: Mapping that holds a list of column names under the key
            ``"contact_info"``.

    Returns:
        A new DataFrame with the selected columns. The converted ``BS1`` is
        appended under a temporary name before the original is dropped, so it
        ends up as the last column.
    """
    wanted = ["BS1", "BS2"] + cols_dict["contact_info"]
    selected = df.select(wanted)
    # Parse BS1 as a timestamp, then express it via unix_timestamp.
    bs1_epoch = F.unix_timestamp(F.to_timestamp(F.col("BS1"), "yyyy-MM-dd"))
    with_epoch = selected.withColumn("tmp_BS1", bs1_epoch)
    # Swap the temporary column in for the original BS1.
    return with_epoch.drop("BS1").withColumnRenamed("tmp_BS1", "BS1")

In [29]:
# Write the three derived bond tables to the silver output directory as parquet.
# mode("overwrite") replaces any earlier output so this cell can be re-run;
# with the writer's default save mode the write raises once the path exists.
df = process_bond_info(bronze_df, bond_columns)
df.write.mode("overwrite").parquet("../data/output/silver/bonds/info.parquet")

df = process_collateral_info(bronze_df, bond_columns)
df.write.mode("overwrite").parquet("../data/output/silver/bonds/collateral_info.parquet")

df = process_contact_info(bronze_df, bond_columns)
df.write.mode("overwrite").parquet("../data/output/silver/bonds/contact_info.parquet")
