In [None]:
import pandas as pd

from functools import reduce
from pyspark.sql.functions import date_format, to_date, col, year, month, dayofmonth, when, lag, udf, collect_list
from pyspark.sql.functions import sum as spark_sum, count as spark_count, avg as spark_avg
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession, Window

In [None]:
# Raise pandas' display limits so wide/long frames are rendered without truncation.
_DISPLAY_LIMIT = 10000000
for _display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(_display_option, _DISPLAY_LIMIT)

------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Defining Spark Session for pseudo-distributed computing:

In [None]:
# Create (or reuse) the Spark session named 'Transform_Data' and expose its SparkContext.
session_builder = SparkSession.builder.appName('Transform_Data')
spark = session_builder.getOrCreate()
sc = spark.sparkContext
# Bare last expression so the notebook displays the context summary.
sc

# Loading the CSV file of daily fund prices.

In [None]:
# Read the portfolio CSV, taking column names from its header row.
portfolio_path_file = 'data-resources/data.csv'
csv_reader = spark.read.format("csv").option("header", "true")
portfolio_data = csv_reader.load(portfolio_path_file)

In [None]:
# Read the small structure-sample CSV (header row supplies the column names).
test_d = "data-resources/data_structure_sample.csv"
sample_reader = spark.read.format("csv").option("header", "true")
test_df = sample_reader.load(test_d)

In [None]:
# Pull the whole sample frame to the driver as a pandas DataFrame for display.
test_df.toPandas()

## Casting the raw portfolio input data to a typed schema.

In [None]:
# Defining portfolio dataframe data:
# - the first column is parsed from 'dd/MM/yyyy' text into a date named 'operation_date'
#   (to_date already yields a date, so no string round-trip through date_format is needed);
# - every remaining column is cast to float.
date_column, *value_columns = portfolio_data.columns
schema_portfolio = [to_date(col(date_column), 'dd/MM/yyyy').alias('operation_date')]
schema_portfolio += [col(name).cast('float') for name in value_columns]

In [None]:
# Keep only the rows whose raw date column (the first column) is populated,
# then project them onto the typed schema.
raw_date_column = col(portfolio_data.columns[0])
portfolio_data_ns = portfolio_data.filter(raw_date_column.isNotNull()).select(schema_portfolio)

# Show the resulting column types.
portfolio_data_ns.printSchema()

In [None]:
# NOTE(review): the export of portfolio_data_ns to parquet (commented-out lines below) is
# disabled, so the read at the end of this cell only succeeds if the parquet files already
# exist at portfolio_path_parquet from an earlier run — confirm before Restart & Run All.
#partition_field_mod1 = ['operation_date']
#writing_path_mod1 = '/data/core/fince/data/portfolioOptimization/price_wharehouse_transform/'
#print('\nWriting parquets ...\n')
#portfolio_data_ns.repartition(1).write.mode('overwrite').parquet(writing_path_mod1, partitionBy=partition_field_mod1)

#%time
#print('\nSUCCESS \nPARQUET DATA SAVED!')
#print('\nNew root path table data:', writing_path_mod1+'operation_date=yyy-MM-dd', '\nparquet chunks portitioned by:', partition_field_mod1)

# Load the price table from the parquet location (same path as writing_path_mod1 above).
portfolio_path_parquet = '/data/core/fince/data/portfolioOptimization/price_wharehouse_transform/'
portfolio_df = spark.read.parquet(portfolio_path_parquet)

## Defining Year parameters input array:

In [None]:
# First and last calendar year (both inclusive) kept by the year filter.
year_param_1 = 2016
year_param_2 = 2019
year_array = [yr for yr in range(year_param_1, year_param_2 + 1)]
print('Year filter array parameters:', year_array)

In [None]:
# Append year / month / day columns derived from operation_date to every original column.
operation_date_parts = [
    year("operation_date").alias('year'),
    month("operation_date").alias('month'),
    dayofmonth("operation_date").alias('day'),
]
portfolio_dates = portfolio_df.select('*', *operation_date_parts)

# Cleaning the data and structuring the analytic base table.

In [None]:
def dates_index(dates_list):
    """
    Build a Spark UDF that maps a date to its position in ``dates_list``.

    :param dates_list: non-empty list of date values; each value's position in
        the list becomes its integer index (for repeated values the last
        position wins)
    :return: UDF returning IntegerType that looks a date up in that index
    :raises TypeError: if ``dates_list`` is not a list
    :raises ValueError: if ``dates_list`` is empty
    """
    # Built-in exceptions are used because no custom exception class is defined
    # or imported in this notebook; an undefined name here would surface as a
    # NameError instead of a meaningful validation error.
    if not isinstance(dates_list, list):
        raise TypeError('Invalid param')

    if not dates_list:
        raise ValueError('Empty param')

    dates_dict = {date: index for index, date in enumerate(dates_list)}
    # Strict lookup: a date that is not in dates_list raises KeyError inside the UDF.
    return udf(lambda x: dates_dict[x], IntegerType())

In [None]:
# Collect the distinct operation dates to the driver and sort them in ascending order.
distinct_date_rows = portfolio_dates.select('operation_date').distinct().collect()
operation_dates_list = sorted(row.operation_date for row in distinct_date_rows)
print("unique dates list:",len(operation_dates_list))

In [None]:
# UDF mapping an operation date to its position in the sorted list of distinct dates.
date_index_udf = dates_index(operation_dates_list)

In [None]:
# Restrict to the configured years and tag every row with the index of its operation date.
rows_in_selected_years = portfolio_dates.filter(col('year').isin(year_array))
debugging_portfolio = rows_in_selected_years.select(
    '*',
    date_index_udf(col('operation_date')).alias('date_id'),
)
# Preview the ten earliest rows.
debugging_portfolio.orderBy(col('operation_date')).limit(10).toPandas()

In [None]:
# Sum every column except the last five. The last four are year, month, day and date_id
# (appended above); the fifth-from-last is presumably operation_date — TODO confirm column order.
trailing_helper_columns = 5
long_cols = debugging_portfolio.columns[:-trailing_helper_columns]
count_by_col = [spark_sum(col(name)).alias(str(name)) for name in long_cols]
aggregate_columns = debugging_portfolio.select(*count_by_col)

In [None]:
# Display the per-column sums (at most five rows) as a pandas DataFrame.
aggregate_columns.limit(5).toPandas()

In [None]:
# removing none type data:
null_counts = aggregate_columns.select([spark_count(when(col(c).isNull(), c)).alias(c) for c in aggregate_columns.columns]).collect()[0].asDict()
drop_cols = [k for k, v in null_counts.items() if v > 0]
removed_errors = debugging_portfolio.drop(*drop_cols)

In [None]:
# Preview five rows of the frame after the null-sum columns were dropped.
removed_errors.limit(5).toPandas()

In [None]:
# removing NaN & fit vectors with no more than 10 NaN's (days):
missing_counter = removed_errors.select([spark_count(when(col(c).isNull(), c)).alias(c) for c in removed_errors.columns]).collect()[0].asDict()
drop_rude_missing = [k for k, v in missing_counter.items() if v >= 10]
remove_rude_missing = removed_errors.drop(*drop_rude_missing)

In [None]:
# Display the full filtered frame ordered by operation_date (collects every row to the driver).
remove_rude_missing.orderBy("operation_date").toPandas()

In [None]:
# Fill the remaining nulls of every column except operation_date with that column's mean.
averaged_columns = [name for name in remove_rude_missing.columns if name not in ['operation_date']]
column_means = [spark_avg(name).alias(name) for name in averaged_columns]
numerical_fields = remove_rude_missing.agg(*column_means)
fill_values = numerical_fields.first().asDict()
purifying_portfolio = remove_rude_missing.na.fill(fill_values)

In [None]:
# Row-over-row yield for each value column: value / previous value - 1,
# where "previous" is the preceding row when ordered by operation_date.
w = Window.orderBy("operation_date")
yield_cols = purifying_portfolio.columns[:-5]

yield_portfolio = purifying_portfolio
for col_name in yield_cols:
    previous_value = lag(yield_portfolio[col_name]).over(w)
    yield_portfolio = yield_portfolio.withColumn(col_name, yield_portfolio[col_name] / previous_value - 1)

# Discard rows where the first yield column is null (e.g. the earliest row, which has no predecessor).
yield_portfolio = yield_portfolio.where(col(yield_cols[0]).isNotNull())

In [None]:
# Print the column names and types of the yield frame.
yield_portfolio.printSchema()

In [None]:
# Display the full yield frame ordered by operation_date (collects every row to the driver).
yield_portfolio.orderBy("operation_date").toPandas()

In [None]:
# Keep everything except the last four columns (expected to be year, month, day, date_id).
exported_columns = yield_portfolio.columns[:-4]
yield_portfolio_df = yield_portfolio.select(*exported_columns)
yield_portfolio_df.orderBy("operation_date").toPandas()

In [None]:
# Summary statistics for six hard-coded columns.
# NOTE(review): this fails if any of these columns was dropped by the null filters above — confirm.
yield_portfolio_df.describe("SCOTIAG","AXESCP","BMERGOB","BMRGOB25","VALUEF4","BLKDIA7").toPandas()

# Writing Portfolio's Yield dataframe.

In [None]:
partition_field_mod2 = ['operation_date']
writing_path_mod2 = '/data/core/fince/data/portfolioOptimization/portfolio_yield_window/'

print('\nWriting parquets ...')
yield_portfolio_df.coalesce(1).write.mode('overwrite').parquet(writing_path_mod2, partitionBy=partition_field_mod2)

%time
print('\nSUCCESS \nPARQUET DATA SAVED!')
print('\nNew root path tabla data:', writing_path_mod2 + 'operation_date=yyy-MM-dd', '\nparquet chunks portitioned by:', partition_field_mod2)