In [None]:
import pandas as pd

from itertools import chain
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, to_date, col, year, month, dayofmonth, sum as spark_sum, when, create_map, lit, explode

In [None]:
# Raise the pandas display limits (columns, rows, line width) to a very large
# value so frames rendered in this notebook are not truncated.
for display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(display_option, 10000000)

------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Defining Spark Session for pseudo-distributed computing:

In [None]:
# Obtain the notebook's Spark session under the 'Exploratory_Analysis' app name
# and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .appName('Exploratory_Analysis')
    .getOrCreate()
)
sc = spark.sparkContext
# Last expression of the cell: renders the SparkContext summary.
sc

# Loading the CSV file of daily fund prices.

In [None]:
# Relative path of the raw CSV input (the closing quote was missing, which made
# this cell a syntax error).
portfolio_path_file = 'data-resources/data.csv'
# Read the CSV with its first row as the header. No schema is supplied here;
# the columns are cast to their intended types in a later cell.
portfolio_data = spark.read.format("csv").options(header="true").load(portfolio_path_file)

# Normalizing the schema of the raw portfolio input data.
### Defining the typed column expressions for the portfolio dataframe:

In [None]:
# Column expressions used to project the raw CSV into typed columns:
#   * the first CSV column is parsed as a 'dd/MM/yyyy' date and exposed as
#     `operation_date`;
#   * every remaining column is cast to float.
# `to_date` already yields a date column, so the former
# date_format(..., 'yyyy-MM-dd').cast('date') string round-trip was redundant.
date_column, *price_columns = portfolio_data.columns
schema_portfolio = (
    [to_date(col(date_column), 'dd/MM/yyyy').alias('operation_date')]
    + [col(price_column).cast('float') for price_column in price_columns]
)

### Keeping only the rows whose operation date is not null:

In [None]:
# Discard rows whose raw date cell (first CSV column) is null, then project the
# remaining rows through the typed column expressions in `schema_portfolio`.
raw_date_column = portfolio_data.columns[0]
portfolio_data_ns = (
    portfolio_data
    .where(col(raw_date_column).isNotNull())
    .select(schema_portfolio)
)

# Print the resulting schema to check the casts.
portfolio_data_ns.printSchema()

In [None]:
partition_field_mod1 = ['operation_date']
writing_path_mod1 = '/data/core/fince/data/portfolioOptimization/price_wharehouse_transform/'
print('\nWriting parquets ...\n')
portfolio_data_ns.repartition(1).write.mode('overwrite').parquet(writing_path_mod1, partitionBy=partition_field_mod1)

%time
print('\nSUCCESS \nPARQUET DATA SAVED!')
print('\nNew root path table data:', writing_path_mod1+'operation_date=yyy-MM-dd', '\nparquet chunks portitioned by:', partition_field_mod1)

portfolio_path_parquet = '/data/core/fince/data/portfolioOptimization/price_wharehouse_transform/'
portfolio_df = spark.read.parquet(portfolio_path_parquet)

### Year parameters input array:

In [None]:
# Inclusive first and last year of the analysis window.
year_param_1 = 2016
year_param_2 = 2019
# Every year in the window, both endpoints included.
year_array = [*range(year_param_1, year_param_2 + 1)]
print('Year filter array parameters:', year_array)

In [None]:
# Calendar parts derived from `operation_date`; they are appended after all of
# the existing columns, in the order year, month, day.
calendar_parts = [
    year("operation_date").alias('year'),
    month("operation_date").alias('month'),
    dayofmonth("operation_date").alias('day'),
]
portfolio_dates = portfolio_df.select('*', *calendar_parts)

# Exploring the portfolio funds dataframe.

In [None]:
# Per-year count of non-null observations for every fund column.
#
# Columns are chosen by name rather than by position. `portfolio_df` is read
# back from parquet partitioned by `operation_date`, and Spark appends partition
# columns at the end of the schema, so the former positional slice
# `columns[1:-3]` silently skipped the first fund column.
derived_date_parts = ('year', 'month', 'day')
ticker_columns = [
    column_name
    for column_name in portfolio_dates.columns
    if column_name not in derived_date_parts and column_name != 'operation_date'
]
# `operation_date` is still counted, and kept last: later cells expect a
# trailing `ticker_operation_date` column (excluded with `[:-1]`, then dropped).
counted_columns = ticker_columns + ['operation_date']
count_tickers_agg = [
    spark_sum(when(col(column_name).isNotNull(), 1).otherwise(0)).alias('count_' + str(column_name))
    for column_name in counted_columns
]
portfolio_dates_agg = (
    portfolio_dates
    .groupBy('year')
    .agg(*count_tickers_agg)
    .orderBy('year')
)

In [None]:
# Convert the per-year count table to a pandas DataFrame so the notebook renders it.
portfolio_dates_agg.toPandas()

In [None]:
# For each `count_<name>` column of the yearly aggregate (everything after
# `year`), sum a 1/0 flag that is 1 when that year's count is positive.
# The result is aliased `ticker_<name>`: the slice [6:] removes the 6-character
# 'count_' prefix.
count_by_year = [
    spark_sum(when(col(count_column) > 0, 1).otherwise(0)).alias('ticker_' + str(count_column[6:]))
    for count_column in portfolio_dates_agg.columns[1:]
]

In [None]:
# Keep only the years listed in `year_array`, then apply the `count_by_year`
# aggregates across the remaining rows.
selected_years = portfolio_dates_agg.where(col('year').isin(*year_array))
portfolio_year_count = selected_years.select(*count_by_year)
# Render the result as a pandas DataFrame.
portfolio_year_count.toPandas()

In [None]:
# Map expression of column name -> column value over every column of
# `portfolio_year_count` except the last one. `create_map` receives a flat list
# that alternates key expression (the name as a literal) and value expression.
name_value_pairs = [
    expression
    for column_name in portfolio_year_count.columns[:-1]
    for expression in (lit(column_name), col(column_name))
]
field_and_values = create_map(name_value_pairs)

In [None]:
# Transpose the wide table into long form: exploding the map yields one
# (key, value) row per map entry, renamed to (ticker_fund, total_years_price).
# `ticker_operation_date` is selected alongside the exploded map and dropped at the end.
portfolio_T = (
    portfolio_year_count
    .select('ticker_operation_date', explode(field_and_values))
    .withColumnRenamed('key', 'ticker_fund')
    .withColumnRenamed('value', 'total_years_price')
    .drop('ticker_operation_date')
)

In [None]:
# Preview at most 10 rows of the long-form table as a pandas DataFrame.
portfolio_T.limit(10).toPandas()

In [None]:
# Number of rows of `portfolio_T` for each distinct `total_years_price` value,
# ordered by that value.
portfolio_T.groupBy('total_years_price').count().orderBy('total_years_price').show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------