In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import chain
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import date_format, to_date, col, year, month, dayofmonth, sum as spark_sum, when, create_map, lit, explode, udf

In [None]:
# Lift pandas' display limits so wide/long frames are rendered without truncation.
_DISPLAY_LIMIT = 10000000
for _option_name in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(_option_name, _DISPLAY_LIMIT)

------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Defining Spark Session for pseudo-distributed computing:

In [None]:
# Get (or start) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('Exploratory_Analysis')
    .getOrCreate()
)
sc = spark.sparkContext
sc  # last expression of the cell: displays the SparkContext summary

# Loading the CSV file of daily fund prices.

In [None]:
# Raw daily prices. The first CSV line is used as the header; no schema is
# supplied here, the columns are cast explicitly in a later cell.
portfolio_path_file = 'data-resources/data.csv'
portfolio_data = (
    spark.read
    .format("csv")
    .options(header="true")
    .load(portfolio_path_file)
)

In [None]:
# Preview the first five raw rows as a pandas frame (rich notebook display).
portfolio_data.limit(5).toPandas()

# Cleaning the schema of the raw portfolio input data.
### Defining portfolio dataframe data:

In [None]:
# Column expressions applied to the raw CSV frame:
#   * first column  -> parsed from 'dd/MM/yyyy' and exposed as a date named 'operation_date'
#   * other columns -> cast to float, keeping their original names
operation_date_expr = date_format(
    to_date(col(portfolio_data.columns[0]), 'dd/MM/yyyy'),
    'yyyy-MM-dd',
).cast('date').alias('operation_date')

price_exprs = [col(name).cast('float') for name in portfolio_data.columns[1:]]

schema_portfolio = [operation_date_expr] + price_exprs

### Keeping only the rows whose operation date is not null:

In [None]:
# Drop rows whose (raw) first column is null, then apply the typed projection.
date_is_present = col(portfolio_data.columns[0]).isNotNull()
portfolio_data_ns = portfolio_data.where(date_is_present).select(schema_portfolio)

portfolio_data_ns.printSchema()

In [None]:
# One-off export of the cleaned frame to parquet, partitioned by operation_date.
# It is left commented out, so the read below relies on the files already being
# present at that path.
# NOTE(review): the path is absolute and environment-specific — confirm it exists
# before running on a fresh machine.
#partition_field_mod1 = ['operation_date']
#writing_path_mod1 = '/data/core/fince/data/portfolioOptimization/price_wharehouse_transform/'
#print('\nWriting parquets ...\n')
#portfolio_data_ns.repartition(1).write.mode('overwrite').parquet(writing_path_mod1, partitionBy=partition_field_mod1)

#%time
#print('\nSUCCESS \nPARQUET DATA SAVED!')
#print('\nNew root path table data:', writing_path_mod1+'operation_date=yyy-MM-dd', '\nparquet chunks portitioned by:', partition_field_mod1)

# Load the parquet data set; every later cell works from `portfolio_df`
# rather than from `portfolio_data_ns`.
# NOTE(review): later cells slice columns by position, so the column order of
# this read matters — presumably the partition column `operation_date` is not
# the first column after the read; verify with portfolio_df.columns.
portfolio_path_parquet = '/data/core/fince/data/portfolioOptimization/price_wharehouse_transform/'
portfolio_df = spark.read.parquet(portfolio_path_parquet)

### Year parameters input array:

In [None]:
# Inclusive range of years kept by the per-year filters below.
year_param_1 = 2016
year_param_2 = 2019
year_array = [*range(year_param_1, year_param_2 + 1)]
print('Year filter array parameters:', year_array)

In [None]:
# Append year / month / day columns derived from operation_date (they end up as
# the last three columns) and sort chronologically.
date_part_columns = [
    year("operation_date").alias('year'),
    month("operation_date").alias('month'),
    dayofmonth("operation_date").alias('day'),
]
portfolio_dates = portfolio_df.select('*', *date_part_columns).orderBy("operation_date")

In [None]:
# Preview the first five rows, now including the derived year/month/day columns.
portfolio_dates.limit(5).toPandas()

# Exploring the portfolio funds dataframe.

In [None]:
# For each selected column, count the non-null observations per year.
# The slice skips the first column and the trailing year/month/day columns.
counted_columns = portfolio_dates.columns[1:-3]
count_tickers_agg = [
    spark_sum(when(col(name).isNotNull(), 1).otherwise(0)).alias('count_' + str(name))
    for name in counted_columns
]
portfolio_dates_agg = portfolio_dates.groupBy('year').agg(*count_tickers_agg)

In [None]:
# For each per-year count column, count the years that have at least one
# observation; the alias drops the leading 'count_' (6 characters) prefix.
count_by_year = [
    spark_sum(when(col(count_name) > 0, 1).otherwise(0)).alias(str(count_name[6:]))
    for count_name in portfolio_dates_agg.columns[1:]
]

In [None]:
# Restrict to the configured years, then aggregate the per-year counts.
rows_in_year_range = portfolio_dates_agg.where(col('year').isin(*year_array))
portfolio_year_count = rows_in_year_range.select(*count_by_year)

In [None]:
# Map column of {column name -> column value} over every column except the last,
# laid out as the flat [key, value, key, value, ...] list create_map expects.
field_and_values = create_map([
    expression
    for name in portfolio_year_count.columns[:-1]
    for expression in (lit(name), col(name))
])

In [None]:
# Transpose: explode the map into one (key, value) row per mapped column and
# give the generated columns meaningful names.
exploded_counts = portfolio_year_count.select('operation_date', explode(field_and_values))
portfolio_T = (
    exploded_counts
    .withColumnRenamed('key', 'ticker_fund')
    .withColumnRenamed('value', 'total_years_price')
)

In [None]:
# Number of funds for each value of total_years_price, in ascending order.
portfolio_T.groupBy('total_years_price').count().orderBy('total_years_price').show()

In [None]:
# Keep the funds whose total_years_price is at least 4; cached because the
# result is collected to the driver in the next cell.
has_enough_years = col("total_years_price") >= 4
historical_fields = portfolio_T.filter(has_enough_years).select("ticker_fund").cache()

In [None]:
# Bring the selected fund names to the driver as a plain Python list.
df_set = historical_fields.collect()
new_array = [record[0] for record in df_set]

In [None]:
%matplotlib inline
# Collect the selected fund columns into pandas and draw one 50-bin histogram
# per column.
X = portfolio_dates.select(new_array).toPandas()
X.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Pairwise correlation heatmap of the selected funds.
plt.figure(figsize=(12,10))
cor = X.corr()
# Hide the lower triangle and the diagonal (redundant in a symmetric matrix).
# The mask is built from the matrix *shape*, not from the correlation values:
# a value-based mask would leave any lower-triangle correlation of exactly 0
# unmasked, because 0 is falsy.
mask = np.tril(np.ones_like(cor, dtype=bool))
sns.heatmap(cor, vmin=-1, vmax=1, mask=mask, cmap='coolwarm')
plt.show()

In [None]:
def plot_series(series, w=15, h=5):
    """
    Draw a sequence of values as a line plot and show it.

    :param series: sequence of values to plot (x axis is the position in the sequence)
    :param w: figure width in inches
    :param h: figure height in inches
    :return: None
    """
    current_figure = plt.gcf()
    current_figure.set_size_inches(w, h)
    plt.plot(series)
    plt.show()

In [None]:
# Plot the non-null values of a single fund as a series.
feature = "GBMGUBL"
spark_collection_1 = (
    portfolio_dates
    .select(feature)
    .where(col(feature).isNotNull())
    .collect()
)
collection_1 = [record[0] for record in spark_collection_1]

print("\nseries:", feature)
plot_series(collection_1, 20, 5)

In [None]:
def logarithm_scale(real_number):
    """
    Natural logarithm of a value, with a best-effort fallback.

    :param real_number: value convertible to float
    :return: float, ln(real_number); 0.0 when the value cannot be converted
             or the logarithm cannot be computed (e.g. None, zero, negatives)
    """
    try:
        return math.log(float(real_number))
    except Exception:
        # Deliberate fallback: any failure maps to 0.0 instead of raising.
        return float(0)

In [None]:
# Wrap the Python function as a Spark UDF returning a double.
logarithmic_scale_udf = udf(logarithm_scale, DoubleType())

# Build the input frame here so this cell runs on a fresh kernel: the same frame
# is also created in the frequency-table section further down, and relying on
# that later cell made this one fail with NameError under "Restart & Run All".
feature_analysis = portfolio_dates.select(new_array).where(col("TASA").isNotNull())

# Log-scale every selected column; outputs are named "log_<column>".
scaled_feature = feature_analysis.select(
    *[logarithmic_scale_udf(col(c)).alias("log_" + c) for c in feature_analysis.columns]
)

In [None]:
# Plot one log-scaled column as a series.
log_feature = "log_TASA"
spark_collection_2 = scaled_feature.select(log_feature).collect()
collection_2 = [record[0] for record in spark_collection_2]

print("\nseries:", log_feature)
plot_series(collection_2, 20, 5)

In [None]:
%matplotlib inline
# Collect the log-scaled columns into pandas and draw one 50-bin histogram per
# column. Note that this rebinds `X`, which an earlier cell used for the
# unscaled data.
X = scaled_feature.toPandas()
X.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
def class_mark(observation, min_val, class_length, bins):
    """
    Computes the zero-based bin index of an observation, given the data set minimum,
    the class length and the number of bins.

    :param observation: float, value whose bin number is required
    :param min_val: float, minimum value observed in the rdd
    :param class_length: float, length of each sub interval
    :param bins: int, number of sub intervals
    :return: int, bin corresponding to the given observation; the maximum value
             (and anything beyond it) is assigned to the last bin, ``bins - 1``
    """
    if class_length == 0:
        # Degenerate range (all observations equal): there is a single
        # zero-width class, so avoid dividing by zero and use the first bin.
        return 0

    interval = int((observation - min_val) / class_length)
    # The maximum lands exactly on the upper edge of the last class; clamp it
    # into the last bin instead of creating an extra one.
    return min(interval, bins - 1)

In [None]:
def frequency_rdd_continuous(data_set_rdd, min_val, class_length, bins, n):
    """
    Builds the frequency table rdd for a continuous column rdd.

    Each output element is a tuple
    ``(bin number starting at 1, lower limit, upper limit, absolute frequency, relative frequency)``.

    :param data_set_rdd: rdd, values of the continuous column whose histogram is computed
    :param min_val: float, minimum value observed in the rdd
    :param class_length: float, length of each sub interval
    :param bins: int, number of sub intervals
    :param n: int, table length (denominator of the relative frequency)
    :return: rdd, one tuple per non-empty class of the histogram
    """
    def to_bin_and_one(value):
        # Key every observation by its zero-based bin index.
        return class_mark(value, min_val, class_length, bins), 1

    def to_table_row(bin_and_count):
        bin_index = bin_and_count[0]
        absolute_frequency = bin_and_count[1]
        return (
            bin_index+1,
            min_val+class_length*bin_index,
            min_val+class_length*(bin_index+1),
            absolute_frequency,
            absolute_frequency/n,
        )

    counts_per_bin = data_set_rdd.map(to_bin_and_one).reduceByKey(lambda left, right: left + right)
    return counts_per_bin.map(to_table_row)

In [None]:
def frequency_table_continuous(data_set_df, column, bins=None, suffix=''):
    """
    Computes the histogram frequency table from a column with continuous values for a table Dataframe

    Null values of ``column`` are ignored. When ``bins`` is not given it is chosen
    with Sturges' rule, ``1 + 3.322 * log10(n)``.

    :param data_set_df: Dataframe, table of which it is required to calculate the frequency histogram of some of
    its columns
    :param column: string, column with continuous values which is required to calculate its histogram
    :param bins: int, number of sub intervals
    :param suffix: string, assign the suffix to each column of the frequency table
    :return: Dataframe with the histogram frequency table: bin, lower/upper limits, absolute (fa_) and
    relative (f_) frequencies, plus their cumulative sums (Fa_cumulative_ / F_cumulative_)
    :raises ValueError: if the column holds no non-null values
    """
    abs_freq_name = 'fa_'+column+suffix
    rel_freq_name = 'f_'+column+suffix
    freq_schema = ['bin', 'lower_limit'+suffix, 'upper_limit'+suffix, abs_freq_name, rel_freq_name]

    # Running totals over the bins in ascending order (first bin .. current bin).
    window_freq = Window.orderBy('bin').rangeBetween(Window.unboundedPreceding, 0)

    cumulative_rel_freq = spark_sum(rel_freq_name)\
        .over(window_freq)\
        .alias('F_cumulative_'+column+suffix)

    cumulative_abs_freq = spark_sum(abs_freq_name)\
        .over(window_freq)\
        .alias('Fa_cumulative_'+column+suffix)

    # Nulls cannot be binned (and break max()/min() on the driver), so drop them first.
    data_set_rdd = data_set_df.select(column)\
        .where(col(column).isNotNull())\
        .rdd.map(lambda row: row[0])
    n = data_set_rdd.count()

    if n == 0:
        raise ValueError("Column '{}' has no non-null values to build a frequency table".format(column))

    if bins is None:
        # Sturges' rule: k = 1 + log2(n) = 1 + 3.322 * log10(n).
        # The 3.322 factor only applies to the base-10 logarithm.
        bins = 1 + int(3.322 * np.log10(n))

    maximum = data_set_rdd.max()
    minimum = data_set_rdd.min()
    class_length = (maximum - minimum) / bins

    frequency_table_df = frequency_rdd_continuous(data_set_rdd, minimum, class_length, bins, n)\
        .toDF(freq_schema)\
        .select('*', cumulative_abs_freq, cumulative_rel_freq)
    return frequency_table_df

In [None]:
# Frequency table of TASA over the rows where it is present.
tasa_is_present = col("TASA").isNotNull()
feature_analysis = portfolio_dates.select(new_array).where(tasa_is_present)
frequency_table = frequency_table_continuous(feature_analysis, "TASA")
frequency_table.show(5)

In [None]:
# Collect the TASA frequency table into pandas and draw a histogram of each of
# its columns.
# NOTE(review): `col_plot` is assigned but not used in this cell — presumably
# only the cumulative column was meant to be plotted; confirm the intent.
col_plot = "F_cumulative_TASA"
to_pandas_df = frequency_table.toPandas()
to_pandas_df.hist()

In [None]:
# Same frequency table, computed on the log-scaled TASA column.
log_frequency_table = frequency_table_continuous(scaled_feature, "log_TASA")
log_frequency_table.show(5)
In [None]:
# Collect the log_TASA frequency table into pandas and draw a histogram of each
# of its columns.
# NOTE(review): `col_plot` is assigned but not used in this cell — presumably
# only the cumulative column was meant to be plotted; confirm the intent.
col_plot = "F_cumulative_log_TASA"
to_pandas_df = log_frequency_table.toPandas()
to_pandas_df.hist()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------