In [None]:
import pandas as pd
from pyspark.sql.functions import col, count as spark_count, avg as spark_avg

# Raise the pandas display limits far beyond any realistic frame size so that
# previews rendered through toPandas() are not truncated.
pd.options.display.max_columns = 10000000
pd.options.display.max_rows = 10000000
pd.options.display.width = 10000000

------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Starting a session with Ophelia

In [None]:
from enquire.vendetta import Ophelia

In [None]:
# Create the shared Ophelia instance. Every later cell goes through this object
# (ophelia.spark, ophelia.mazterize, ophelia.arrays, ophelia.dataframe, ophelia.rdd).
ophelia = Ophelia()

# Loading the CSV file with the daily fund prices.

In [None]:
def read_portfolio_data(path_file, source, date_col, withSchema=True):
    """Read the portfolio file and optionally apply the date-based schema.

    Parameters
    ----------
    path_file : str
        Location of the file to read.
    source : str
        Format identifier forwarded to ``ophelia.mazterize.read_file``
        (this notebook passes ``"csv"``).
    date_col : str
        Column name forwarded to ``ophelia.mazterize.schema_define``.
    withSchema : bool, default True
        When truthy, the frame is passed through ``schema_define`` before
        being returned; otherwise the frame is returned exactly as read.

    Returns
    -------
    The frame produced by ``read_file``, post-processed by ``schema_define``
    when ``withSchema`` is truthy.
    """
    portfolio_data = ophelia.mazterize.read_file(path_file, source, ophelia.spark)
    if withSchema:
        return ophelia.mazterize.schema_define(portfolio_data, date_col)
    return portfolio_data

## Load the portfolio input data and normalize its raw schema.

In [None]:
# Read the raw CSV; withSchema is left at its default (True), so the frame is
# passed through schema_define with "operation_date" as the date column.
portfolio_df = read_portfolio_data(
    path_file="data-resources/raw/csv/data.csv",
    source="csv",
    date_col="operation_date",
)

# Preview: only the first five rows are brought to the driver.
portfolio_df.limit(5).toPandas()

## Defining the year-range input parameters:

# Cleaning the data and building the analytic base table.

In [None]:
def portfolio_date_window(df, from_year, to_year, col_date):
    """Restrict the frame to a range of years and attach a date index column.

    Parameters
    ----------
    df : DataFrame
        Input frame containing ``col_date``.
    from_year, to_year
        Bounds forwarded to ``ophelia.arrays.year_array``.
    col_date : str
        Name of the date column. The frame returned by
        ``ophelia.dataframe.split_date_columns`` is expected to expose a
        ``<col_date>_year`` column, which is used for the filter.

    Returns
    -------
    DataFrame
        Rows whose ``<col_date>_year`` is in the year array, with every
        existing column plus one extra column named ``col_date[:9] + "_id"``.
    """
    years = ophelia.arrays.year_array(from_year, to_year)
    dated_df = ophelia.dataframe.split_date_columns(df, col_date)
    sorted_dates = ophelia.arrays.sorted_date_list(df, col_date)
    index_of_date = ophelia.arrays.dates_index(sorted_dates)

    year_column = col(col_date + "_year")
    # The fixed 9-character slice turns "operation_date" into "operation", so
    # the notebook's call yields an "operation_id" column.
    # NOTE(review): other column names get an arbitrary 9-character prefix - confirm intent.
    index_column = index_of_date(col(col_date)).alias(col_date[:9] + "_id")

    rows_in_window = dated_df.where(year_column.isin(years))
    return rows_in_window.select('*', index_column)

In [None]:
# Keep only the 2016-2019 operations and add the date index column
# ("operation_id" for this date column).
portfolio_window_df = portfolio_date_window(
    df=portfolio_df, 
    from_year="2016", 
    to_year="2019", 
    col_date="operation_date"
)

In [None]:
def monitoring_empty_vector(df, feature_type):
    """Build a one-row frame with the non-null count of each selected column.

    Parameters
    ----------
    df : DataFrame
        Frame to inspect.
    feature_type
        Key (converted with ``str``) used to pick a column list out of the
        mapping returned by ``ophelia.arrays.feature_picking(df)``.

    Returns
    -------
    DataFrame
        A single aggregated row; each column keeps its original name and
        holds ``count(column)`` for that column.
    """
    picked_columns = ophelia.arrays.feature_picking(df)[str(feature_type)]
    return df.select(
        *[spark_count(col(name)).alias(str(name)) for name in picked_columns]
    )

In [None]:
def debug_null(panel, missing_days, N):
    """List the columns whose count is below the tolerated minimum.

    Parameters
    ----------
    panel : DataFrame
        Single-row frame mapping each column name to a count (the shape
        produced by ``monitoring_empty_vector``).
    missing_days : int
        Number of missing observations tolerated per column.
    N : int
        Total number of rows in the sample the counts were taken from.

    Returns
    -------
    list of str
        Names of the columns whose count is strictly lower than
        ``N - missing_days``. The list is empty when the tolerance is at
        least as large as the sample, or when ``panel`` has no rows.
    """
    # The panel is read as-is: re-selecting every column under its own name
    # was an identity projection and added nothing.
    rows = panel.collect()
    if not rows:
        return []
    counts = rows[0].asDict()

    # Clamp at zero rather than taking abs(): with abs(), a tolerance larger
    # than the sample produced a positive threshold and dropped columns that
    # were within tolerance.
    min_count = max(N - missing_days, 0)
    return [name for name, seen in counts.items() if seen < min_count]

In [None]:
def debug_empty_vector(df, feature_type, missing_days=10):
    """Drop the columns of one feature type that ``debug_null`` reports.

    Parameters
    ----------
    df : DataFrame
        Frame to clean.
    feature_type
        Feature-type key forwarded to ``monitoring_empty_vector``.
    missing_days : int, default 10
        Tolerance forwarded to ``debug_null`` together with the row count.

    Returns
    -------
    DataFrame
        ``df`` without the columns returned by ``debug_null``.
    """
    total_rows = df.count()
    counts_panel = monitoring_empty_vector(df, feature_type)
    sparse_columns = debug_null(counts_panel, missing_days, total_rows)
    return df.drop(*sparse_columns)

In [None]:
# Drop the sparse "float" columns using the default tolerance (missing_days=10).
remove_none_df = debug_empty_vector(portfolio_window_df, feature_type="float")
# Preview the first five rows.
remove_none_df.limit(5).toPandas()

In [None]:
def mean_impute(df):
    """Replace nulls in the float columns with each column's average.

    The float columns are the ones listed under the ``"float"`` key of
    ``ophelia.arrays.feature_picking(df)``.

    Parameters
    ----------
    df : DataFrame
        Frame whose float columns should be imputed.

    Returns
    -------
    DataFrame
        ``df`` with nulls in the float columns filled by the column average.
        ``df`` is returned unchanged when it has no float columns or when no
        average could be computed.
    """
    float_cols = ophelia.arrays.feature_picking(df)["float"]
    targets = [c for c in df.columns if c in float_cols]
    if not targets:
        # agg() refuses an empty expression list; there is nothing to impute.
        return df

    averages = df.agg(*(spark_avg(c).alias(c) for c in targets)).collect()[0].asDict()
    # avg() is null for a column without any non-null value, and na.fill does
    # not accept None as a replacement, so such columns are left untouched.
    fill_values = {name: mean for name, mean in averages.items() if mean is not None}
    if not fill_values:
        return df
    return df.na.fill(fill_values)

In [None]:
# Impute the remaining nulls in the float columns with the column averages.
portfolio_base_table = mean_impute(remove_none_df)
# Preview the five most recent operations.
portfolio_base_table.orderBy(col("operation_date").desc()).limit(5).toPandas()

In [None]:
def join_price_lag(df, on="operation_index", how="left"):
    """Join a frame with its lagged counterpart on a row index.

    Both ``df`` and the frame returned by
    ``ophelia.dataframe.lag_min_max_data(df)`` are passed through
    ``ophelia.rdd.row_indexing`` and then joined.

    Parameters
    ----------
    df : DataFrame
        Frame to join with its lagged version.
    on : str, default "operation_index"
        Join key; expected to be present in both indexed frames.
    how : str, default "left"
        Join type forwarded to ``DataFrame.join``.

    Returns
    -------
    DataFrame
        The indexed frame joined with the indexed lagged frame.
    """
    indexed_prices = ophelia.rdd.row_indexing(df)
    indexed_lags = ophelia.rdd.row_indexing(ophelia.dataframe.lag_min_max_data(df))
    return indexed_prices.join(indexed_lags, on=on, how=how)

In [None]:
join_price_lag = join_price_lag(portfolio_base_table)
join_price_lag.limit(5).toPandas()

In [None]:
def price_yield(df, fix_cols):
    counter_count = 1
    float_cols = ophelia.arrays.feature_picking(portfolio_base_table)["float"]
    price_yield = df.select(*fix_cols, 
                            *[((col(c) / col("{0}_lag".format(c)) - counter_count)).alias("{0}_yield".format(c)) for c in float_cols]).na.fill(0)
    return price_yield

In [None]:
fix_cols = [
    "operation_index", 
    "operation_id", 
    "operation_date", 
    "operation_date_year",
    "operation_date_month",
    "operation_date_day"
]
portfolio_yield_df = price_yield(df=join_price_lag, fix_cols=fix_cols).orderBy(col("operation_index"))
portfolio_yield_df.limit(5).toPandas()

# Writing Portfolio's Yield dataframe.

In [None]:
# Persist the yield frame as parquet, partitioned by "operation_date".
# The return value is kept because the next cells pass it to read_yield as the path.
price_path = ophelia.mazterize.write_parquet(
    dataframe=portfolio_yield_df, 
    name_directory="portfolio_yield_window", 
    partition_field="operation_date"
)

In [None]:
def read_yield(parquet_path, source="parquet"):
    """Read a previously written yield dataset.

    Parameters
    ----------
    parquet_path
        Location forwarded to ``ophelia.mazterize.read_file`` as ``path_source``.
    source : str, default "parquet"
        Format identifier forwarded to ``read_file``.

    Returns
    -------
    The frame returned by ``ophelia.mazterize.read_file``.
    """
    return ophelia.mazterize.read_file(
        path_source=parquet_path,
        source=source,
        spark_session=ophelia.spark,
    )

In [None]:
# Read the yield dataset back from the path returned by write_parquet.
yield_porfolio_df = read_yield(parquet_path=price_path)
# Preview the five most recent operations.
yield_porfolio_df.orderBy(col("operation_date").desc()).limit(5).toPandas()