In [None]:
import pandas as pd
import numpy as np

## Handling null values

In [None]:
# Load the static overview table; the first CSV column becomes the index.
df_overview = pd.read_csv(filepath_or_buffer='../../data/df_overview.csv', index_col=0)
# Preview the first five rows.
df_overview.head(n=5)

In [None]:
# Count the missing values in each column.
print(df_overview.isna().sum())

#### Replace numpy infinity with nan

In [None]:
# Turn +inf / -inf into NaN (in place) so they are handled like any other missing value.
df_overview.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

### Market capital 

In [None]:
# Drop every stock whose market capital is missing. dropna removes exactly the
# rows with a null value, whereas dropping by index label would also remove
# valid rows that happen to share a duplicated label with a null row.
df_overview = df_overview.dropna(subset=['market_capital_euro'])
df_overview

### Trailing P/E Ratio

Fill 'trailing_pe' column's null values with the mean of its respective industry 

In [None]:
# Industry-level mean of trailing P/E, broadcast back to one value per row
# (transform keeps the original row alignment), used wherever a stock's own value is missing.
industry_mean_trailing_pe = df_overview.groupby('industry')['trailing_pe'].transform('mean')
df_overview['trailing_pe'] = df_overview['trailing_pe'].fillna(industry_mean_trailing_pe)

Any values that are still null (e.g. when an entire industry has no valid value) are filled with the overall mean across all stocks.

In [None]:
# Fill whatever is still missing with the mean of the column itself.
# Use `.mean()` directly: `.notnull().mean()` would yield the fraction of
# non-null rows (a number in [0, 1]) rather than the average trailing P/E.
df_overview['trailing_pe'] = df_overview['trailing_pe'].fillna(df_overview['trailing_pe'].mean())

### Beta

Fill the 'beta' column's null values with the mean of the stock's respective industry.

In [None]:
# Industry-level mean of beta, broadcast back to one value per row
# (transform keeps the original row alignment), used wherever a stock's own value is missing.
industry_mean_beta = df_overview.groupby('industry')['beta'].transform('mean')
df_overview['beta'] = df_overview['beta'].fillna(industry_mean_beta)

Any values that are still null (e.g. when an entire industry has no valid value) are filled with the overall mean across all stocks.

In [None]:
# Fill whatever is still missing with the mean of the column itself.
# Use `.mean()` directly: `.notnull().mean()` would yield the fraction of
# non-null rows (a number in [0, 1]) rather than the average beta.
df_overview['beta'] = df_overview['beta'].fillna(df_overview['beta'].mean())

### Return on equity

Fill the 'return_on_equity' column's null values with the mean of the stock's respective industry.

In [None]:
# Industry-level mean of return on equity, broadcast back to one value per row
# (transform keeps the original row alignment), used wherever a stock's own value is missing.
industry_mean_return_on_equity = df_overview.groupby('industry')['return_on_equity'].transform('mean')
df_overview['return_on_equity'] = df_overview['return_on_equity'].fillna(industry_mean_return_on_equity)

Any values that are still null (e.g. when an entire industry has no valid value) are filled with the overall mean across all stocks.

In [None]:
# Fill whatever is still missing with the mean of the column itself.
# Use `.mean()` directly: `.notnull().mean()` would yield the fraction of
# non-null rows (a number in [0, 1]) rather than the average return on equity.
df_overview['return_on_equity'] = df_overview['return_on_equity'].fillna(df_overview['return_on_equity'].mean())

In [None]:
# Display the current state of the overview table.
df_overview

## Returns 

To make sure that the serial-data and static-data dataframes contain the same tickers, we filter out unavailable ticker symbols with a two-way intersection between the two dataframes:

1. From serial-data dataframe we select only the items that are also found in the static-data dataframe.
2. Then from static-data dataframe we select only the items that are also available in the serial-data dataframe.

In [None]:
# Load the time-series table and use its 'Date' column as the index.
df_time_series = pd.read_csv(filepath_or_buffer='../../data/df_monthly_returns_complete.csv', index_col='Date')

In [None]:
# Ticker symbols that are present in the static overview table.
tickers_valid = df_overview['stock_ticker_symbol'].array
# Keep only the time-series columns whose label is one of those tickers.
shared_columns = df_time_series.columns.intersection(tickers_valid)
df_time_series = df_time_series.loc[:, shared_columns]

In [None]:
# Keep only the overview rows whose ticker has a column in the time-series table.
has_time_series = df_overview['stock_ticker_symbol'].isin(list(df_time_series.columns))
df_overview = df_overview.loc[has_time_series]

In [None]:
# Keep a reference to the current index of the time-series table.
indices = df_time_series.index

#### Handle Time-Series Null & Zero Values

In [None]:
from sklearn.impute import KNNImputer

# 1. Impute missing values with KNNImputer: each NaN becomes the mean of that
#    column over the 5 most similar rows. Similarity is measured on the observed
#    column values, so the neighbours are not necessarily adjacent time-steps.
imputer = KNNImputer(n_neighbors=5)
# fit_transform returns a bare array, so the column labels and the index are
# re-attached here directly rather than relying on a variable set in another cell.
df_time_series = pd.DataFrame(imputer.fit_transform(df_time_series),
                              columns=df_time_series.columns,
                              index=df_time_series.index)

# 2. Fallback for any NaN that is still left: fill it with the trailing rolling
#    mean over the current row and up to two preceding rows.
df_time_series = df_time_series.fillna(df_time_series.rolling(window=3, min_periods=1).mean())


# Remove zeros
def replace_with_rolling_mean(series: pd.Series, window: int) -> pd.Series:
    """Replace exact zeros in ``series`` with the trailing rolling mean at the same position.

    Args:
        series: Series to clean.
        window: Size of the trailing rolling window.

    Returns:
        A new Series in which zeros are replaced; all other values are unchanged.

    Note:
        The rolling window includes the zero itself. With ``window=2`` a zero
        therefore becomes half of the preceding value, and a leading zero or a
        zero that follows another zero stays zero.
    """
    rolling_mean = series.rolling(window=window, min_periods=1).mean()
    return series.mask(series == 0, rolling_mean)


df_time_series = df_time_series.apply(lambda col: replace_with_rolling_mean(col, window=2))

In [None]:
# Display the trailing 3-row rolling mean of every column; the result is not
# assigned, so df_time_series itself is left unchanged.
df_time_series.rolling(window=3, min_periods=1).mean()

In [None]:
# Display the current state of the time-series table.
df_time_series

## Save Dataframe

In [None]:
# Write both tables back to disk.
# NOTE(review): these are the same paths the notebook reads from, so the
# input files are overwritten — confirm this is intended.
df_overview.to_csv(path_or_buf='../../data/df_overview.csv')
df_time_series.to_csv(path_or_buf='../../data/df_monthly_returns_complete.csv')