In [None]:
import numpy as np
import pandas as pd
import math
import sklearn
import sklearn.preprocessing
import datetime
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
import seaborn as sns 
sns.set()
%matplotlib inline
from collections import Counter
import warnings
# Silence every warning category (Warning is the base class of all of them),
# so nothing emitted through the warnings module by later cells is shown.
warnings.simplefilter(action='ignore', category=Warning)
# NOTE(review): seaborn is already imported and sns.set() already called a few
# lines earlier in this cell; this second import/set pair is redundant.
import seaborn as sns
sns.set()
# Switch off pandas' chained-assignment check, which warns by default.
pd.options.mode.chained_assignment = None  # default='warn'

from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read in the cleansed and winsorised data.
# The date column is parsed under the name "DATE", the same name the sort below
# and the later cells use; the previous parse_dates=["eom"] did not match it.
# NOTE(review): confirm "DATE" against the actual CSV header.
# A plain relative file name is used so the path also resolves outside Windows
# (the old ".\..." form is only a relative path on Windows).
df = pd.read_csv("ger_factor_data_from2003.csv", dtype={"comp_tpci": str}, parse_dates=["DATE"])
# Convert to float32 (the format needed for most ML models).
# NOTE(review): assumes the first two columns are the stock id and the date and
# that every remaining column is numeric -- confirm against the file.
value_columns = df.columns[2:]
df[value_columns] = df[value_columns].astype('float32')
# Sort observations by date and stock id
df = df.sort_values(by=['DATE', 'permno'], ascending=True)
df.head()

In [None]:
# The observation period contains 204 months, from 01.2003 to 12.2019.
# The last year is not used: we don't have the lead excess return
# "ret_exc_lead1m" for January 2021, and one year is held out for testing.
#
# The cutoff is the first day after the sample. The previous cutoff of
# '1957-03-31' lies before every observation of a data set that starts in 2003,
# so the filter removed all rows and df_2 came out empty.
SAMPLE_END_EXCLUSIVE = '2020-01-01'
# Kept as a negated ">=" so that rows with a missing date are retained, as before.
df_2 = df[~(df['DATE'] >= SAMPLE_END_EXCLUSIVE)]
print("Number of months: ", df_2["DATE"].nunique())
print("Start: ", df_2["DATE"].min())
print("End: ", df_2["DATE"].max())

In [None]:
# Count the distinct stock identifiers present in the sample
n_unique_stocks = df_2["permno"].nunique()
print("Number of unique stcocks: ", n_unique_stocks)

In [None]:
# Inspect variable types: print the per-column summary of df_2.
# verbose=True asks for the full summary rather than a shortened one.
df_2.info(verbose=True)

In [None]:
# Number of stocks in each month: tally how often every date occurs in the sample
date_tally = Counter(df_2['DATE'])
Number_stocks_per_month = pd.DataFrame(date_tally.items(), columns=['Date', 'Numer_of_Stocks'])
stock_counts = Number_stocks_per_month["Numer_of_Stocks"]
print("Minimum number of stocks pre month: ", stock_counts.min())
print("Maximum number of stocks pre month: ", stock_counts.max())
print("Mean number of stocks pre month: ", stock_counts.mean())
# Line chart of the monthly cross-section size
Number_stocks_per_month.plot(x='Date', y='Numer_of_Stocks', grid=True, figsize=(15, 7));

### Descriptive statistics of the characteristics


# We are interested in the descriptive statistics of all 49 stock-level
# characteristics and the target variable (ret_exc_lead1m).
# The id and date columns are excluded under both spellings that occur in this
# notebook ('permno'/'DATE' and 'id'/'eom'). The old list only named 'id' and
# 'eom', so the 'permno' and 'DATE' columns used by the other cells ended up in
# `features`. Names that are absent from df are simply ignored by isin().
non_feature_columns = ['permno', 'id', 'DATE', 'eom', 'prc', 'ret', 'ret_exc']
features = df.columns[~df.columns.isin(non_feature_columns)].tolist()

In [None]:
# Summary statistics for the selected feature columns of the sample
feature_frame = df_2[features]
descriptive_statistics = feature_frame.describe()
descriptive_statistics

### Distribution of each feature

In [None]:

# One histogram per feature, three panels per row, 49 equal-width bins on [-1, 1].
# NOTE(review): this plots the full frame `df`, not the restricted sample `df_2`
# used by the neighbouring cells -- confirm that is intended.
hist_bins = np.linspace(-1, 1, 50)
fig, ax = plt.subplots()
fig.set_size_inches(30, 30)
df[features].hist(ax=ax, bins=hist_bins, layout=(-1, 3));

### Distribution of portfolio returns

In [None]:
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = (15, 6)
# Single-column frame with the return series under a readable label
df_3 = df_2[["ret"]].rename(columns={'ret': 'Stock return'})
stock_returns = df_3["Stock return"]
# The histogram range is pulled in slightly from the observed extremes
hist_range = (stock_returns.min() + 0.00000000001, stock_returns.max() - 0.01)
sns.histplot(data=df_3, x="Stock return", binwidth=0.01, binrange=hist_range)
plt.savefig('returns_distrubution.png')

### Distribution of firm size

In [None]:
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = (15, 10)
# Histogram of the "me" column, restricted to values between 0 and 15000
size_range = (0, 15000)
sns.histplot(data=df_2, x="me", binrange=size_range)
plt.savefig('size_distrubution.png')

### Equally weighted portfolio monthly returns over time

In [None]:
# Retain only the return data over time. The slice is copied so the helper
# columns below are added to an independent frame rather than a view of df_2.
ret_data = df_2[["ret", "permno", "DATE"]].copy()
# Equal weights: 1 / number of rows (stocks) sharing the same date
ret_data["eq_weights"] = 1 / ret_data.groupby('DATE')["permno"].transform('size')
# Weighted contribution of each stock to the monthly return of the equally
# weighted portfolio containing all stocks
ret_data['return_stock_ew'] = ret_data["ret"] * ret_data["eq_weights"]
# Sum the contributions per date. ret_data only carries the date under 'DATE';
# the previous groupby('eom') / ["eom", "Returns"] referenced a column that is
# not in this frame and raised a KeyError.
ret_data['Returns'] = ret_data.groupby('DATE')["return_stock_ew"].transform('sum')
# Every row of a month now holds the same portfolio return: keep one per date
ret_data = ret_data[["DATE", "Returns"]].drop_duplicates()
# Rename the date column to "Date" and use it as the index
ret_data = ret_data.rename(columns={'DATE': 'Date'})
ret_data = ret_data.set_index("Date")
# Plot the portfolio returns over time
ret_data.plot(grid=True, figsize=(15, 7));
# Save the graph
plt.savefig('portfolio monthly returns over time.png')

### Portfolio autocorrelation graph

In [None]:

# Draw the autocorrelation function of the portfolio returns and keep the
# figure object that plot_acf returns.
acf_fig = plot_acf(ret_data)
# Save before showing: the old order called plt.savefig() after plt.show(), at
# which point the current pyplot figure is a fresh, empty one, so the PNG was
# blank. Saving through the figure object also avoids relying on pyplot's
# "current figure" state.
acf_fig.savefig('Portfolio autocorrelation graph.png')
# Show the data as a plot (via matplotlib)
plt.show()

### Variable Correlation Heatmap

In [None]:
# Feature list for the correlation analysis: every column of df except the ones named here
excluded_columns = ['permno', "prc", "DATE", "ret", "ret_exc"]
keep_mask = ~df.columns.isin(excluded_columns)
features = df.columns[keep_mask].tolist()
# Pairwise correlations of the features within the sample
feature_corr = df_2[features].corr()
plt.figure(figsize=(18, 18))
sns.heatmap(data=feature_corr)
plt.title('Correlation Heatmap')
# Write the image to disk first, then display and clear the current figure
plt.savefig('Correlation Heatmap.png')
plt.show()
plt.gcf().clear()

In [None]:
# Show every row/column of the result (this changes the global pandas display options)
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Absolute pairwise correlations between all features
c = df_2[features].corr().abs()
# Take each unordered pair exactly once by reading the strictly upper triangle
# of the (square, symmetric) matrix. The previous approach unstacked the full
# matrix, dropped rows with corr == 1 and then kept every second row; that
# relies on exact float equality for the diagonal and on the two mirrored
# entries of a pair being adjacent after sorting, which fails when values tie
# (or are NaN) and also discards distinct pairs that are perfectly correlated.
upper_rows, upper_cols = np.triu_indices(len(c.columns), k=1)
so = pd.DataFrame({
    'Variable 1': c.index[upper_rows],
    'Variable 2': c.columns[upper_cols],
    'corr': c.to_numpy()[upper_rows, upper_cols],
})
# Strongest correlations first; ties ordered by the first variable name (descending)
so = so.sort_values(by=['corr', 'Variable 1'], ascending=False).reset_index(drop=True)
so

In [None]:

# Write corr_df to an Excel workbook.
# NOTE(review): `corr_df` is not assigned anywhere in the visible part of this
# notebook; unless an unseen cell defines it, this line raises NameError on a
# fresh kernel -- confirm where it comes from.
# NOTE(review): the target is a rooted, Windows-style path (\Data\EDA\...);
# confirm that directory exists on the machine running the notebook.
corr_df.to_excel(r'\Data\EDA\target_correlation.xlsx')