In [484]:
import pandas as pd
import math
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.stattools import durbin_watson
import datetime
import calendar
from scipy import stats
from matplotlib import pyplot as plt
# Make pandas raise an exception (instead of only warning) on chained assignment.
pd.options.mode.chained_assignment = 'raise'

# A month is dropped when any monitored column's standard deviation exceeds
# this multiple of that column's average monthly standard deviation.
exclusion_factor = 10
# Column used both as the liquidity regressor and as the sort key when the
# regression sample is trimmed.
liquidity_variable = "amihud"

In [485]:
def add_month(date):
    """Return the last calendar day of the month that follows ``date``.

    Only ``date.year`` and ``date.month`` are read; the day of the input is
    ignored. December rolls over to January of the following year. The result
    is always a ``datetime.date``.
    """
    # divmod yields (1, 0) for December and (0, month) for every other month,
    # which gives the year carry and the zero-based index of the next month.
    year_carry, month_index = divmod(date.month, 12)
    next_year = date.year + year_carry
    next_month = month_index + 1
    _, days_in_month = calendar.monthrange(next_year, next_month)
    return datetime.date(next_year, next_month, days_in_month)

def subtract_year(date):
    """Return the last day of the same calendar month, one year before ``date``.

    The day of the input is ignored; the result is always the month-end of
    ``date.month`` in ``date.year - 1`` as a ``datetime.date``.
    """
    target_year = date.year - 1
    _, days_in_month = calendar.monthrange(target_year, date.month)
    return datetime.date(target_year, date.month, days_in_month)

In [486]:
# Read the monthly market panel and parse its "date" column into timestamps.
df = pd.read_csv("market_info_forge_monthly.csv")
df["date"] = pd.to_datetime(df["date"])
# Index the rows by date while keeping "date" as an ordinary column
# (drop=False): later cells filter on the column and drop rows by index label.
df = df.set_index("date", drop=False)

In [487]:
start_date = datetime.date(2011,2,28)
date = start_date

# Shared accumulators: `d` and `exclusions` are filled in by later cells,
# `std_cols` maps each monitored column to its list of monthly standard deviations.
d = {}
std_cols = {}
exclusions = 0

# First pass of the outlier screen: record, month by month, the standard
# deviation of every monitored column so that months with an extreme spread
# can be identified afterwards.
monitored_columns = ["retm", "amihud", "cpqs", "ln_voli", "r100", "r100yr", "stdy"]

# Walk forward one month-end at a time and stop on reaching 2018.
while date.year != 2018:
    month_rows = df.loc[df["date"] == pd.Timestamp(date), monitored_columns]

    for column in monitored_columns:
        std_cols.setdefault(column, []).append(np.std(month_rows[column]))

    date = add_month(date)

In [488]:
# Collapse each column's list of monthly standard deviations into a single
# NaN-ignoring average, stored back under the same key.
std_cols.update({column: np.nanmean(monthly_stds) for column, monthly_stds in std_cols.items()})

In [489]:
start_date = datetime.date(2011,2,28)
date = start_date

# The counter is initialised in this cell so that re-running the cell starts
# from zero instead of accumulating on top of a previous run.
exclusions = 0
screened_columns = ["retm", "amihud", "cpqs", "ln_voli", "r100", "r100yr", "stdy"]
excluded_months = []

# Second pass of the outlier screen: flag every month in which at least one
# column's standard deviation exceeds `exclusion_factor` times that column's
# average monthly standard deviation (held in `std_cols`).
while date.year != 2018:
    month = pd.Timestamp(date)
    dfd = df.loc[df["date"] == month, screened_columns]

    # any() short-circuits on the first offending column. A month without rows
    # yields NaN standard deviations, which never compare greater, so it is
    # never flagged.
    if any(np.std(dfd[c]) > exclusion_factor * std_cols[c] for c in screened_columns):
        excluded_months.append(month)

    date = add_month(date)

exclusions = len(excluded_months)
# The frame is indexed by date, so dropping by label removes every row that
# belongs to a flagged month.
df = df.drop(index=excluded_months)
print(exclusions)

0


In [490]:
start_date = datetime.date(2012,2,28) # Reset the start date so that it begins at the correct date again
# NOTE(review): 2012 is a leap year, so the month-end produced by add_month /
# subtract_year for this month would be 2012-02-29. If the data is keyed on
# month-end dates, this first month matches no rows and is silently skipped
# - confirm whether the start date should be 2012-02-29.
date = start_date

regressions = 0
r2tot = 0
dbstat = []
# Coefficient name -> list of monthly estimates. Reset here so the cell does
# not depend on (or append to) state left behind by an earlier cell or run.
d = {}

formula = 'retm ~ ' + liquidity_variable + ' + r100 + r100yr + stdy'

# Run a cross-sectional OLS regression for each month and collect the
# coefficients. The loop walks forward one month-end at a time until 2018.
while date.year != 2018:
    dft = df.loc[df["date"] == pd.Timestamp(date)] # dependent variable: this month's rows
    dft_1 = df.loc[df["date"] == pd.Timestamp(subtract_year(date))] # regressors: same month, one year earlier
    dfd = pd.merge(dft[["retm", "typeID"]], dft_1[["amihud", "cpqs", "ln_voli", "r100", "r100yr", "stdy", "typeID"]], on = "typeID")

    # Trim 5% of the rows (rounded up) from each end of the sample after
    # ordering by the liquidity variable.
    # NOTE(review): sort_values places NaN values last by default, so rows with
    # a missing liquidity value are the first to fall into the trimmed upper
    # tail - confirm this is intended.
    exclude = math.ceil(len(dfd)*0.05)
    dfd = dfd.sort_values(by = [liquidity_variable])
    dfd = dfd.iloc[exclude:len(dfd) - exclude]

    if not dfd.empty:
        model = smf.ols(formula, data = dfd, missing = "drop").fit()

        # Accumulate by coefficient name; setdefault creates the list on first
        # use, whichever month happens to be the first one with data.
        for name, coefficient in model.params.items():
            d.setdefault(name, []).append(coefficient)

      # dbstat.append(np.mean(model.resid))

        regressions += 1
        r2tot += model.rsquared

    # Advance exactly once per iteration, whether or not a model was fitted.
    date = add_month(date)

# print("serial correlation stat: {}".format(durbin_watson(dbstat)))

# Report, per coefficient: truncated name, mean of the monthly estimates, and
# the t-statistic of a one-sample test of those estimates against zero.
for k in d.keys():
    print("{} \t {} \t {}".format(k[0:5], round(np.mean(d[k]), 4), round(stats.ttest_1samp(d[k], 0)[0], 2)))
if regressions:
    print("R2 {}".format(round(r2tot/regressions, 4)))
else:
    # Avoid a ZeroDivisionError when no month produced a usable sample.
    print("R2 n/a (no regressions were run)")

Inter 	 0.0127 	 1.76
stdy 	 0.0001 	 2.35
r100y 	 -0.0116 	 -1.56
amihu 	 -1147.6697 	 -1.37
r100 	 0.0069 	 0.62
R2 0.2081
