# Statistics and Visualisations

In [1]:
# Name of the data directory (data/<NAME>/...) this notebook reads from and
# writes to. The cell previously also assigned "Knudsen" one line earlier, but
# that value was overwritten immediately and never used; switch the value here
# to run the notebook against the other directory.
NAME = "Niedermayer"

In [2]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Load the abnormal-returns frame from the results directory selected by NAME.
df_abnormal_returns = pd.read_pickle(f"data/{NAME}/calculate_AR_results/df_abnormal_returns.pkl")



# Font sizes and figure dimension shared by the plotting cells.
tick_size = 15
label_size = 20
title_size = 30
# NOTE(review): despite its name, the plotting cells pass this as the first
# (width) element of `figsize`.
fig_height = 20

# Column position of the event day within the event window.
EVENT_INDEX = 20 # because 20 [0,..19] are before the event


# Named (start, end) timestamp pairs used to split the filings by calendar period.
# Later cells compare with >= start and <= end, i.e. both bounds are inclusive.
investigation_periods = {
    "overall": (pd.Timestamp("2018-01-01"), pd.Timestamp("2021-12-31")),
    "pre-pandemic": (pd.Timestamp("2018-01-01"), pd.Timestamp("2020-02-29")),
    "pandemic": (pd.Timestamp("2020-03-01"), pd.Timestamp("2021-12-31")),
}

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open(f"data/{NAME}/calculate_AR_results/companies.pkl", "rb") as f:
    companies = pickle.load(f)


FileNotFoundError: [Errno 2] No such file or directory: 'data/Niedermayer/calculate_AR_results/df_abnormal_returns.pkl'

### Having a look at all trade types together

In [None]:
# Trade types the analysis focuses on.
types_of_interest = ["P - Purchase", "S - Sale", "S - Sale+OE"]

# Number of filings per trade type (index level 2), counted on column 0
# (non-missing values only). Passing a scalar to Series.rename sets the Series
# *name*; the former dict argument tried to relabel an index entry `0`, which
# does not exist among the trade-type labels, so it had no effect.
counts = df_abnormal_returns.groupby(level=2).count()[0].rename("N")
print("relevant filings: ", counts[types_of_interest].sum())
print(counts[types_of_interest])
print("dropped filings ", counts.sum() - counts[types_of_interest].sum())

In [None]:
# Mean abnormal return per event-window day, one line per selected trade type.
a = (
    df_abnormal_returns.loc[:,:,types_of_interest,:]
    .groupby(level=[2])
    .mean()
    .transpose()
    .plot(figsize=(fig_height, 10))
)
# Zero baseline across the whole axis (independent of how the x positions are laid out).
a.axhline(0, color="black", linewidth=0.5)
a.set_title("Mean Abnormal Returns Of Selected Trade Types", fontsize=title_size)
a.set_xlabel("Days", fontsize=label_size)
a.set_ylabel("Mean Abnormal Return", fontsize=label_size)
a.tick_params(axis="both", labelsize=tick_size)
a.axvline(x=EVENT_INDEX, color='red', label='DD Event time', linewidth=1.5)
# pandas built the legend before the event marker existed; rebuild it so the
# 'DD Event time' label is actually shown.
a.legend()
a.get_figure().savefig(f"data/{NAME}/visualisations/MAR_selected_tradetypes.png", dpi=600, bbox_inches='tight')

In [None]:
# Cumulative (over the event-window days) mean abnormal return, one line per
# selected trade type.
a = (
    df_abnormal_returns.loc[:,:,types_of_interest,:]
    .groupby(level=[2])
    .mean()
    .transpose()
    .cumsum()
    .plot(figsize=(fig_height, 10))
)
# Zero baseline across the whole axis (independent of how the x positions are laid out).
a.axhline(0, color="black", linewidth=0.5)
a.set_title("Cumulative Abnormal Returns Of Selected Trade Types", fontsize=title_size)
a.set_xlabel("Days", fontsize=label_size)
a.set_ylabel("Cumulative Abnormal Return", fontsize=label_size)
a.tick_params(axis="both", labelsize=tick_size)
a.axvline(x=EVENT_INDEX, color='red', label='DD Event time', linewidth=1.5)
# pandas built the legend before the event marker existed; rebuild it so the
# 'DD Event time' label is actually shown.
a.legend()
a.get_figure().savefig(f"data/{NAME}/visualisations/CAR_selected_tradetypes.png", dpi=600, bbox_inches='tight')

In [None]:
# Mean abnormal return per event-window day, one line per trade type (all types).
a = (
    df_abnormal_returns
    .groupby(level=[2])
    .mean()
    .transpose()
    .plot(figsize=(fig_height, 10), fontsize=tick_size)
)
# Zero baseline across the whole axis (independent of how the x positions are laid out).
a.axhline(0, color="black", linewidth=0.5)
a.set_title("Abnormal Returns of all Trade Types", fontsize=title_size)
a.set_xlabel("Days", fontsize=label_size)
a.set_ylabel("Mean Abnormal Return", fontsize=label_size)
a.tick_params(axis="both", labelsize=tick_size)
a.axvline(x=EVENT_INDEX, color='red', label='DD Event time', linewidth=1.5)
# pandas built the legend before the event marker existed; rebuild it so the
# 'DD Event time' label is actually shown.
a.legend()
a.get_figure().savefig(f"data/{NAME}/visualisations/AR_all_tradetypes.png", dpi=600, bbox_inches='tight')

In [None]:
# Display the abnormal-returns frame for inspection.
df_abnormal_returns

I think OE and OptEx mean option exercise. "To exercise" means "to put into effect the right to buy or sell the underlying security that is specified in the options contract." Can we be sure that the exercise itself does not move the market, and that the swing in returns is really due to new information? Are these trades public, so that other traders might use them as a signal?

### Boxplot of the sum over all companies' AR

![alt text](assets/images/time_agg.png)

In our case it is not company i, but filing i

Types of trade to pick from:

In [None]:
# Sum the abnormal returns over all filings of each trade type, then draw one
# box per trade type showing how that sum is distributed over the event-window days.
ax = (
    df_abnormal_returns
    .groupby(level=[2])
    .sum()
    .transpose()
    .plot.box(figsize=(fig_height, 10), rot=90)
)
ax.set_ylabel("Abnormal Return", fontsize=label_size)
ax.set_xlabel("Trade Type", fontsize=label_size)
ax.set_title("Boxplots of the Abnormal Returns for each Trade Type", fontsize=title_size)

plt.yticks(fontsize=tick_size)
plt.xticks(fontsize=tick_size)

plt.savefig(
    f"data/{NAME}/visualisations/AR_all_tradetypes_boxplot.png",
    bbox_inches='tight',
    dpi=600,
)

In [None]:
# All distinct trade types (index level 2), in sorted order. A plain set() has
# no stable iteration order for strings across kernel restarts, which made the
# loop order below and the numbered LaTeX table labels change from run to run.
types = sorted(df_abnormal_returns.index.get_level_values(2).unique())
types

Specify the type of the trades to investigate

In [None]:
# Trade type analysed in the following cells (alternative noted inline).
type_ = "P - Purchase" # "S - Sale"
# Keep only the filings whose third index level (the trade type) equals type_.
# NOTE(review): later cells read the event timestamp from level 2 of this frame,
# so the selected trade-type level is presumably dropped from the result — confirm.
df_abnormal_returns_type = df_abnormal_returns.loc[:,:,type_]

The index identifies the company and the index of the filing within that company; the columns represent the days in the event window.

In [None]:
# Display the abnormal returns of the selected trade type for inspection.
df_abnormal_returns_type

In [None]:
# Abnormal return summed over all filings of the selected trade type, one value
# per event-window day. Title and axis labels added so the figure stands alone.
ax = df_abnormal_returns_type.sum(axis=0).plot()
ax.set_title("Abnormal Return Summed Over Filings")
ax.set_xlabel("Days")
ax.set_ylabel("Summed Abnormal Return");

In [None]:
# Cumulative abnormal return per filing. The columns are the event-window days,
# so the running sum has to go along axis=1: column t then holds that filing's
# abnormal return accumulated from the first window day through day t.
# (axis=0 accumulated down the rows, i.e. across unrelated filings.) With this
# orientation the last entry of CAR.mean() equals the sum over days of the
# cross-sectional mean abnormal return.
CAR = df_abnormal_returns_type.cumsum(axis=1)
# One line per filing with the event-window days on the x-axis; the legend is
# suppressed because it would list every filing.
ax = CAR.transpose().plot(legend=False)
ax.set_title("Cumulative Abnormal Return per Filing")
ax.set_xlabel("Days")
ax.set_ylabel("Cumulative Abnormal Return");

![alt text](assets/images/cross_sectional_agg.png)


In [None]:
# Cross-sectional mean: average abnormal return over all filings of the
# selected trade type, one value per event-window day.
AR_bar = df_abnormal_returns_type.mean(axis=0)
ax = AR_bar.plot()
ax.set_title("Mean Abnormal Return Across Filings")
ax.set_xlabel("Days")
ax.set_ylabel("Mean Abnormal Return");

In [None]:
# Scalar: the per-day mean abnormal returns in AR_bar summed over all event-window days.
CAR_bar = AR_bar.sum()
CAR_bar

In [None]:
# TODO: var_CAR_bar – does the definition make sense? It looks like we would be taking the variance of a scalar.

![alt text](assets/images/cross_sectional_agg2.png)


In [None]:
# Column-wise mean of CAR (pandas' default axis), i.e. one average per column.
# NOTE(review): assumes CAR is still the frame built in the cumulative-sum cell above.
CAR_bar_2 = CAR.mean()
CAR_bar_2

In [None]:
# TODO: var(CAR) – not sure what the plain sigma in the formula refers to.

# Statistics

Check if CAR mean = 0 (t-test)

In [None]:
# unpickle the data
df_eps = pd.read_pickle(f"data/{NAME}/calculate_AR_results/df_eps.pkl")
df_estimation_window_market_return = pd.read_pickle(f"data/{NAME}/calculate_AR_results/df_estimation_window_market_return.pkl")
df_event_window_market_return = pd.read_pickle(f"data/{NAME}/calculate_AR_results/df_event_window_market_return.pkl")

df_eps.sort_index(level=["Company", "i", "TradeType", "event_timestamp"], ascending=True, inplace=True)
df_estimation_window_market_return.sort_index(level=["Company", "i", "TradeType", "event_timestamp"], ascending=True, inplace=True)
df_event_window_market_return.sort_index(level=["Company", "i", "TradeType", "event_timestamp"], ascending=True, inplace=True)


In [None]:
from source.statistical_tests import grank, adjBMP

In [None]:
# Display the df_eps frame loaded above for inspection.
df_eps

In [None]:
# Sort the abnormal-returns frame in place by all four index levels.
# NOTE(review): the timestamp label slices (`left:right`) used here and in the
# test loop below presumably require this sorted index — keep this cell before them.
df_abnormal_returns.sort_index(level=["Company", "i", "TradeType", "event_timestamp"], ascending=True, inplace=True)
# Start and end of the "pandemic" investigation period.
left, right = investigation_periods["pandemic"]
# Preview: filings of trade type `type_` whose event timestamp lies between left and right.
df_abnormal_returns.loc[:,:,type_, left:right]

In [None]:
# p-values of the GRANK and adjusted-BMP tests for every
# (investigation period, trade type) combination.
test_results = []
test_index = []

# Position of the event day within the event window. Reuses the notebook-wide
# constant instead of repeating the literal 20 on every iteration.
event_day = EVENT_INDEX

# Loop variables are deliberately NOT called `type_`, `left` or `right`, so that
# running this cell does not overwrite the values other cells rely on.
for per, (period_start, period_end) in investigation_periods.items():
    for trade_type in types_of_interest:
        # Select the filings of this trade type whose event timestamp falls in
        # the period, and hand the raw arrays to the test functions.
        AR = df_abnormal_returns.loc[:,:,trade_type, period_start:period_end].values
        eps = df_eps.loc[:,:,trade_type, period_start:period_end].values
        R_market_estimation_window = df_estimation_window_market_return.loc[:,:,trade_type, period_start:period_end].values
        R_market_event_window = df_event_window_market_return.loc[:,:,trade_type, period_start:period_end].values

        grank_result = grank(AR, eps, R_market_estimation_window, R_market_event_window, event_day)
        adjBMP_result = adjBMP(AR, eps, R_market_estimation_window, R_market_event_window, event_day)

        test_results.append((grank_result.pvalue, adjBMP_result.pvalue))
        test_index.append((per, trade_type))

In [None]:
# Tabulate the p-values: one row per (period, trade type), one column per test,
# and print the table rounded to five decimals as LaTeX source.
test_results_df = pd.DataFrame(
    data=test_results,
    index=pd.MultiIndex.from_tuples(test_index),
    columns=["GRANK", "adj-BMP"],
)
print(test_results_df.round(5).to_latex())

In [None]:
# One-sample t-test of the null hypothesis that the mean of CAR is 0.
# NOTE(review): CAR is whatever the notebook last bound to that name — the
# cumulative-sum cell above, or the per-period loop further below if that ran
# more recently.
tt = stats.ttest_1samp(CAR, popmean=0)
tt

Check if CAR median = 0 (wilcoxon signed rank test)

In [None]:
# Wilcoxon signed-rank test on CAR.
# NOTE(review): CAR is whatever the notebook last bound to that name — confirm
# the intended cell ran last.
stats.wilcoxon(CAR)

In [None]:
# For every (trade type, investigation period, side of the event) combination:
# aggregate the abnormal returns, test whether the aggregate differs from zero
# and record mean, median, p-values, sample size and a 95% confidence interval.
# Requires `types`, `investigation_periods` and `df_abnormal_returns` from the
# cells above.
multiind_p, data_p = [], []

# Column positions (start inclusive, stop exclusive, as consumed by .iloc) of
# the event-window days on each side of the event.
event_day_ranges = {
    "pre-event": (0,20), # TODO for now we drop the filing day itself
    "post-event": (21,41)
}

# Each test maps a sample to its p-value, rounded to 10 decimals.
tests = {"ttest": lambda x: round(stats.ttest_1samp(x, popmean=0).pvalue, 10),
         "wilcoxon": lambda x: round(stats.wilcoxon(x).pvalue,10)}

# axis=0 averages over the filings (rows) for each day; the commented-out
# alternative averages over the days for each filing.
aggregation_type = {"name": "cross-sectional", "axis":0}
#aggregation_type = {"name": "through time", "axis":1}

# Loop-local names are chosen so that this cell does not overwrite the
# notebook-level `type_`, `CAR`, `left`, `right` or `df_abnormal_returns_type`
# that other cells read.
for trade_type in types:
    # Independent of period and side: select the trade type once.
    ar_of_type = df_abnormal_returns.loc[:,:,trade_type]
    timestamps = ar_of_type.index.get_level_values(2)

    for per, (per_left, per_right) in investigation_periods.items():
        # Independent of the side: restrict to the period once (bounds inclusive).
        mask = (timestamps >= per_left) & (timestamps <= per_right)
        df_AR_type_per = ar_of_type[mask]

        for side, (first_col, stop_col) in event_day_ranges.items():
            if not len(df_AR_type_per):
                print(f"skipping iteration because of 0 datapoints {(trade_type, per, side)}")
                continue

            df_AR_type_per_side = df_AR_type_per.iloc[:, first_col:stop_col]
            # Running sum of the aggregated abnormal returns.
            car_path = df_AR_type_per_side.mean(axis=aggregation_type["axis"]).cumsum()

            pvalue_ttest = tests["ttest"](car_path)
            pvalue_wilcoxon = tests["wilcoxon"](car_path)
            car_mean = car_path.mean()
            car_median = car_path.median()

            # 95% confidence interval of the mean (Student t), rounded to 4 decimals.
            ci_low, ci_high = stats.t.interval(
                0.95, len(car_path)-1, loc=np.mean(car_path), scale=stats.sem(car_path)
            )
            CI = round(ci_low, 4), round(ci_high, 4)

            multiind_p.append((trade_type, per, side))
            data_p.append((car_mean, pvalue_ttest, car_median, pvalue_wilcoxon, len(df_AR_type_per), CI))

In [None]:
# Assemble the per-combination results into one table, persist it as CSV and
# show it ordered by the t-test p-value.
df_p = pd.DataFrame.from_records(
    data_p,
    columns=["mean", "ttest pvalue", "median", "wilcoxon pvalue", "sample_size", "95% CI"],
)
df_p.index = pd.MultiIndex.from_tuples(
    multiind_p,
    names=["TradeType", "Period", "Side"],
)
df_p.to_csv(f"data/{NAME}/{aggregation_type['name']}_tests_result.csv")
df_p.sort_values(by="ttest pvalue")

In [None]:
# Results for the "S - Sale" trade type (first index level of df_p).
df_p.loc["S - Sale"]

#### Export information to latex

In [None]:
# Print one LaTeX table per trade type; `counter` numbers the table labels
# starting at 1.
counter = 0
for counter, t in enumerate(types, start=1):
    # Rows of this trade type, with shortened column headers for the table.
    # ('Side of the event' only takes effect if such a column exists.)
    df_ = df_p.loc[t,:,:].rename(
        columns={
            'Side of the event': 'Side',
            'ttest pvalue': 'ttest',
            'wilcoxon pvalue': 'Wilcoxon',
            'sample_size': 'N',
        }
    )

    # The confidence interval is left out of the printed table.
    latex = df_.drop(columns=["95% CI"]).to_latex(
        column_format="llrrrrr",
        position="H",
        label=f"table:t{counter}_hypothesistest",
        caption=f"Hypothesis tests for type: {t}",
    )
    print(latex)

### Visualise CI

In [None]:
def plot_confidence_interval(x, mean, CI, color='#2187bb', horizontal_line_width=0.25):
    """Draw one vertical confidence-interval bar with end caps on the current figure.

    Parameters
    ----------
    x : horizontal position of the bar.
    mean : centre of the interval; marked with a dot.
    CI : half-width of the interval, i.e. the bar spans mean - CI .. mean + CI.
    color : colour of the bar and its caps.
    horizontal_line_width : total width of the end caps.

    Returns
    -------
    The pair (mean, CI), unchanged.
    """
    cap_left = x - horizontal_line_width / 2
    cap_right = x + horizontal_line_width / 2
    lower = mean - CI
    upper = mean + CI
    plt.plot([x, x], [lower, upper], color=color)
    plt.plot([cap_left, cap_right], [lower, lower], color=color)
    plt.plot([cap_left, cap_right], [upper, upper], color=color)
    plt.plot(x, mean, 'o', color='#f44336')

    return mean, CI


# purchase, sale and sale with OE
for t in types_of_interest:
    for side in event_day_ranges.keys():

        # Sort once and use this single ordering for both the bars and the tick
        # labels. Previously the bars followed the sorted order while the
        # labels came from the unsorted index, so labels could sit under the
        # wrong bar.
        df_to_plot = df_p.loc[t,:,side].sort_values("mean")

        plt.figure(figsize=(fig_height,4))

        for i, row in enumerate(df_to_plot.itertuples()):
            # Positional access (assuming the column order set when df_p was
            # built): row[1] is "mean", row[6] is the "95% CI" (low, high) pair;
            # the half-width is high - mean.
            plot_confidence_interval(i, row[1], row[6][1] - row[1])

        # Zero reference line across the whole axis.
        plt.axhline(0, color="black", linewidth=0.5)
        plt.xticks(range(len(df_to_plot)), df_to_plot.index, fontsize=tick_size)
        plt.yticks(fontsize=tick_size)

        plt.title(f'CI of trade type  {t} | {side} abnormal return data ', fontsize=title_size)

        plt.xlabel('Time Frame', fontsize=label_size)
        plt.ylabel('Mean Abnormal Return', fontsize=label_size)

        plt.savefig(f"data/{NAME}/visualisations/CI_{t}_{side}.png", dpi=600, bbox_inches='tight')
        plt.show()

In [None]:
# Hypothesis 3: do the aggregated abnormal returns differ between the
# pre-pandemic and the pandemic period? One p-value per
# (test, trade type, side of the event).
multiind_h3, data_h3 = [], []

two_sample_tests = {
    "ttest": lambda x, y: round(stats.ttest_ind(x, y).pvalue, 10),
    # NOTE(review): with two samples stats.wilcoxon is the *paired* signed-rank
    # test (here paired by event-window day), whereas ttest_ind treats the
    # samples as independent — confirm this asymmetry is intended.
    "wilcoxon": lambda x, y: round(stats.wilcoxon(x, y).pvalue, 10)
}

pand_start, pand_end = investigation_periods["pandemic"]
prepand_start, prepand_end = investigation_periods["pre-pandemic"]

# Step 1: aggregate once per (trade type, side) and show each histogram once.
# Previously this work sat inside the loop over the tests, so every aggregate
# was recomputed and every histogram displayed once per test.
aggregates_h3 = {}
for t in types_of_interest:
    ar_of_type = df_abnormal_returns.loc[:,:,t]
    timestamps = ar_of_type.index.get_level_values(2)
    mask_pand = (timestamps >= pand_start) & (timestamps <= pand_end)
    mask_prepand = (timestamps >= prepand_start) & (timestamps <= prepand_end)

    for side, (first_col, stop_col) in event_day_ranges.items():
        df_AR_type_side = ar_of_type.iloc[:, first_col:stop_col]

        # Running sum over the days of the cross-sectional mean abnormal return.
        df_AR_type_side_pand_agg = df_AR_type_side[mask_pand].mean(axis=0).cumsum()
        df_AR_type_side_prepand_agg = df_AR_type_side[mask_prepand].mean(axis=0).cumsum()
        aggregates_h3[(t, side)] = (df_AR_type_side_prepand_agg, df_AR_type_side_pand_agg)

        # Labelled, semi-transparent histograms so the two periods can be told apart.
        plt.hist(df_AR_type_side_pand_agg, alpha=0.6, label="pandemic")
        plt.hist(df_AR_type_side_prepand_agg, alpha=0.6, label="pre-pandemic")
        plt.title(f"{t} | {side}")
        plt.legend()
        plt.show()

# Step 2: apply every test to the stored aggregates; one row per
# (test, trade type), one entry per side.
for test_name, two_sample_test in two_sample_tests.items():
    for t in types_of_interest:
        d = []
        for side in event_day_ranges.keys():
            prepand_agg, pand_agg = aggregates_h3[(t, side)]
            pvalue = two_sample_test(prepand_agg, pand_agg)
            print(f"For type {t}, in the days {side} of the event, the pre-pandemic and pandemic mean of abnormal returns is different with a p-value of {pvalue}")

            d.append(pvalue)
        multiind_h3.append((test_name, t))
        data_h3.append(d)

### P-values of whether pre-pandemic and pandemic are significantly different for the timeframe before and after the event and different types

In [None]:
# Table of the two-sample p-values: rows are (test, trade type), columns are the
# sides of the event.
df_h3 = pd.DataFrame(
    data_h3,
    index=pd.MultiIndex.from_tuples(multiind_h3, names=["Test", "Type"]),
    columns=list(event_day_ranges),
)
df_h3

In [None]:
# Print the two-sample p-value table as LaTeX source.
print(df_h3.to_latex())

Specify the type of the trades to investigate

### Visualize how active Directors Dealings Are
TODO: move this section somewhere else.
We want visual confirmation that directors changed their trading behaviour during Covid (Hypothesis 3).

In [None]:
from tqdm import tqdm

# Stack the insider-trade frames of all companies into one frame. A single
# pd.concat over the whole list copies each row once; concatenating onto a
# growing frame inside the loop re-copied all previously collected rows on
# every iteration (quadratic in the number of rows).
all_insider_trades = pd.concat(
    [company.insider_data_df for company in tqdm(companies)]
)

In [None]:
# Calendar date (time of day dropped) of every filing, then the number of
# filings per date. Title and axis labels added so the figure stands alone.
all_insider_trades['FilingDateTrunc'] = all_insider_trades['FilingDate'].dt.date
ax = all_insider_trades.groupby('FilingDateTrunc')['FilingDate'].count().plot()
ax.set_title("Number of Filings per Day")
ax.set_xlabel("Filing date")
ax.set_ylabel("Number of filings");

In [None]:
# Write all collected insider trades to a CSV file in the current working directory.
# NOTE(review): unlike the other outputs this is not written under data/{NAME}/ —
# confirm that this location is intended.
all_insider_trades.to_csv('AllTradesForExcelGSK.csv')

In [None]:
# Total 'Value' of all filings per filing date. Title and axis labels added so
# the figure stands alone.
ax = all_insider_trades.groupby('FilingDateTrunc')['Value'].sum().plot()
ax.set_title("Total Value of Filings per Day")
ax.set_xlabel("Filing date")
ax.set_ylabel("Sum of Value");

In [None]:
# Average 'Value' of the filings per filing date. Title and axis labels added
# so the figure stands alone.
ax = all_insider_trades.groupby('FilingDateTrunc')['Value'].mean().plot()
ax.set_title("Mean Value of Filings per Day")
ax.set_xlabel("Filing date")
ax.set_ylabel("Mean of Value");

In [None]:
# Histogram of the filing dates. Title and axis labels added so the figure
# stands alone.
plt.hist(all_insider_trades['FilingDateTrunc'])
plt.title("Distribution of Filing Dates")
plt.xlabel("Filing date")
plt.ylabel("Number of filings");