# Exploratory Data Analysis
_Written by Thomas Niedermayer and Gunnar Sjúrðarson Knudsen, as a joint effort for an interdisciplinary project in Data Science._
* Supervisor: Wolfgang Aussenegg
* Co-Supervisor: Sascha Hunold

The purpose of this notebook is to understand the data quality and the scale of the task.
As we ran into several issues w.r.t. data quality, we want to get a deeper understanding of how each kind of data is handled. Example errors:
* Tickers being used for multiple companies/ISINs, so that it is not known which ISIN an insider trade corresponds to
* Missing ISINs, which means that no time series data is available
* Missing tickers, which means that no insider trades are available
* For several companies, there are no insider trades registered. These are therefore also filtered out from the analysis notebook. These wouldn't give an error, but removing them speeds up the runtime
* Sometimes there was no return index data available.

## Load libraries

In [None]:
import pickle
import pandas as pd
import numpy as np
import os
import math
from tools import load_settings, display_table
from IPython.display import clear_output, display
from tqdm import tqdm

# Load custom libraries
import source.read_tickers_and_isins as URTI

## Define which data to be loaded

In [None]:

# Read the shared project settings and keep the two values this notebook uses.
settings = load_settings()
NAME, STOCK_EXCHANGE = settings["NAME"], settings["STOCK_EXCHANGE"]

In [None]:
# Root folder of this data set, and the sub-folder from which the per-ticker insider CSVs are read.
DATA_LOCATION = f'data/{NAME}/'
_insider_location = f'{DATA_LOCATION}processed/insider/'

## Read in input data

In [None]:
# Path of the Excel workbook that is handed to URTI.read_tickers_and_isins below;
# both the folder and the file name depend on the configured NAME / STOCK_EXCHANGE.
INPUT_FILE = f'input_data/{NAME}/{STOCK_EXCHANGE} Composite 16.3.2022 plus dead firms - {NAME}.xlsx'

In [None]:
# Load the company master table. Later cells rely on its 'TICKER SYMBOL', 'ISIN CODE' and 'NAME' columns.
data = URTI.read_tickers_and_isins(INPUT_FILE)

In [None]:
# Show the freshly loaded table for a quick visual check.
data

## Read in scraped data
We need this so that we can also exclude companies for which the scraping did not return any data.

### Insider trades

In [None]:
# Non-count columns of the per-ticker summary, in output order.
_INSIDER_BASE_COLUMNS = [
    'tickers',
    'trade_count',
    'min_filing_date',
    'max_filing_date',
    'min_trade_date',
    'max_trade_date',
    'n_distinct_traders',
    'n_distinct_trade_types',
]

# Output column -> raw 'TradeType' labels that are counted into it.
# Some trade types occur under two spellings in the scraped files; both are summed.
_TRADE_TYPE_LABELS = {
    'P - Purchase (count)': ('P - Purchase',),
    'S - Sale (count)': ('S - Sale',),
    'S - Sale+OE': ('S - Sale+OE',),
    'A - Grant (count)': ('A - Grant',),
    'D - Sale to Iss (count)': ('D - Sale to Iss', 'D - Sale to issuer'),
    'G - Gift (count)': ('G - Gift',),
    'F - Tax (count)': ('F - Tax',),
    'M - Option Ex (count)': ('M - Option Ex', 'M - OptEx'),
    'X - Option Ex (count)': ('X - Option Ex', 'X - OptEx'),
    'C - Cnv Deriv (count)': ('C - Cnv Deriv', 'C - Converted deriv'),
    'W - Inherited (count)': ('W - Inherited',),
}


def _summarise_insider_trades(ticker, trades):
    """Build one summary row for the scraped insider trades of a single ticker.

    Args:
        ticker: Ticker symbol the trades belong to; stored in the 'tickers' column.
        trades: DataFrame read from the ticker's CSV. The columns 'FilingDate',
            'TradeDate', 'InsiderName' and 'TradeType' are used.

    Returns:
        dict mapping every output column name to its value for this ticker.
    """
    # Count every trade type in one pass instead of scanning the column once per type.
    type_counts = trades['TradeType'].value_counts()

    row = {
        'tickers': ticker,
        'trade_count': trades.shape[0],
        'min_filing_date': trades['FilingDate'].min(),
        'max_filing_date': trades['FilingDate'].max(),
        'min_trade_date': trades['TradeDate'].min(),
        'max_trade_date': trades['TradeDate'].max(),
        'n_distinct_traders': trades['InsiderName'].nunique(),
        'n_distinct_trade_types': trades['TradeType'].nunique(),
    }
    for column, labels in _TRADE_TYPE_LABELS.items():
        # Labels that do not occur for this ticker simply count as 0.
        row[column] = int(sum(type_counts.get(label, 0) for label in labels))
    return row


# Read in scraped files, and do various aggregations
total_count = len(data['TICKER SYMBOL'])
_insider_rows = []
for counter, ticker in enumerate(data['TICKER SYMBOL'], start=1):
    clear_output(wait=True)
    print(f'Handling {counter} of {total_count}. Currently doing: {ticker}')

    dat = pd.read_csv(_insider_location + ticker + '.csv', index_col=0, parse_dates=['FilingDate', 'TradeDate'])
    _insider_rows.append(_summarise_insider_trades(ticker, dat))

# Collect to a single data frame. Building it column by column keeps all columns
# (and their order) even when no ticker was processed.
_insider_columns = _INSIDER_BASE_COLUMNS + list(_TRADE_TYPE_LABELS)
scraped_insider_df = pd.DataFrame({column: [row[column] for row in _insider_rows]
                                   for column in _insider_columns})

# A ticker that occurs several times in `data` is read several times and yields identical rows.
scraped_insider_df = scraped_insider_df.drop_duplicates()
scraped_insider_df

#### Join the data

In [None]:
# Attach the per-ticker insider summary to the company table; the left join keeps every company row.
data = data.merge(
    scraped_insider_df,
    how='left',
    left_on='TICKER SYMBOL',
    right_on='tickers',
)

### Read in market timeseries

In [None]:
# Folder holding one pickled company object per file.
DATA_LOCATION_RI = f'{DATA_LOCATION}processed/RI_discard/'
_ri_location = DATA_LOCATION_RI

# Every directory entry is treated as a pickle file to load.
file_locs_ = os.listdir(_ri_location)
file_locs = [f'{_ri_location}{entry}' for entry in file_locs_]

# Actually read in the company information
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
companies = []
print("loading return series...")
for file_loc in tqdm(file_locs):
    with open(file_loc, "rb") as handle:
        companies.append(pickle.load(handle))

In [None]:
# Column order of the time-series summary frame.
_TS_COLUMNS = [
    'isin',
    'ts_rows',
    'name',
    'ticker',
    'start_date',
    'end_date',
    'start_date_ts',
    'end_date_ts',
    'RI_Errors',
    'filings_rows',
]

_ts_rows = []
# Do checks for each company
for company in companies:
    ri_df = company.return_index_df
    # All checks leave out the first row of the return series.
    returns = ri_df[1:].company_return

    # In some cases, the RI is the same for all days in a company, followed by missing days.
    ts_ri_sum = returns.sum()

    # Add check to see if there is a change in price at all
    ts_ri_diff = returns.min() - returns.max()

    # The series is flagged as faulty when
    #   * the sum of returns is +inf, -inf or NaN,
    #   * min/max are undefined (NaN difference) or identical (no movement at all),
    #   * any return is missing.
    # math.isfinite replaces the comparisons against np.Inf, an alias that was removed in NumPy 2.0.
    contains_error_in_timeseries = bool(
        not math.isfinite(ts_ri_sum)
        or math.isnan(ts_ri_diff)
        or ts_ri_diff == 0
        or returns.isnull().any()
    )

    if contains_error_in_timeseries:
        print(f'{company.ticker}: {ts_ri_sum} {contains_error_in_timeseries}')

    _ts_rows.append({
        'isin': company.isin,
        'ts_rows': ri_df.shape[0],
        'name': company.name,
        'ticker': company.ticker,
        'start_date': company.start_date,
        'end_date': company.end_date,
        'start_date_ts': ri_df.index.min(),
        'end_date_ts': ri_df.index.max(),
        'RI_Errors': contains_error_in_timeseries,
        'filings_rows': company.insider_data_df.shape[0],
    })

# Collect to a single data frame. Building it column by column keeps all columns
# (and their order) even when no company was loaded.
scraped_ts_df = pd.DataFrame({column: [row[column] for row in _ts_rows]
                              for column in _TS_COLUMNS})
scraped_ts_df = scraped_ts_df.drop_duplicates()
scraped_ts_df

#### Join the data

In [None]:
# Left-join the time-series summary on the ISIN, so companies without a matching
# entry stay in the table with missing time-series columns.
data = data.merge(
    scraped_ts_df,
    how="left",
    left_on='ISIN CODE',
    right_on='isin',
)

### Start filtering

In [None]:
# Start with every company kept: the literal string 'None' (not Python's None) means "no exclusion reason".
# The filter cells below overwrite this value, so when several filters match a row the one applied last wins.
data['reason_to_exclude'] = 'None'

#### Remove Companies without ISINs

In [None]:
# Flag companies whose ISIN is the literal string 'NA'.
mask = data['ISIN CODE'].eq('NA')
data.loc[mask, 'reason_to_exclude'] = 'Missing ISIN'
# (number of flagged companies, number of columns)
data[mask].shape

#### Remove companies without Trades

In [None]:
# Flag companies whose scraped insider file contained no trades at all.
mask = data['trade_count'].eq(0)
data.loc[mask, 'reason_to_exclude'] = 'No trades done'
# (number of flagged companies, number of columns)
data[mask].shape

#### Remove companies without Timeseries

In [None]:
# A missing 'ts_rows' value means the left join found no time-series entry for the company's ISIN.
mask = data['ts_rows'].isna()
data.loc[mask, 'reason_to_exclude'] = 'No timeseries data'
# (number of flagged companies, number of columns)
data[mask].shape

#### Remove companies where the company time-series from the source is wrong

In [None]:
# Flag companies whose return series failed the sanity checks; rows without a value do not match.
mask = data['RI_Errors'].eq(True)
data.loc[mask, 'reason_to_exclude'] = 'Faulty timeseries data'
# (number of flagged companies, number of columns)
data[mask].shape

#### Find non-unique tickers

In [None]:
# keep=False marks every occurrence of a repeated ticker, not only the later ones.
_is_repeated = data.duplicated(subset=['TICKER SYMBOL'], keep=False)
duplicate_tickers = data.loc[_is_repeated, 'TICKER SYMBOL']
duplicate_tickers_mask = data['TICKER SYMBOL'].isin(duplicate_tickers)
data.loc[duplicate_tickers_mask, 'reason_to_exclude'] = 'Non-unique-ticker'
# (number of flagged companies, number of columns)
data[duplicate_tickers_mask].shape

#### Find NA tickers

In [None]:
# Flag companies whose ticker is the literal string 'NA'.
mask = data['TICKER SYMBOL'].eq('NA')
data.loc[mask, 'reason_to_exclude'] = 'NA ticker'
# (number of flagged companies, number of columns)
data[mask].shape

#### In case there are no filings for company

In [None]:
#mask = data['filings_rows'] == 0
#data.loc[mask, 'reason_to_exclude'] = 'No filings'

### Show what we have

In [None]:
# Show the merged table, now including the 'reason_to_exclude' column.
data

#### Show which ones are excluded:
Also stores to csv for later use

In [None]:
# Columns carried over into the summary; every other column of the merged table is left out.
_summary_columns = [
    'ISIN CODE',
    'NAME',
    'TICKER SYMBOL',
    'trade_count',
    'n_distinct_traders',
    'n_distinct_trade_types',
    'ts_rows',
    'reason_to_exclude',
]

# All companies are kept (excluded or not), ordered by exclusion reason, then ticker, then ISIN.
scraping_summary = data[_summary_columns].sort_values(
    by=['reason_to_exclude', 'TICKER SYMBOL', 'ISIN CODE']
)

# DATA_LOCATION already ends with '/', so plain concatenation with '/scraping_summary.csv'
# produced a double slash; os.path.join yields the clean path to the same file.
scraping_summary.to_csv(os.path.join(DATA_LOCATION, 'scraping_summary.csv'))
scraping_summary

#### Do basic statistics of what was taken out


In [None]:
# Per exclusion reason: number of companies plus totals of trades, distinct traders and time-series rows.
_aggregations = {
    'ISIN CODE': 'count',
    'trade_count': 'sum',
    'n_distinct_traders': 'sum',
    'ts_rows': 'sum',
}

# Column headers used in the report.
_report_names = {
    'reason_to_exclude': 'Exclusion Reason',
    'ISIN CODE': 'N Companies',
    'trade_count': 'N trades',
    'n_distinct_traders': 'N distinct traders',
    'ts_rows': 'N TS rows',
}

agg_scraping_summary = (
    scraping_summary
    .groupby('reason_to_exclude')
    .agg(_aggregations)
    .reset_index()
    .rename(columns=_report_names)
)

# Show the summed row counts as whole numbers.
agg_scraping_summary["N TS rows"] = agg_scraping_summary["N TS rows"].astype(int)

##### And print it as LaTeX code for the report

In [None]:
# --- Table 1: totals before any filtering ---
# Sum every column except the first one (the exclusion-reason text) over all groups.
overall_companies = agg_scraping_summary.iloc[:,1:].sum(axis=0).values
# Build a single-row frame: a label cell, a blank cell, then the totals.
overall_companies = pd.DataFrame(["Before Filtering", " "] + list(overall_companies)).transpose()
overall_companies.index = [0]
# Column names: a blank one for the label cell, followed by the aggregate table's own column names.
overall_companies.columns = [" "] + list(agg_scraping_summary.columns)
print(display_table(overall_companies))

# --- Table 2: one row per exclusion reason ---
# Work on a copy with a blank leading column and whitespace-only column names
# (an empty name, then names of 1, 2, 3, ... spaces; presumably so the rendered
# table shows no headers while the names stay unique - TODO confirm).
agg_scraping_summary_tom = agg_scraping_summary.copy()
agg_scraping_summary_tom.insert(0, " ", len(agg_scraping_summary)*[" "])
agg_scraping_summary_tom.columns = [""] + [(i+1)*" " for i in range((len(agg_scraping_summary_tom.columns)-1))]
# Print all rows except the last one.
# NOTE(review): this treats the last row as the 'None' (not excluded) group, which
# only holds while 'None' sorts after every other reason string - confirm.
print(display_table(agg_scraping_summary_tom.iloc[:-1,:]))

# --- Table 3: removed vs. remaining ---
# NOTE(review): the sum runs over every column, including the text columns; verify
# what ends up in those cells of the "Removed" row.
removed = agg_scraping_summary_tom.iloc[:-1,:].sum(axis = 0)
# The last row is taken as the set of companies that remain after filtering (see note above).
remaining = agg_scraping_summary_tom.iloc[-1,:]
result = pd.DataFrame({"Removed": removed, "Remaining": remaining}).transpose().reset_index()
print(display_table(result))

In [None]:
# Show the removed-vs-remaining frame itself, in addition to the printed table above.
result

In [None]:
# Render the per-reason summary through the project's display_table helper
# (one left-aligned and four right-aligned columns) and print it for use in the report.
latex = display_table(agg_scraping_summary, column_format="lrrrr")
print(latex)