# Copula Fitting for Feature Variables (cop_3y & cop_5y) - kaggle
Copulas are fitted using three-year and five-year rolling windows.    
   
_Search the file for "CONFIG" to find the code that needs to be configured._

In [1]:
from datetime import datetime
import os
import pathlib
import platform
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyvinecopulib as pv
import seaborn as sns
from sqlalchemy import create_engine

from fitcopula import fitShiftedAnnually
from fitcopula import fitShiftedAnnuallyTest
from fitcopula import fitShiftedAnnuallyRandTest
from fitcopula import singleStockCheck

# strftime/strptime-style pattern for year-month-day dates.
# NOTE(review): not referenced by any other cell visible in this notebook.
date_format = "%Y-%m-%d"

In [2]:
# copula family set (parametric only):
# NOTE: this cell is disabled. The triple-quoted text below is a bare string
# expression, so `family_set` is NOT assigned here; the cell only echoes the string
# as its output. The `family_set` that is actually used is assigned in the next cell.
'''
family_set = [pv.BicopFamily.indep,
            pv.BicopFamily.gaussian,
            pv.BicopFamily.student,
            pv.BicopFamily.clayton,
            pv.BicopFamily.gumbel,
            pv.BicopFamily.frank,
            pv.BicopFamily.joe,
            pv.BicopFamily.bb1,
            pv.BicopFamily.bb6,
            pv.BicopFamily.bb7,
            pv.BicopFamily.bb8]
'''

'\nfamily_set = [pv.BicopFamily.indep,\n            pv.BicopFamily.gaussian,\n            pv.BicopFamily.student,\n            pv.BicopFamily.clayton,\n            pv.BicopFamily.gumbel,\n            pv.BicopFamily.frank,\n            pv.BicopFamily.joe,\n            pv.BicopFamily.bb1,\n            pv.BicopFamily.bb6,\n            pv.BicopFamily.bb7,\n            pv.BicopFamily.bb8]\n'

In [3]:
# Copula family set (Kendall's tau inversion only).
# This list is handed to every fitShiftedAnnually call further down.
family_set = [
    pv.BicopFamily.indep,
    pv.BicopFamily.gaussian,
    pv.BicopFamily.student,
    pv.BicopFamily.clayton,
    pv.BicopFamily.gumbel,
    pv.BicopFamily.frank,
    pv.BicopFamily.joe,
]

In [4]:
# start psql server
!brew services start postgresql
                                                         
# psql database connections
eikon_data_timeseries = create_engine('postgresql://master:thesis@localhost:5432/eikon_data_timeseries')
stock_market_data     = create_engine('postgresql://master:thesis@localhost:5432/stock_market_data')

Service `postgresql` already started, use `brew services restart postgresql` to restart.


In [5]:
# Detect the operating system so file paths are assembled with the matching separator.
my_os = platform.system()
print("OS in my system : ", my_os)

# Windows gets a backslash, every other platform a forward slash.
slash = '\\' if my_os == "Windows" else '/'
# Absolute path of the current working directory, terminated by the separator.
path = str(pathlib.Path().absolute()) + slash

OS in my system :  Darwin


## General stock information (from eikon, e)

In [6]:
# Load general stock information from the Excel file in the data_artifacts folder.
# `path` already ends with the separator, so only the remaining parts are joined.
e_stock_info = pd.read_excel(
    path + slash.join(['data_artifacts', 'eikon_general_stock_info.xlsx'])
)

In [7]:
# Organise columns and convert the date columns in one chained pass:
#   * 'name' and 'equity_type' are dropped,
#   * 'inc_date' and 'ipo_date' become datetimes,
#   * 'fy_end_date' is parsed as a datetime and then reduced to its month number.
e_stock_info = (
    e_stock_info
    .drop(columns=['name', 'equity_type'])
    .assign(
        inc_date=lambda frame: pd.to_datetime(frame['inc_date']),
        ipo_date=lambda frame: pd.to_datetime(frame['ipo_date']),
        fy_end_date=lambda frame: pd.to_datetime(frame['fy_end_date']).dt.month,
    )
)
e_stock_info.head()

Unnamed: 0,ric,isin,inc_date,ipo_date,naics,fy_end_date,symbol
0,AAPL.ITC,US0378331005,1977-01-03,1980-12-12,Manufacturing,9.0,AAPL
1,AMZN.ITC,US0231351067,1996-05-28,1997-05-15,Retail Trade,12.0,AMZN
2,TSLA.ITC,US88160R1014,2003-07-01,2010-06-09,Manufacturing,12.0,TSLA
3,MSFT.ITC,US5949181045,1993-09-22,1986-03-13,Information,6.0,MSFT
4,TRI.ITC,CA8849037095,1977-12-28,2002-06-12,Information,12.0,TRI


## Data sets

**kaggle:**
* k_stock_returns (stock log returns)
* k_market_spx_returns (market log returns | market proxy: S&P500)
* k_market_ndx_returns (market log returns | market proxy: Nasdaq)

In [None]:
# load data sets

# ---------------- CONFIG ----------------
# month_day_hour_name prefix of the table names below has to be configured

# Stock returns: drop the stored 'index' column, then order by symbol and date.
k_stock_returns = (
    pd.read_sql('select * from "1_2_7_k_stock_returns"', stock_market_data)
    .drop(columns='index')
    .sort_values(by=['symbol', 'date'])
)

# S&P 500 market returns: drop the stored 'index' column, then order by date.
k_market_spx_returns = (
    pd.read_sql('select * from "1_2_7_k_market_spx_returns"', stock_market_data)
    .drop(columns='index')
    .sort_values(by=['date'])
)

# Nasdaq market returns: drop the stored 'index' column, then order by date.
k_market_ndx_returns = (
    pd.read_sql('select * from "1_2_7_k_market_ndx_returns"', stock_market_data)
    .drop(columns='index')
    .sort_values(by=['date'])
)

In [None]:
# Each line prints True when the corresponding frame has no fully duplicated rows.
print('Free of duplicates:')
print(not k_stock_returns.duplicated().any())
print(not k_market_spx_returns.duplicated().any())
print(not k_market_ndx_returns.duplicated().any())

## Copula Fitting
Tau is used as a universal dependence measure because it reflects the copula regardless of the underlying copula family. The _pyvinecopulib.Bicop()_ method from the Python package _pyvinecopulib_ returns the best-fitting copula for the given input data. The copulas are fitted per stock company for each year between 2011 and 2020, taking deviating fiscal years into account. Rolling windows are not considered.

### 3 Years Window
**Stock returns and S&P 500 market proxy:**

In [None]:
# Fit copulas between the stock returns and the S&P 500 market returns.
# The last argument (3) is the window length used for this section (3 Years Window).
k_cop_3y_spx = fitShiftedAnnually(
    k_stock_returns,
    k_market_spx_returns,
    e_stock_info,
    family_set,
    3,
)
display(k_cop_3y_spx)

# Share of rows whose tau is exactly zero.
print('Proportion of Tau zero values:')
print(k_cop_3y_spx['tau'].eq(0).sum() / len(k_cop_3y_spx))

In [None]:
# Summary statistics of the 3-year S&P 500 fitting results.
k_cop_3y_spx.describe()

In [None]:
# Histogram of tau in 200 bins; log_scale=(False, True) keeps the x-axis linear
# and puts only the y-axis (counts) on a log scale.
sns.histplot(data = k_cop_3y_spx, x = 'tau', log_scale = (False, True), bins = 200)

In [None]:
# Save to Excel.
# The file name is prefixed with the current month, day and hour (not zero-padded).
today = datetime.today()
storage_name = '_'.join([str(today.month), str(today.day), str(today.hour), 'k_cop_3y_spx.xlsx'])

# `path` already ends with the separator; the remaining parts are joined with it.
k_cop_3y_spx.to_excel(path + slash.join(['data_artifacts', 'copulas', storage_name]))

**Stock returns and Nasdaq market proxy:**

In [None]:
# Fit copulas between the stock returns and the Nasdaq market returns.
# The last argument (3) is the window length used for this section (3 Years Window).
k_cop_3y_ndx = fitShiftedAnnually(
    k_stock_returns,
    k_market_ndx_returns,
    e_stock_info,
    family_set,
    3,
)
display(k_cop_3y_ndx)

# Share of rows whose tau is exactly zero.
print('Proportion of Tau zero values:')
print(k_cop_3y_ndx['tau'].eq(0).sum() / len(k_cop_3y_ndx))

In [None]:
# Summary statistics of the 3-year Nasdaq fitting results.
k_cop_3y_ndx.describe()

In [None]:
# Histogram of tau in 200 bins; log_scale=(False, True) keeps the x-axis linear
# and puts only the y-axis (counts) on a log scale.
sns.histplot(data = k_cop_3y_ndx, x = 'tau', log_scale = (False, True), bins = 200)

In [None]:
# Save to Excel.
# The file name is prefixed with the current month, day and hour (not zero-padded).
today = datetime.today()
storage_name = '_'.join([str(today.month), str(today.day), str(today.hour), 'k_cop_3y_ndx.xlsx'])

# `path` already ends with the separator; the remaining parts are joined with it.
k_cop_3y_ndx.to_excel(path + slash.join(['data_artifacts', 'copulas', storage_name]))

### 5 Years Window
**Stock returns and S&P 500 Index - CBOE:**

In [None]:
# Fit copulas between the stock returns and the S&P 500 market returns.
# The last argument (5) is the window length used for this section (5 Years Window).
# The stock-info frame is `e_stock_info`, the same frame the 3-year fits use; the
# previous `k_stock_info` is never defined in this notebook and raised a NameError.
k_cop_5y_spx = fitShiftedAnnually(k_stock_returns, k_market_spx_returns, e_stock_info, family_set, 5)
display(k_cop_5y_spx)

# Share of rows whose tau is exactly zero.
print('Proportion of Tau zero values:')
print(((k_cop_5y_spx['tau'] == 0).sum())/(len(k_cop_5y_spx)))

In [None]:
# Summary statistics of the 5-year S&P 500 fitting results.
k_cop_5y_spx.describe()

In [None]:
# Histogram of tau in 200 bins; log_scale=(False, True) keeps the x-axis linear
# and puts only the y-axis (counts) on a log scale.
sns.histplot(data = k_cop_5y_spx, x = 'tau', log_scale = (False, True), bins = 200)

In [None]:
# Save to Excel.
# The file name is prefixed with the current month, day and hour (not zero-padded).
# The variable is named `storage_name`, as in the 3-year save cells; the former
# `storagk_name` was a search-and-replace artifact.
today = datetime.today()
storage_name = str(today.month) + '_' + str(today.day) + '_' + str(today.hour) + '_' + 'k_cop_5y_spx.xlsx'

k_cop_5y_spx.to_excel(path + 'data_artifacts' + slash + 'copulas' + slash + storage_name)

**Stock returns and NASDAQ 100 Index:**

In [None]:
# Fit copulas between the stock returns and the Nasdaq market returns.
# The last argument (5) is the window length used for this section (5 Years Window).
# The stock-info frame is `e_stock_info`, the same frame the 3-year fits use; the
# previous `k_stock_info` is never defined in this notebook and raised a NameError.
k_cop_5y_ndx = fitShiftedAnnually(k_stock_returns, k_market_ndx_returns, e_stock_info, family_set, 5)
display(k_cop_5y_ndx)

# Share of rows whose tau is exactly zero.
print('Proportion of Tau zero values:')
print(((k_cop_5y_ndx['tau'] == 0).sum())/(len(k_cop_5y_ndx)))

In [None]:
# Summary statistics of the 5-year Nasdaq fitting results.
k_cop_5y_ndx.describe()

In [None]:
# Histogram of tau in 200 bins; log_scale=(False, True) keeps the x-axis linear
# and puts only the y-axis (counts) on a log scale.
sns.histplot(data = k_cop_5y_ndx, x = 'tau', log_scale = (False, True), bins = 200)

In [None]:
# Save to Excel.
# The file name is prefixed with the current month, day and hour (not zero-padded).
# The variable is named `storage_name`, as in the 3-year save cells; the former
# `storagk_name` was a search-and-replace artifact.
today = datetime.today()
storage_name = str(today.month) + '_' + str(today.day) + '_' + str(today.hour) + '_' + 'k_cop_5y_ndx.xlsx'

k_cop_5y_ndx.to_excel(path + 'data_artifacts' + slash + 'copulas' + slash + storage_name)