## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt

from statsmodels.tsa.vector_ar.vecm import coint_johansen as cj
from tqdm import tqdm

## Data Preprocessing

In [2]:
# Load the US market data; the first CSV column is parsed into a datetime index
data = pd.read_csv('US_market.csv', index_col = 0)
data.index = pd.to_datetime(data.index)

# Keep only the tickers whose fraction of missing values is below the threshold
threshold = 0.0032
mask = np.array(data.isna().sum()/data.shape[0]) < threshold
df = data.iloc[:,mask]

# Fill the remaining gaps: carry the last known value forward, then back-fill
# whatever is still missing at the start of a series.
# `fillna(method=...)` is deprecated in recent pandas; `ffill()` / `bfill()`
# are the direct replacements and produce the same result.
df = df.ffill().bfill()

# Drop every column whose name contains a '.'.
# NOTE(review): presumably these are the 'NAME.1'-style labels pandas assigns to
# duplicated CSV headers; a ticker that legitimately contains a dot would be
# dropped as well - confirm against the input file.
columns_duplicated = [elem for elem in df.columns if '.' in elem]
df = df.drop(columns = columns_duplicated)

# Keep a single observation per week: dayofweek == 2 is Wednesday (Monday is 0)
df = df[df.index.dayofweek == 2]

# Rounding the data to the second decimal that is the minimum ticker movement
df = df.round(2)

## Finding cointegration

function 'coint_all_timeframe'

INPUT
- __T__: the final time
- __train__: the number of timesteps in the initial estimation window (the window then grows by one timestep at a time)
- __start__: the first timestep to start looking from (default is 0)
- __N__: number of elements in every combination (default 2)

OUTPUT
- __combinations__: a list of all the combinations of N tickers, each given as a list of column positions in the price table
- __count__: a list of the number of non-zero weights for every combination
- __weights__: a list of lists, one per combination, holding for every timestep t the normalized weight of the second element of the combination (normalized so that the weight of the first element is always 1), or 0 when the combination is not cointegrated at t

WORKING

For every combination and for every timestep, the function runs the Johansen cointegration test. If the series are cointegrated, it appends the normalized weight to the corresponding weights list and increases the corresponding count by one; otherwise it appends a zero to the corresponding weights list.

In [3]:
def coint_all_timeframe(T, train, start = 0, N = 2, prices = None):
    """Run an expanding-window Johansen test on every combination of N columns.

    For each combination and for each window end ``i`` in
    ``range(train + start, T)``, the test is run on rows ``start:i`` of the
    price table (constant term, one lagged difference). The combination is
    treated as cointegrated at that step when none of the first ``N`` trace
    statistics falls below the critical value in the third column of ``cvt``
    (the 99% level according to the statsmodels documentation).

    Parameters
    ----------
    T : int
        Exclusive upper bound of the window end (the final time).
    train : int
        Number of rows in the first window.
    start : int, default 0
        Position of the first row of every window.
    N : int, default 2
        Number of columns in every combination.
    prices : pandas.DataFrame, optional
        Table with one column per series. When omitted, the notebook-level
        ``df`` is used, which keeps existing calls working unchanged.

    Returns
    -------
    combinations : list of list of int
        Column positions of every combination.
    count : list of int
        For every combination, the number of steps at which it was
        cointegrated.
    weights : list of list of float
        For every combination, one value per step: the second component of the
        first eigenvector divided by its first component when cointegrated,
        otherwise ``0.``. Only that second component is stored, so for
        ``N > 2`` the remaining components are not returned.
    """
    if prices is None:
        # Fall back to the table prepared earlier in the notebook.
        prices = df

    combinations = [list(combo)
                    for combo in itertools.combinations(range(prices.shape[1]), N)]

    weights = list()
    count = list()

    for elem in tqdm(combinations):
        count_aux = 0
        weights_aux = list()

        for i in range(train + start, T):
            res = cj(prices.iloc[start:i, elem], 0, 1)

            # Reject as soon as one trace statistic is below its critical value.
            rejected = any(res.lr1[j] < res.cvt[j][2] for j in range(N))

            if rejected:
                weights_aux.append(0.)
            else:
                # Scale the first eigenvector so that its first entry equals 1.
                aux = res.evec[:,0]/res.evec[0,0]
                count_aux += 1
                weights_aux.append(aux[1])

        weights.append(weights_aux)
        count.append(count_aux)

    return combinations, count, weights

In [4]:
# Scan the whole sample: the first window spans 40 rows and the last one ends
# just before the final row of the price table.
pairings, count, weights = coint_all_timeframe(T = len(df), train = 40)

## Create Objective

In [5]:
# Pairings to track and, for each of them, its position in `weights`.
# NOTE(review): both lists are empty at this point, so `n` is 0 and `to_track`
# ends up with no rows. Presumably they were meant to be filled with a selection
# of the pairings found above (the cell numbering jumps after this cell) -
# confirm and populate them before relying on the exported table.
pairings_list = list()
index_list = list()

n = len(pairings_list)
# Number of timesteps for which weights were computed (0 when there is no combination at all)
T = len(weights[0]) if weights else 0

# One record per (timestep, tracked pairing), ordered by timestep and then by
# pairing. The records are collected first and the frame is built once:
# concatenating a one-row DataFrame per iteration copies the whole table every
# time and makes the loop quadratic.
records = list()
for t in tqdm(range(T)):
    for i in range(n):
        records.append({'t': t,
                        'Pairings': pairings_list[i],
                        'Weights': weights[index_list[i]][t]})

# Fixing the column list keeps the three columns even when there are no records.
to_track = pd.DataFrame(records, columns = ['t', 'Pairings', 'Weights']).astype({'t': 'int'})

In [11]:
# Write the tracking table to disk, leaving out the row index
to_track.to_csv(path_or_buf = 'to_track.csv', index = False)