In [65]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.vector_ar.vecm import *
from patsy import dmatrices
import itertools

In [66]:
class PairsTrade:
    '''Class for Backtesting a Pairs Trading strategy using the cointegration method of pair creation with speed of adjustment filtering'''

    def __init__(self, start, end, formation_period, trading_period, filter): # Constructor Method
        
        self.start = start
        self.end = end
        self.formation_period = slice(*formation_period)
        self.trading_period = slice(*trading_period)
        self.filter = filter
        self.get_data()
        self.get_close_prices()
        self.clean_data()  
        self.intorder()
        self.normalize_price_series()
        self.cointegration_testing()
        self.speedfiltering()

    def get_data(self):

        self.table = pd.read_html('https://en.wikipedia.org/wiki/FTSE_100_Index')[4]
        self.tickers_raw = self.table.iloc[:,1]
        self.tickers_Series = self.tickers_raw + ".L"
        self.tickers = self.tickers_Series.tolist()
        self.data = yf.download(self.tickers, start = self.start, end = self.end)
        if self.data.empty:
            print(f'No Data pulled for period {self.start} : {self.end}')

        else:
            swapped_columnindex = self.data.columns.swaplevel(0,1)
            data_copy = self.data.copy()
            data_copy.columns = swapped_columnindex
            data_copy = data_copy.sort_index(axis = 1, level = 0)
            self.data = data_copy
            self.data.columns = self.data.columns.set_levels(self.data.columns.levels[0].str.replace('.L', "", regex = False), level = 0)
            print("Data successfully retrieved")
            print('=' * 55)

    def plot_stock(self,symbol, cols=None):
        self.symbol = symbol
        if cols is None:
            cols = "Close"
        self.data.xs(symbol, level = 0, axis = 1)[cols].plot(figsize = (10,6), title = (f'A Graph of {self.symbol} {cols} Prices'))

    def get_close_prices(self):
        self.df = self.data.xs('Close', level=1, axis=1)

    
    def get_date_price(self, symbol , bar):
        rawdate = pd.to_datetime(bar)
        if rawdate in self.df.index:
            price = self.df.loc[rawdate, symbol]
            return str(rawdate.date()), price
        else:
            print(f"Date {rawdate} not found in data.")

    def clean_data(self):
        initial_column_count = self.df.shape[1]
        self.df_cleaned = self.df.dropna(axis=1, how = 'any')
        dropped_columns_count = initial_column_count - self.df_cleaned.shape[1]
        print(f"Number of columns dropped during cleaning for stationarity testing: {dropped_columns_count}")
        total_nans_cleaned = self.df_cleaned.isna().sum().sum()
        print(f"Total Number of NANs in the entire DataFrame after column removal: {total_nans_cleaned}")
        self.df = self.df_cleaned

    # Order of Integration Testing

    def intorder(self):
        def adf_test(series):
            result = sm.tsa.stattools.adfuller(series, autolag = 'BIC')
            return result[1]
        self.adfp = self.df.apply(adf_test, axis = 0) > 0.05
        diff_series = self.df.loc[:, self.adfp].diff().dropna()
        self.adfp2 = diff_series.apply(adf_test, axis = 0) > 0.05
        adfp2_alligned = self.adfp2.reindex(self.df.columns, fill_value = True)
        self.i1_prices = self.df.loc[:, ~adfp2_alligned]
        print(f" {len(self.i1_prices.columns)} I(1) Stocks")
        print("=" * 55)


    def normalize_price_series(self):
        self.norm_i1prices = self.i1_prices/ self.i1_prices.iloc[0]

    # Cointegration testing

    def cointegration_testing(self):
        self.formation_prices = self.norm_i1prices[self.formation_period]
        self.trading_prices = self.norm_i1prices[self.trading_period]

        self.stock_combinations = list(itertools.combinations(self.norm_i1prices.columns, 2))
        self.cointegrated_pairs = []
        self.cointegration_results = {
            'Stock 1': [],
            'Stock 2': [],
            'Alpha': [],
            'Average Alpha' : [],
            'Beta': []
        }
        for stock1 , stock2 in self.stock_combinations:

            Pairs_close = self.formation_prices[[stock1, stock2]]
            Pairs_close = Pairs_close.asfreq('B').ffill()

            #lag selection 
            var_model = sm.tsa.VAR(Pairs_close)
            lag_selection = var_model.select_order(maxlags = 15, trend = 'c')
            optimal_lag = lag_selection.bic        
            # Serial autocorrelation test - issue need to return
            #self.residuals = self.var_result.resid
            #out = sm.stats.stattools.durbin_watson(var_model.resid)
            #print(out) 

            coint_johansen_result = coint_johansen(Pairs_close, 0, optimal_lag)
            trace_stat = coint_johansen_result.lr1
            trace_critical_value = coint_johansen_result.cvt
           
            if trace_stat[0] > trace_critical_value[1,2]:
                print(f"For Stock Pair {stock1}, {stock2}:")
                print(f"Trace Statistic: {trace_stat}")
                print(f"Trace Statistic Critical Values (10%, 5%, 1%):\n{trace_critical_value}")
                print(f"Stock Pair {stock1}, {stock2} is cointegrated")
                print("=" * 55)
                self.cointegrated_pairs.append((stock1, stock2))

                model = VECM(Pairs_close, deterministic = 'ci', seasons = 0, k_ar_diff= optimal_lag , coint_rank= 1) # rank n.b
                vecm_result= model.fit()

                       
                self.cointegration_results['Stock 1'].append(stock1)
                self.cointegration_results['Stock 2'].append(stock2)
                self.cointegration_results['Alpha'].append(vecm_result.alpha)
                self.cointegration_results['Average Alpha'] =  [(alpha[0] + alpha[1]) / 2 for alpha in self.cointegration_results['Alpha']] # magnitude
                self.cointegration_results['Beta'].append(vecm_result.beta)
                
        self.cointegration_df = pd.DataFrame(self.cointegration_results)
        self.cointegrated_pairs = self.cointegration_df.iloc[:, :2]
        
        print(f"Out of {len(self.stock_combinations)}  possible pairs there are {len(self.cointegration_df)} cointegrated pairs")


    def speedfiltering(self):
        self.filtered_pairs = self.cointegration_df[self.cointegration_df['Average Alpha'] > self.filter]

        print(f"Out of {len(self.cointegration_df)} Cointegrated Pairs there are {len(self.filtered_pairs)} Pairs with a Speed of Adjustment Coefficient greater than {self.filter}.")
        

    # def signals

    # need to calculate spread 
    # def run strategy

    #calc returns
    

    










            









        
        



        

        



        


In [67]:
self = PairsTrade(start = "2022-01-01", end = "2023-12-25", formation_period=("2022-01-01", "2023-01-01"), trading_period=("2023-01-01", "2023-12-25"), filter = 0.01 )

[*********************100%%**********************]  100 of 100 completed


Data successfully retrieved
Number of columns dropped during cleaning for stationarity testing: 16
Total Number of NANs in the entire DataFrame after column removal: 0
 64 I(1) Stocks
For Stock Pair ABF, ADM:
Trace Statistic: [14.94021085  4.79066259]
Trace Statistic Critical Values (10%, 5%, 1%):
[[13.4294 15.4943 19.9349]
 [ 2.7055  3.8415  6.6349]]
Stock Pair ABF, ADM is cointegrated
For Stock Pair ABF, AHT:
Trace Statistic: [13.04232355  3.04813952]
Trace Statistic Critical Values (10%, 5%, 1%):
[[13.4294 15.4943 19.9349]
 [ 2.7055  3.8415  6.6349]]
Stock Pair ABF, AHT is cointegrated
For Stock Pair ABF, ANTO:
Trace Statistic: [7.34989407 2.2494794 ]
Trace Statistic Critical Values (10%, 5%, 1%):
[[13.4294 15.4943 19.9349]
 [ 2.7055  3.8415  6.6349]]
Stock Pair ABF, ANTO is cointegrated
For Stock Pair ABF, AUTO:
Trace Statistic: [14.20567079  3.21138024]
Trace Statistic Critical Values (10%, 5%, 1%):
[[13.4294 15.4943 19.9349]
 [ 2.7055  3.8415  6.6349]]
Stock Pair ABF, AUTO is coi

In [70]:
cointegration_results = self.cointegration_df
pairs = self.cointegrated_pairs
filtered = self.filtered_pairs

In [69]:
# obtain speed of adjustment coefficients and filter based on speed

# n.b the high percentage of cointegrated pairs

# trading strategy / signals

# back test

help(VECM)


Help on class VECM in module statsmodels.tsa.vector_ar.vecm:

class VECM(statsmodels.tsa.base.tsa_model.TimeSeriesModel)
 |  VECM(endog, exog=None, exog_coint=None, dates=None, freq=None, missing='none', k_ar_diff=1, coint_rank=1, deterministic='n', seasons=0, first_season=0)
 |
 |  Class representing a Vector Error Correction Model (VECM).
 |
 |  A VECM(:math:`k_{ar}-1`) has the following form
 |
 |  .. math:: \Delta y_t = \Pi y_{t-1} + \Gamma_1 \Delta y_{t-1} + \ldots + \Gamma_{k_{ar}-1} \Delta y_{t-k_{ar}+1} + u_t
 |
 |  where
 |
 |  .. math:: \Pi = \alpha \beta'
 |
 |  as described in chapter 7 of [1]_.
 |
 |  Parameters
 |  ----------
 |  endog : array_like (nobs_tot x neqs)
 |      2-d endogenous response variable.
 |  exog : ndarray (nobs_tot x neqs) or None
 |      Deterministic terms outside the cointegration relation.
 |  exog_coint : ndarray (nobs_tot x neqs) or None
 |      Deterministic terms inside the cointegration relation.
 |  dates : array_like of datetime, optional
 