In [43]:
%%capture
%pip install pandas numpy matplotlib statsmodels pandas_datareader datetime yfinance scikit-learn PyPortfolioOpt pandas_ta

In [44]:
%%capture
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Define some global variables

global timeframe
timeframe = 2

In [46]:
# FTSE250 loading and manipulating

def load(
        end_date: str = '2024-09-10',
        timeframe: int = 2
) -> pd.DataFrame:
    
    # Extract a list of current FTSE250 tickers for end_date and timeframe
    # Dependant on website layout and column names
    ftse250 = pd.read_html('https://en.wikipedia.org/wiki/FTSE_250_Index')[3]
    tickers_ftse250 = ftse250['Ticker'].str.replace('.', '-').unique().tolist()

    # Create dataframe from yfinance api for each constituent in index
    # Use a daily frequency over timeframe from end_date

    start_date = pd.to_datetime(end_date)-pd.DateOffset(365*timeframe)

    df =  yf.download(tickers=tickers_ftse250,
                    start=start_date,
                    end=end_date).stack()

    return df



In [48]:
# Manipulate df, calculate features and technical indicators for each stock

def enrich_df(df: pd.DataFrame) -> pd.DataFrame:

    # Assign new names for multi-index df
    df.index.names = ['date', 'ticker']

    # Assign column headers to strings (easier for manipulation later)
    df.columns = df.columns.str.lower()

    # Convert the `date` index to just the date part
    df.index = df.index.set_levels([pd.to_datetime(df.index.levels[0]).date, df.index.levels[1]])

    # Append columns with useful financial metrics
    df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)
    df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))
    df['bb_low'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,0])                                                        
    df['bb_mid'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,1])                                                       
    df['bb_high'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,2])

    def compute_atr(stock_data):
        atr = pandas_ta.atr(high=stock_data['high'],
                            low=stock_data['low'],
                            close=stock_data['close'],
                            length=14)
        return atr.sub(atr.mean()).div(atr.std())

    df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

    def compute_macd(close):
        macd = pandas_ta.macd(close=close, length=20).iloc[:,0]
        return macd.sub(macd.mean()).div(macd.std())

    df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)
    df['dollar_volume'] = (df['adj close']*df['volume'])/1e6

    # Filter down to the useful columns
    filter = ['volume', 'open', 'high', 'low', 'close']
    df.drop(columns=filter,inplace=True)

    # Could consider liquidity issue or training data size, okay for now

    # Compute the returns over all months in timeperiod
    def calculate_returns(df):

        outlier_cutoff = 0.005

        # Use sparingly given size of dataframe this could create
        lags = np.arange(1,12*timeframe)

        for lag in lags:

            df[f'return_{lag}m'] = (df['adj close']
                                .pct_change(lag)
                                .pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff),
                                                        upper=x.quantile(1-outlier_cutoff)))
                                .add(1)
                                .pow(1/lag)
                                .sub(1))
        return df

    df = calculate_returns(df)

    return df.dropna()


In [49]:
df = load(timeframe=timeframe)
data = enrich_df(df)
data

Unnamed: 0_level_0,Price,adj close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,dollar_volume,return_1m,...,return_14m,return_15m,return_16m,return_17m,return_18m,return_19m,return_20m,return_21m,return_22m,return_23m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022-10-17,AGR,36.249561,-0.003303,29.504232,3.536812,3.675440,3.814068,2.339530,-4.940883,20.669500,0.622577,...,-0.008199,0.030319,0.109315,0.024318,0.030784,-0.030799,-0.007899,0.028405,0.025668,0.061818
2022-10-17,BAB,23.247581,-0.002820,33.551836,3.183611,3.209376,3.235141,2.444211,-2.543398,17.447310,-0.358680,...,0.039065,-0.036612,0.000238,0.074128,-0.001985,0.005357,-0.050605,-0.028298,0.006565,0.004940
2022-10-17,BBGI,21.000000,0.001190,37.851169,3.045831,3.123811,3.201792,-0.518957,-1.258803,0.032235,-0.096680,...,0.083762,0.029412,-0.040480,-0.005741,0.063843,-0.007208,-0.000008,-0.052852,-0.031516,0.001840
2022-10-17,BBH,147.312363,0.000046,53.270643,4.915640,4.962321,5.009002,2.554434,0.261974,3.078828,6.014874,...,0.212766,0.227457,0.160590,0.078637,0.108255,0.174871,0.094754,0.097194,0.037398,0.055554
2022-10-17,BBY,59.843304,-0.001768,41.030074,4.044408,4.113085,4.181762,1.571142,-1.694218,166.753366,-0.593766,...,0.322129,0.127484,0.145491,0.091091,0.021679,0.051234,0.114114,0.044274,0.048744,-0.004039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-09,TRN,30.316475,0.000031,42.522163,3.460251,3.493973,3.527695,0.713421,-0.527383,15.864611,-0.358653,...,0.013492,-0.074377,-0.064804,0.001090,0.008119,0.051614,0.117610,0.092717,-0.071970,-0.191434
2024-09-09,TRST,32.610001,0.000107,50.765138,3.477968,3.528955,3.579941,0.499514,0.523662,2.771850,0.075653,...,0.029656,0.017522,-0.065646,-0.057075,0.005094,0.011565,0.052803,0.115575,0.091935,-0.065995
2024-09-09,TRY,345.500000,0.000101,58.954848,5.805793,5.827415,5.849038,-0.351960,0.417381,190.383629,9.594909,...,0.187143,0.202779,0.177988,0.077819,0.078562,0.137740,0.137625,0.175161,0.235763,0.205329
2024-09-09,USA,6.830000,0.000031,52.101822,2.039968,2.059383,2.078798,-0.045731,-0.001159,4.852715,-0.980232,...,-0.004947,-0.096487,-0.069594,-0.073769,-0.136883,-0.126162,-0.070949,-0.062031,-0.023988,0.032404


In [50]:
# Fama-french data to estimate the exposure of assets to common risk factors using linear regression
# Five main factors are market risk, size, value, operating profitability and investment

def fama_french(data: pd.DataFrame) -> pd.DataFrame:
    factor_data = web.DataReader('F-F_Research_Data_5_Factors_2x3',
                                'famafrench',
                                start='2010')[0].drop('RF', axis=1)

    factor_data.index = factor_data.index.to_timestamp()

    factor_data = factor_data.resample('M').last().div(100)

    factor_data.index.name = 'date'

    factor_data = factor_data.join(data['return_1m']).sort_index()

    # Filter out stocks with less than 10 months of data

    observations = factor_data.groupby(level=1).size()

    valid_stocks = observations[observations >= 10]

    factor_data = factor_data[factor_data.index.get_level_values('ticker').isin(valid_stocks.index)]

    return factor_data


Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-10-31,AGR,0.0783,0.0188,0.0806,0.0331,0.0662,0.513542
2022-10-31,BAB,0.0783,0.0188,0.0806,0.0331,0.0662,-0.375210
2022-10-31,BBGI,0.0783,0.0188,0.0806,0.0331,0.0662,-0.087824
2022-10-31,BBH,0.0783,0.0188,0.0806,0.0331,0.0662,6.420268
2022-10-31,BBY,0.0783,0.0188,0.0806,0.0331,0.0662,-0.599448
...,...,...,...,...,...,...,...
2024-07-31,TRST,0.0124,0.0828,0.0573,0.0022,0.0043,0.073814
2024-07-31,TRY,0.0124,0.0828,0.0573,0.0022,0.0043,8.651162
2024-07-31,USA,0.0124,0.0828,0.0573,0.0022,0.0043,-0.979971
2024-07-31,WPS,0.0124,0.0828,0.0573,0.0022,0.0043,3.069016
