In [1]:
import pandas as pd
import numpy as np 
from datetime import datetime,timezone
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf


In [2]:
#getting GME daily stock data before the squeeze 
GME = yf.download(["GME"], start="2020-07-01", end="2020-12-31", interval = "1d", auto_adjust = True)
print(GME.head())
print(GME.shape)

[*********************100%***********************]  1 of 1 completed

              Open    High     Low   Close   Volume
Date                                               
2020-07-01  1.0775  1.1250  1.0775  1.1100  9214800
2020-07-02  1.1225  1.1275  1.0725  1.0725  7550400
2020-07-06  1.0775  1.0850  1.0475  1.0600  8563600
2020-07-07  1.0500  1.0625  1.0150  1.0225  9826400
2020-07-08  1.0250  1.0725  1.0075  1.0650  8211200
(127, 5)





In [3]:
#getting stock data based on BICS Best Fit Algo for 6 months before the squeeze

bics_data = yf.download(["CRCT","EVGO","LESL","EYE", "FCFS","FRPT","WRBY", "SBH", "ODP"
                         ], start="2020-07-01", end="2020-12-31", interval = "1d", auto_adjust = True)
print(bics_data.head())
print(bics_data.shape)

[*********************100%***********************]  9 of 9 completed

2 Failed downloads:
['CRCT', 'WRBY']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2020-07-01 -> 2020-12-31) (Yahoo error = "Data doesn\'t exist for startDate = 1593576000, endDate = 1609390800")')


Price                     Adj Close      Close                             \
Ticker                         CRCT WRBY  CRCT EVGO        EYE       FCFS   
Date                                                                        
2020-07-01 00:00:00+00:00       NaN  NaN   NaN  NaN  30.680000  61.733814   
2020-07-02 00:00:00+00:00       NaN  NaN   NaN  NaN  30.620001  63.386295   
2020-07-06 00:00:00+00:00       NaN  NaN   NaN  NaN  30.350000  64.804077   
2020-07-07 00:00:00+00:00       NaN  NaN   NaN  NaN  29.150000  63.048294   
2020-07-08 00:00:00+00:00       NaN  NaN   NaN  NaN  29.510000  61.996719   

Price                                                        ... Open Volume  \
Ticker                          FRPT LESL        ODP    SBH  ... WRBY   CRCT   
Date                                                         ...               
2020-07-01 00:00:00+00:00  87.370003  NaN  22.660000  12.92  ...  NaN    NaN   
2020-07-02 00:00:00+00:00  88.570000  NaN  21.889999  13.08  ..

In [4]:
#getting ticker data based on SIC classification 6 months before the squeeze
sic_data = yf.download(["MRDH","CONNQ","SVSN","CFGX","HGGG","KLGG","BBY"
                         ], start="2020-07-01", end="2020-12-31", interval = "1d", auto_adjust = True)
print(sic_data.head())
print(sic_data.shape)

[*********************100%***********************]  7 of 7 completed

Price                          Close                                         \
Ticker                           BBY    CFGX  CONNQ     HGGG    KLGG   MRDH   
Date                                                                          
2020-07-01 00:00:00+00:00  74.355042  0.0001   9.98  0.00485  0.0013  0.015   
2020-07-02 00:00:00+00:00  74.440170  0.0001   9.78  0.00485  0.0013  0.015   
2020-07-06 00:00:00+00:00  74.652992  0.0001  10.38  0.00500  0.0013  0.015   
2020-07-07 00:00:00+00:00  72.856880  0.0001   9.71  0.00450  0.0013  0.015   
2020-07-08 00:00:00+00:00  74.210342  0.0001   9.80  0.00450  0.0013  0.015   

Price                                 High                 ...    Open         \
Ticker                     SVSN        BBY    CFGX  CONNQ  ...    KLGG   MRDH   
Date                                                       ...                  
2020-07-01 00:00:00+00:00  0.01  75.010493  0.0001  10.33  ...  0.0013  0.015   
2020-07-02 00:00:00+00:00  0.01  75.980903 




In [5]:
#reset tabular data -

df_bics = bics_data.reset_index()
new_columns = [f"{ticker}_{metric}" for ticker, metric in bics_data.columns]
df_bics.columns = ['Date'] + new_columns
df_bics= df_bics.dropna(axis=1, how='any')
df_bics.head()


Unnamed: 0,Date,Close_EYE,Close_FCFS,Close_FRPT,Close_ODP,Close_SBH,High_EYE,High_FCFS,High_FRPT,High_ODP,...,Open_EYE,Open_FCFS,Open_FRPT,Open_ODP,Open_SBH,Volume_EYE,Volume_FCFS,Volume_FRPT,Volume_ODP,Volume_SBH
0,2020-07-01 00:00:00+00:00,30.68,61.733814,87.370003,22.66,12.92,31.530001,64.494233,89.610001,22.870001,...,30.530001,63.724322,83.900002,20.950001,12.69,532800,182200,413000,1055500,2572600
1,2020-07-02 00:00:00+00:00,30.620001,63.386295,88.57,21.889999,13.08,31.719999,63.73369,89.714996,23.379999,...,31.440001,62.691492,88.540001,21.209999,13.15,455700,287800,277300,2908000,1148900
2,2020-07-06 00:00:00+00:00,30.35,64.804077,87.610001,21.23,13.47,31.85,65.123313,90.0,22.49,...,31.620001,64.513015,90.0,22.389999,14.72,471700,281000,303700,2559300,2091300
3,2020-07-07 00:00:00+00:00,29.15,63.048294,86.660004,20.74,13.05,30.35,64.653842,88.339996,21.65,...,30.129999,64.390946,87.239998,21.379999,13.22,526300,269200,164700,2224000,1198400
4,2020-07-08 00:00:00+00:00,29.51,61.996719,88.550003,22.07,13.09,29.84,63.536543,89.199997,22.290001,...,29.15,62.682132,87.110001,20.68,12.91,541900,194300,165300,1110500,1251800


In [6]:
df_sic = sic_data.reset_index()

new_columns = []
for ticker in sic_data.columns.levels[0]:
    for metric in sic_data.columns.levels[1]:
        new_columns.append(f"{ticker}_{metric}")

df_sic.columns = ['Date'] + new_columns
df_sic= df_sic.dropna(axis=1, how='any')
df_sic.head()

Unnamed: 0,Date,Close_BBY,Close_CFGX,Close_CONNQ,Close_HGGG,Close_KLGG,Close_MRDH,Close_SVSN,High_BBY,High_CFGX,...,Open_KLGG,Open_MRDH,Open_SVSN,Volume_BBY,Volume_CFGX,Volume_CONNQ,Volume_HGGG,Volume_KLGG,Volume_MRDH,Volume_SVSN
0,2020-07-01 00:00:00+00:00,74.355042,0.0001,9.98,0.00485,0.0013,0.015,0.01,75.010493,0.0001,...,0.0013,0.015,0.01,2278900,6200,544300,4243,0,0,0
1,2020-07-02 00:00:00+00:00,74.44017,0.0001,9.78,0.00485,0.0013,0.015,0.01,75.980903,0.0001,...,0.0013,0.015,0.01,2489900,6000000,646500,0,0,0,0
2,2020-07-06 00:00:00+00:00,74.652992,0.0001,10.38,0.005,0.0013,0.015,0.01,76.380997,0.0001,...,0.0013,0.015,0.01,2050000,0,491100,295,0,0,0
3,2020-07-07 00:00:00+00:00,72.85688,0.0001,9.71,0.0045,0.0013,0.015,0.01,74.508271,0.0001,...,0.0013,0.015,0.01,2253100,0,552400,86064,0,0,1200
4,2020-07-08 00:00:00+00:00,74.210342,0.0001,9.8,0.0045,0.0013,0.015,0.01,74.244392,0.0001,...,0.0013,0.015,0.01,2058400,2000000,299200,138350,0,0,5000


In [7]:
#calculating- mcap, returns and volatility 
GME = GME.reset_index()
GME['Mcap'] = GME['Volume']*(0.5*(GME['Open'] + GME['Close']))
GME['Date'] = pd.to_datetime(GME['Date'])
GME['Return'] = GME['Close'].pct_change().fillna(0) 

def calculate_intraday_volatility(df):
    volatility = np.sqrt(0.5 * (np.log(df['High']) - np.log(df['Low']))**2 
                         - (2 * np.log(2) - 1) * (np.log(df['Close']) - np.log(df['Open']))**2)
    return volatility
GME['Volatility'] = calculate_intraday_volatility(GME) # Garman and Klass volatility formula

In [8]:
GME.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Mcap,Return,Volatility
0,2020-07-01,1.0775,1.125,1.0775,1.11,9214800,10078690.0,0.0,0.024277
1,2020-07-02,1.1225,1.1275,1.0725,1.0725,7550400,8286564.0,-0.033784,0.021177
2,2020-07-06,1.0775,1.085,1.0475,1.06,8563600,9152347.0,-0.011655,0.022694
3,2020-07-07,1.05,1.0625,1.015,1.0225,9826400,10182610.0,-0.035377,0.027817
4,2020-07-08,1.025,1.0725,1.0075,1.065,8211200,8580704.0,0.041565,0.037259


In [9]:
#merge bics and sic columns
match_df = pd.merge(df_sic, df_bics, on = 'Date', how = "inner")
match_df.columns

Index(['Date', 'Close_BBY', 'Close_CFGX', 'Close_CONNQ', 'Close_HGGG',
       'Close_KLGG', 'Close_MRDH', 'Close_SVSN', 'High_BBY', 'High_CFGX',
       'High_CONNQ', 'High_HGGG', 'High_KLGG', 'High_MRDH', 'High_SVSN',
       'Low_BBY', 'Low_CFGX', 'Low_CONNQ', 'Low_HGGG', 'Low_KLGG', 'Low_MRDH',
       'Low_SVSN', 'Open_BBY', 'Open_CFGX', 'Open_CONNQ', 'Open_HGGG',
       'Open_KLGG', 'Open_MRDH', 'Open_SVSN', 'Volume_BBY', 'Volume_CFGX',
       'Volume_CONNQ', 'Volume_HGGG', 'Volume_KLGG', 'Volume_MRDH',
       'Volume_SVSN', 'Close_EYE', 'Close_FCFS', 'Close_FRPT', 'Close_ODP',
       'Close_SBH', 'High_EYE', 'High_FCFS', 'High_FRPT', 'High_ODP',
       'High_SBH', 'Low_EYE', 'Low_FCFS', 'Low_FRPT', 'Low_ODP', 'Low_SBH',
       'Open_EYE', 'Open_FCFS', 'Open_FRPT', 'Open_ODP', 'Open_SBH',
       'Volume_EYE', 'Volume_FCFS', 'Volume_FRPT', 'Volume_ODP', 'Volume_SBH'],
      dtype='object')

In [10]:
#same for sic and bics data - 

tickers = ['EYE', 'FCFS', 'FRPT', 'ODP','SBH','BBY','CFGX','CONNQ','HGGG', 'SVSN','KLGG','MRDH']

# Function to calculate Garman-Klass volatility for a single ticker
def calculate_intraday_volatility(df, ticker):
    high = df[f'High_{ticker}']
    low = df[f'Low_{ticker}']
    close = df[f'Close_{ticker}']
    open_price = df[f'Open_{ticker}']
    
    volatility = np.sqrt(0.5 * (np.log(high) - np.log(low))**2
                         - (2 * np.log(2) - 1) * (np.log(close) - np.log(open_price))**2)
    return volatility

for ticker in tickers:
    match_df[f'Volatility_{ticker}'] = calculate_intraday_volatility(match_df, ticker)
    match_df[f'Mcap_{ticker}'] = (match_df[f'Close_{ticker}'] + match_df[f'Open_{ticker}'])* match_df[f'Volume_{ticker}'] * 0.5
    match_df[f'Return_{ticker}'] = match_df[f'Close_{ticker}'].pct_change().fillna(0)




In [11]:
#subsetting data - 
subset_match_df = match_df[['Date'] + [col for col in match_df.columns if col.startswith(('Volatility', 'Return', 'Mcap'))]]
sub_gme = GME[['Date','Return', 'Mcap', 'Volatility']]

In [12]:
#merge for matching 
sub_gme['Date'] = sub_gme['Date'].dt.tz_localize('UTC')
merge_df = pd.merge(sub_gme,subset_match_df, on = 'Date', how = "inner")
merge_df.rename(columns={'Return': 'Return_GME', 'Mcap': 'Mcap_GME','Volatility':'Volatility_GME'}, inplace=True)
merge_df = merge_df.drop('Date', axis=1)
merge_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_gme['Date'] = sub_gme['Date'].dt.tz_localize('UTC')


Unnamed: 0,Return_GME,Mcap_GME,Volatility_GME,Volatility_EYE,Mcap_EYE,Return_EYE,Volatility_FCFS,Mcap_FCFS,Return_FCFS,Volatility_FRPT,...,Return_HGGG,Volatility_SVSN,Mcap_SVSN,Return_SVSN,Volatility_KLGG,Mcap_KLGG,Return_KLGG,Volatility_MRDH,Mcap_MRDH,Return_MRDH
0,0.0,10078690.0,0.024277,0.02469,16306340.0,0.0,0.02466,11429240.0,0.0,0.039155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.033784,8286564.0,0.021177,0.028181,14140370.0,-0.001956,0.016111,18142590.0,0.026768,0.017595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.011655,9152347.0,0.022694,0.032595,14615620.0,-0.008818,0.014357,18169050.0,0.022367,0.020758,...,0.030928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.035377,10182610.0,0.027817,0.021159,15599530.0,-0.039539,0.012793,17153320.0,-0.027094,0.01785,...,-0.1,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.041565,8580704.0,0.037259,0.029026,15893930.0,0.01235,0.023927,12112550.0,-0.016679,0.015407,...,0.0,0.0,49.999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.preprocessing import MinMaxScaler
#normalising variables
scaler = MinMaxScaler()
merge_df_scaled = pd.DataFrame(scaler.fit_transform(merge_df), columns=merge_df.columns)

In [29]:
def smooth_and_calculate_distances(df, tickers):
    # Apply a 3-day moving average for smoothing
    for stock in ['GME'] + tickers:
        df[f'{stock}_mcap_smooth'] = df[f'Mcap_{stock}'].rolling(window=3).mean()
        df[f'{stock}_return_smooth'] = df[f'Return_{stock}'].rolling(window=3).mean()
        df[f'{stock}_volatility_smooth'] = df[f'Volatility_{stock}'].rolling(window=3).mean()
        
    total_differences = {}
    for ticker in tickers:
        df[f'total_diff_{ticker}'] = (abs(df['GME_mcap_smooth'] - df[f'{ticker}_mcap_smooth']) +
                                      abs(df['GME_return_smooth'] - df[f'{ticker}_return_smooth']) +
                                      abs(df['GME_volatility_smooth'] - df[f'{ticker}_volatility_smooth']))
        total_differences[ticker] = df[f'total_diff_{ticker}'].sum()
    closest_stocks = sorted(total_differences.items(), key=lambda x: x[1])[:3] #taking 10 closest stocks
    return closest_stocks

closest_stocks = smooth_and_calculate_distances(merge_df_scaled, tickers)
print("Top 3 closest stocks:", closest_stocks)

Top 3 closest stocks: [('CFGX', 43.82159425718064), ('ODP', 45.241845492543305), ('FCFS', 51.23813966305892), ('HGGG', 51.24204148678723), ('MRDH', 52.51732693229446)]
