# Unsupervised Learning Trading Strategy

Your KMeans clustering is totally useless when you're not normalizing the RSI values. In essence all the other features are just small noise in the clustering and you end up making the wrong conclusion that RSI is the main feature driving the clustering in your data.

In [1]:
from statsmodels.regression.rolling import RollingOLS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime as dt
import yfinance as yf
import pandas_datareader.data as web
import pandas_ta
import warnings
warnings.filterwarnings('ignore')
import time


Getting the companies' symbols from Wiki.
Caution : not survivalship bias free

In [2]:
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500.head()


Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [3]:
sp500.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Symbol                 503 non-null    object
 1   Security               503 non-null    object
 2   GICS Sector            503 non-null    object
 3   GICS Sub-Industry      503 non-null    object
 4   Headquarters Location  503 non-null    object
 5   Date added             503 non-null    object
 6   CIK                    503 non-null    int64 
 7   Founded                503 non-null    object
dtypes: int64(1), object(7)
memory usage: 31.6+ KB


replacing `.` with `-`

In [4]:
sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')


In [5]:
symbols_list = sp500['Symbol'].unique().tolist()
symbols_list[:10]


['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADM', 'ADBE', 'ADP', 'AES', 'AFL']

defining time limits

In [6]:
end_date = '2023-11-18' #'2023-09-27'
start_date = pd.to_datetime(end_date) - pd.DateOffset(365*8) #'2023-10-27'
start_date


Timestamp('2015-11-20 00:00:00')

In [7]:
original_df = yf.download(tickers=symbols_list,
                 start=start_date,
                 end=end_date)

#time.sleep(1)
original_df.head()


[*********************100%***********************]  503 of 503 completed


Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-11-20 00:00:00-05:00,36.7747,40.328243,27.152472,43.255482,,39.607716,24.686666,95.005859,91.809998,50.314316,...,3778300,3883400,10144400,880900,1300900,5928998,1692187,303500,2092400,3679400
2015-11-23 00:00:00-05:00,37.47686,40.423801,26.799694,43.283794,,39.117043,24.643333,94.891487,91.959999,48.095818,...,2427400,2341500,11868400,616000,870600,3554005,1031648,320300,1438700,3265300
2015-11-24 00:00:00-05:00,38.076046,39.401268,27.056885,43.269634,,39.125645,24.533333,94.460297,92.0,51.161068,...,5628300,4416300,15055500,917900,1044000,3045455,1313868,431800,1608600,1713500
2015-11-25 00:00:00-05:00,38.703312,39.477711,26.863422,42.675053,,39.091217,24.216667,94.240334,91.769997,51.017117,...,2358500,2200100,8980400,365000,916200,3222808,1008576,292100,1086200,2196800
2015-11-27 00:00:00-05:00,39.293114,39.840858,26.813353,42.469788,,39.099819,24.306667,94.557098,92.169998,51.186481,...,2285100,1468000,4156600,293200,514200,2558884,917730,175700,763200,1155100


In [9]:
# getting rid of multi index
df = original_df.stack()
df


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-11-20 00:00:00-05:00,A,36.774700,39.279999,39.349998,38.520000,38.779999,5447900.0
2015-11-20 00:00:00-05:00,AAL,40.328243,42.200001,42.709999,42.049999,42.400002,5656000.0
2015-11-20 00:00:00-05:00,AAPL,27.152472,29.825001,29.980000,29.712500,29.799999,137148400.0
2015-11-20 00:00:00-05:00,ABBV,43.255482,61.110001,61.360001,60.570000,60.939999,7490200.0
2015-11-20 00:00:00-05:00,ABT,39.607716,46.009998,46.380001,45.840000,46.029999,7140700.0
...,...,...,...,...,...,...,...
2023-11-17 00:00:00-05:00,YUM,127.660004,127.660004,128.490005,127.250000,128.419998,1089600.0
2023-11-17 00:00:00-05:00,ZBH,111.669998,111.669998,112.660004,111.330002,112.349998,2991400.0
2023-11-17 00:00:00-05:00,ZBRA,218.020004,218.020004,218.699997,215.270004,218.460007,340600.0
2023-11-17 00:00:00-05:00,ZION,36.070000,36.070000,36.470001,35.400002,36.250000,2136200.0


In [11]:
df.index.names = ['date','ticker']
df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-11-20 00:00:00-05:00,A,36.7747,39.279999,39.349998,38.52,38.779999,5447900.0
2015-11-20 00:00:00-05:00,AAL,40.328243,42.200001,42.709999,42.049999,42.400002,5656000.0
2015-11-20 00:00:00-05:00,AAPL,27.152472,29.825001,29.98,29.7125,29.799999,137148400.0
2015-11-20 00:00:00-05:00,ABBV,43.255482,61.110001,61.360001,60.57,60.939999,7490200.0
2015-11-20 00:00:00-05:00,ABT,39.607716,46.009998,46.380001,45.84,46.029999,7140700.0


In [12]:
df.columns = df.columns.str.lower()
df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-11-20 00:00:00-05:00,A,36.7747,39.279999,39.349998,38.52,38.779999,5447900.0
2015-11-20 00:00:00-05:00,AAL,40.328243,42.200001,42.709999,42.049999,42.400002,5656000.0
2015-11-20 00:00:00-05:00,AAPL,27.152472,29.825001,29.98,29.7125,29.799999,137148400.0
2015-11-20 00:00:00-05:00,ABBV,43.255482,61.110001,61.360001,60.57,60.939999,7490200.0
2015-11-20 00:00:00-05:00,ABT,39.607716,46.009998,46.380001,45.84,46.029999,7140700.0


In [13]:
#saving data for futher tests and avoiding being blocked
df.to_csv('assets_stack.csv')


## Calculate features and technical indicators for each stock


* garman-klass volatility
* RSI
* Bollinger Bands
* ATR
* MACD
* Dollar Volume

$$ \sigma_{GK} = \sqrt{\frac{1}{N-1} \sum_{i=1}^{N} \left(\log\left(\frac{H_i}{L_i}\right)^2 - \frac{2\log\left(\frac{C_i}{O_i}\right)\log\left(\frac{H_i}{L_i}\right)}{N} + \log\left(\frac{C_i}{O_i}\right)^2\right)}


* N is the number of data points i
* H\_i  is the high price of the asset at time i
* L\_i is the low price of the asset at time i
* O\_i is the open price of the asset at time i
* C\_i is the close price of the asset at time i

In [42]:
df['garman_klass_vol'] = ((np.log(df.high) - np.log(df.low))**2)/2 - (2*np.log(2)-1)*(np.log(df['adj close']) - np.log(df.open))**2
df


Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,close,high,low,open,volume,garman_klass_vol
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-09-29 00:00:00-04:00,A,31.588039,33.740002,34.060001,33.240002,33.360001,2252400.0,-0.000854
2015-09-29 00:00:00-04:00,AAL,37.361618,39.180000,39.770000,38.790001,39.049999,7478800.0,-0.000443
2015-09-29 00:00:00-04:00,AAPL,24.716068,27.264999,28.377501,26.965000,28.207500,293461600.0,-0.005441
2015-09-29 00:00:00-04:00,ABBV,37.024612,52.790001,54.189999,51.880001,53.099998,12842800.0,-0.049280
2015-09-29 00:00:00-04:00,ABT,33.807266,39.500000,40.150002,39.029999,39.259998,12287500.0,-0.008237
...,...,...,...,...,...,...,...,...
2023-09-26 00:00:00-04:00,YUM,124.010002,124.010002,124.739998,123.449997,124.239998,1500600.0,0.000053
2023-09-26 00:00:00-04:00,ZBH,112.216316,112.459999,117.110001,112.419998,116.769997,3610500.0,0.000224
2023-09-26 00:00:00-04:00,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0,0.000133
2023-09-26 00:00:00-04:00,ZION,33.581326,33.990002,34.700001,33.840000,33.840000,1586100.0,0.000292


In [48]:
df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))


In [51]:
# normalizing indicator to lower/middle/high band
pandas_ta.bbands(close=df.xs('AAPL', level=1)['adj close'], length=20)


Unnamed: 0_level_0,BBL_20_2.0,BBM_20_2.0,BBU_20_2.0,BBB_20_2.0,BBP_20_2.0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-09-29 00:00:00-04:00,,,,,
2015-09-30 00:00:00-04:00,,,,,
2015-10-01 00:00:00-04:00,,,,,
2015-10-02 00:00:00-04:00,,,,,
2015-10-05 00:00:00-04:00,,,,,
...,...,...,...,...,...
2023-09-20 00:00:00-04:00,170.438581,180.107718,189.776855,10.737060,0.249274
2023-09-21 00:00:00-04:00,169.695552,179.748691,189.801830,11.185772,0.199222
2023-09-22 00:00:00-04:00,169.479507,179.669295,189.859083,11.342826,0.249294
2023-09-25 00:00:00-04:00,169.230209,179.542961,189.855713,11.487782,0.320871


In [55]:
df['bb_low'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=np.log1p(x), length=20).iloc[:,0])
df


IndexingError: Too many indexers

In [None]:
df['bb_mid'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=np.log1p(x), length=20)).iloc[:,1]
df['bb_high'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=np.log1p(x), length=20)).iloc[:,2]
