# Training and Testing the ML Model

### Setup i.e. Inputs 

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np

In [None]:
tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]['Symbol']
tickers = tickers.to_list()

In [None]:
df = yf.download(tickers,start= '2010-01-01')

### Process i.e. Funcaitons

In [None]:
def slice_by_ticker(ticker,df):
    unpivot = df.copy()
    unpivot = unpivot[unpivot.columns[unpivot.columns.get_level_values(1) == ticker]]
    unpivot.columns = unpivot.columns.droplevel(1)
    unpivot['Price'] = unpivot['Open'].shift(-1)
    return unpivot

In [None]:
def MA_calc(df,n,m):
    df[f'MA{n}'] = df['Adj Close'].rolling(n).mean()
    df[f'MA{m}'] = df['Adj Close'].rolling(m).mean()
    df.dropna(inplace= True)
    return df

In [None]:
cumrets = []

def vectorised (ticker,df,n,m):
    #create sub data frames based on the ticker syboml and feed that 
    #to moving average function that adds the 2 moving averages to dhte df
    z = pd.DataFrame(MA_calc(slice_by_ticker(ticker, df),n,m))

    #find the first signal that's a buy 
    first_signal = pd.Series(z.index == (z.iloc[:,-2] > z.iloc[:,-1]).idxmax(), index= z.index)

    #find the rest of the signals 
    Trade_signals = first_signal | (z.iloc[:,-2] > z.iloc[:,-1]).diff()
    Trades = z[Trade_signals]
    if len(Trades) % 2 != 0:
        temp = z.tail(1).copy()
        temp['Price'] = temp['Close']
        Trades = pd.concat([Trades, temp])

    #calculate returns and cumulated return
    Rets = Trades['Price'].diff()[1::2] / Trades['Price'][0::2].values
    cumret = (Rets + 1).prod()
    return cumret


In [None]:
cumrets = []
clean_tickers = []

for ticker in tickers:
    try:
        x = vectorised(ticker, df, 10, 50)
#        print(f"Cumuret for {ticker}: {x}")
        cumrets.append(x)
        clean_tickers.append(ticker)
    except ValueError:
        print(f'{ticker} one Goofy ahh stock')



### Outputs 

In [None]:
compare = pd.DataFrame({'Results': cumrets },index= clean_tickers)
compare

In [None]:
compare['Results'].nlargest(5)

### Trainign and Testing the model

In [27]:
trianing_data= df[:int(len(df)*0.7)]
testing_data= df[int(len(df)*0.7):]

cumrets_training = []
cumrets_testing = []
clean_tickers = []

for ticker in tickers:
    try:
        x = vectorised(ticker,trianing_data,10,50)
        y = vectorised(ticker,testing_data,10,50) 
        cumrets_training.append(x)
        cumrets_testing.append(y)
        clean_tickers.append(ticker)
    except ValueError:
        print(f"{ticker} is one Goofy ahh stock")


ABNB is one Goofy ahh stock
BRK.B is one Goofy ahh stock
BF.B is one Goofy ahh stock
CARR is one Goofy ahh stock
CEG is one Goofy ahh stock
GEHC is one Goofy ahh stock
GEV is one Goofy ahh stock
KVUE is one Goofy ahh stock
OTIS is one Goofy ahh stock
SW is one Goofy ahh stock
SOLV is one Goofy ahh stock
VLTO is one Goofy ahh stock


In [29]:
compare = pd.DataFrame({'Training Results':cumrets_training,
                        'Testing Results':cumrets_testing}, index= clean_tickers)
compare

Unnamed: 0,Training Results,Testing Results
MMM,1.387678,0.723151
AOS,1.767926,1.189897
ABT,1.475640,1.094646
ABBV,1.669231,1.486111
ACN,1.434463,1.206884
...,...,...
XYL,1.308778,1.665125
YUM,1.880603,0.946238
ZBRA,3.377136,1.215614
ZBH,1.386050,0.639163


### model looks to over-estimate the actual return by an order of magnitude 

In [32]:
compare.nlargest(5,'Training Results')

Unnamed: 0,Training Results,Testing Results
TYL,11.040408,0.966137
CSGP,10.612549,0.551288
NFLX,10.559905,1.558895
URI,9.797369,2.985338
FTNT,9.499673,1.017777


In [36]:

compare.nlargest(5,'Training Results').mean(0)

Training Results    10.301981
Testing Results      1.415887
dtype: float64