# Smart_Stock_ML: NN Model Test

## Load the data

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import hvplot.pandas
import yfinance as yf
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import joblib

# Load the externally prepared test dataset and preview its first rows
test_data_path = "data/Data_Predict.csv"
test_stocks_df = pd.read_csv(test_data_path)
test_stocks_df.head()

Unnamed: 0,Ticker,Year,GICS Sector,Gender,CEO Transition,Tenure Bucket,HQ_US_State_or_Country,Salary
0,GCI,2015,Communication Services,M,1,<= 2 years,Virginia,"$7,740,000"
1,GCI,2016,Communication Services,M,0,<= 2 years,Virginia,"$7,740,000"
2,GCI,2017,Communication Services,M,0,2 to 5 years,Virginia,"$7,740,000"
3,GCI,2018,Communication Services,M,0,2 to 5 years,Virginia,"$7,740,000"
4,GCI,2019,Communication Services,F,1,<= 2 years,Virginia,"$7,740,000"


In [2]:
# Summarize the raw test data: column names, non-null counts and dtypes
test_stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Ticker                  54 non-null     object
 1   Year                    54 non-null     int64 
 2   GICS Sector             54 non-null     object
 3   Gender                  54 non-null     object
 4   CEO Transition          54 non-null     int64 
 5   Tenure Bucket           54 non-null     object
 6   HQ_US_State_or_Country  54 non-null     object
 7   Salary                  54 non-null     object
dtypes: int64(2), object(6)
memory usage: 3.5+ KB


## Run data transformations

In [3]:
# Extract the salary number from its "$7,740,000"-style string and convert to integer.
# The pattern is a raw string: '\$' inside a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
# Casting through float first lets values with a decimal part parse before the
# final cast to int truncates them.
test_stocks_df['Salary'] = (
    test_stocks_df['Salary']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
    .astype(int)
)
test_stocks_df.head()

Unnamed: 0,Ticker,Year,GICS Sector,Gender,CEO Transition,Tenure Bucket,HQ_US_State_or_Country,Salary
0,GCI,2015,Communication Services,M,1,<= 2 years,Virginia,7740000
1,GCI,2016,Communication Services,M,0,<= 2 years,Virginia,7740000
2,GCI,2017,Communication Services,M,0,2 to 5 years,Virginia,7740000
3,GCI,2018,Communication Services,M,0,2 to 5 years,Virginia,7740000
4,GCI,2019,Communication Services,F,1,<= 2 years,Virginia,7740000


In [4]:
# Bucket salaries into right-closed intervals, e.g. (5000000, 10000000]
bin_edges = [
    0,
    5_000_000,
    10_000_000,
    15_000_000,
    20_000_000,
    25_000_000,
    50_000_000,
    300_000_000,
]
salary_buckets = pd.cut(test_stocks_df['Salary'], bins=bin_edges)
test_stocks_df['Salary_Bucket'] = salary_buckets
test_stocks_df.head()

Unnamed: 0,Ticker,Year,GICS Sector,Gender,CEO Transition,Tenure Bucket,HQ_US_State_or_Country,Salary,Salary_Bucket
0,GCI,2015,Communication Services,M,1,<= 2 years,Virginia,7740000,"(5000000, 10000000]"
1,GCI,2016,Communication Services,M,0,<= 2 years,Virginia,7740000,"(5000000, 10000000]"
2,GCI,2017,Communication Services,M,0,2 to 5 years,Virginia,7740000,"(5000000, 10000000]"
3,GCI,2018,Communication Services,M,0,2 to 5 years,Virginia,7740000,"(5000000, 10000000]"
4,GCI,2019,Communication Services,F,1,<= 2 years,Virginia,7740000,"(5000000, 10000000]"


In [5]:
# Load the per-year macro metrics (interest rate, CPI, unemployment, GDP growth)
annual_metrics_path = 'data/Annual_metrics.csv'
annual_metrics_df = pd.read_csv(annual_metrics_path)
annual_metrics_df

Unnamed: 0,Year,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate
0,2015,0.11,0.1,5.3,2.9
1,2016,0.34,1.3,4.9,1.8
2,2017,0.65,2.1,4.4,2.5
3,2018,1.41,2.4,3.9,3.0
4,2019,2.4,1.8,3.7,2.5
5,2020,1.55,1.2,8.1,-2.2
6,2021,0.09,4.7,5.4,5.8
7,2022,0.08,8.0,3.6,1.9
8,2023,4.33,3.2,3.5,2.5


In [6]:
# Attach the annual metrics to every stock row by year (left join keeps all
# stock rows), then index the result by ticker
test_stocks_metrics_df = (
    test_stocks_df
    .merge(annual_metrics_df, on='Year', how='left')
    .set_index('Ticker')
)
test_stocks_metrics_df.head()

Unnamed: 0_level_0,Year,GICS Sector,Gender,CEO Transition,Tenure Bucket,HQ_US_State_or_Country,Salary,Salary_Bucket,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GCI,2015,Communication Services,M,1,<= 2 years,Virginia,7740000,"(5000000, 10000000]",0.11,0.1,5.3,2.9
GCI,2016,Communication Services,M,0,<= 2 years,Virginia,7740000,"(5000000, 10000000]",0.34,1.3,4.9,1.8
GCI,2017,Communication Services,M,0,2 to 5 years,Virginia,7740000,"(5000000, 10000000]",0.65,2.1,4.4,2.5
GCI,2018,Communication Services,M,0,2 to 5 years,Virginia,7740000,"(5000000, 10000000]",1.41,2.4,3.9,3.0
GCI,2019,Communication Services,F,1,<= 2 years,Virginia,7740000,"(5000000, 10000000]",2.4,1.8,3.7,2.5


In [7]:
# Show the merged rows for 2015, the first year in the `years` range used below
is_first_year = test_stocks_metrics_df['Year'].eq(2015)
test_stocks_metrics_df.loc[is_first_year]

Unnamed: 0_level_0,Year,GICS Sector,Gender,CEO Transition,Tenure Bucket,HQ_US_State_or_Country,Salary,Salary_Bucket,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GCI,2015,Communication Services,M,1,<= 2 years,Virginia,7740000,"(5000000, 10000000]",0.11,0.1,5.3,2.9
M,2015,Consumer Discretionary,M,0,10+ years,New York,11100000,"(10000000, 15000000]",0.11,0.1,5.3,2.9
GT,2015,Consumer Discretionary,M,0,10+ years,Ohio,18700000,"(15000000, 20000000]",0.11,0.1,5.3,2.9
GNRC,2015,Industrials,M,0,10+ years,Wisconsin,9500000,"(5000000, 10000000]",0.11,0.1,5.3,2.9
NYT,2015,Communication Services,M,0,2 to 5 years,New York,10280000,"(10000000, 15000000]",0.11,0.1,5.3,2.9
CDW,2015,Information Technology,M,0,5 to 10 years,Illinois,9800000,"(5000000, 10000000]",0.11,0.1,5.3,2.9


In [8]:
# List the columns of the merged stock + annual-metrics frame
test_stocks_metrics_df.columns

Index(['Year', 'GICS Sector', 'Gender', 'CEO Transition', 'Tenure Bucket',
       'HQ_US_State_or_Country', 'Salary', 'Salary_Bucket',
       'Interest Rate (Federal Funds Effective Rate)', 'CPI',
       'UnemploymentRate', 'GDPGrowthRate'],
      dtype='object')

In [9]:
# Collect each ticker once, in order of first appearance, as a one-column frame
ticker_values = test_stocks_df['Ticker'].unique()
unique_tickers_df = pd.DataFrame({"Ticker": ticker_values})
unique_tickers_df

Unnamed: 0,Ticker
0,GCI
1,M
2,GT
3,GNRC
4,NYT
5,CDW


In [10]:
# Download monthly price history from yfinance for each test ticker and
# summarise it into one row per (ticker, year).
# Tickers whose download or processing raises are recorded here.
bad_tickers = []

# Setup years and the column layout of the yearly summary
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
yearly_columns = ['Ticker', 'Year', 'Year_Open', 'Year_High', 'Year_Low', 'Year_Close', 'Year_Volume']
good_years = 0

# Rows are collected as plain dicts and turned into a DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop is quadratic and emits a
# FutureWarning when the starting frame is empty.
yearly_records = []

# Loop through unique tickers (bound outside the try so the except branch can
# always name the failing ticker)
for ticker in unique_tickers_df["Ticker"]:
    try:
        # Extract monthly data and flatten the date index into a column
        stock_data = yf.download(ticker, start="2015-01-01", end="2024-01-01", interval="1mo")
        stock_df = pd.DataFrame(stock_data).reset_index()

        # Loop through all years in dataframe
        for year in years:
            year_stocks = stock_df.loc[stock_df['Date'].dt.year == year].reset_index(drop=True)
            # Years with no price data for this ticker are skipped
            if year_stocks.empty:
                continue

            yearly_records.append({
                "Ticker": ticker,
                "Year": year,
                "Year_Open": year_stocks["Open"].iloc[0],     # open of the first month
                "Year_High": year_stocks["High"].max(),
                "Year_Low": year_stocks["Low"].min(),
                "Year_Close": year_stocks["Close"].iloc[-1],  # close of the last month
                "Year_Volume": year_stocks["Volume"].sum(),
            })
            good_years += 1

    except Exception as e:
        # Best effort: report the failure, remember the ticker and carry on.
        # Any years already recorded for this ticker are kept.
        print(f'Error occured in stock download: {e}')
        bad_tickers.append(ticker)

# Passing the column list keeps the expected layout even if nothing was downloaded
yearly_stocks_df = pd.DataFrame(yearly_records, columns=yearly_columns)

[*********************100%%**********************]  1 of 1 completed
  yearly_stocks_df = pd.concat([yearly_stocks_df, new_year_df], ignore_index=True)
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [11]:
# Calculate stock CAGR
def _stock_annual_growth(row):
    """Return the close/open growth rate for one ticker-year row.

    The CAGR exponent is 1/1 because each row covers a single year.
    """
    growth_ratio = row['Year_Close'] / row['Year_Open']
    return growth_ratio ** (1/1) - 1

yearly_stocks_df['Stock_CAGR'] = yearly_stocks_df.apply(_stock_annual_growth, axis=1)
yearly_stocks_df.head()

Unnamed: 0,Ticker,Year,Year_Open,Year_High,Year_Low,Year_Close,Year_Volume,Stock_CAGR
0,GCI,2015,23.67,25.77,13.96,19.459999,94600800,-0.177862
1,GCI,2016,19.059999,19.889999,13.95,15.99,101618100,-0.16107
2,GCI,2017,16.049999,17.620001,11.87,16.780001,89112900,0.045483
3,GCI,2018,16.799999,19.1,10.88,11.57,111696900,-0.31131
4,GCI,2019,11.43,14.11,5.85,6.38,288853400,-0.44182


In [15]:
# Load the S&P 500 price history used as the benchmark
sp500_path = 'data/SP500.csv'
sp500_df = pd.read_csv(sp500_path)

In [16]:
# Sort chronologically.
# The dates are parsed before sorting so rows are ordered by calendar date and
# not by the lexicographic order of the raw strings. The yearly open/close
# values computed later depend on the first/last row of each year being correct.
sp500_df = (
    sp500_df
    .assign(Date=pd.to_datetime(sp500_df['Date']))
    .sort_values('Date')
    .reset_index(drop=True)
)

In [17]:
# Make sure the Date column holds datetimes so the `.dt` accessor can be used on it
parsed_dates = pd.to_datetime(sp500_df['Date'])
sp500_df['Date'] = parsed_dates

In [18]:
# Calculate S&P 500 open/high/low/close for each year.
# Rows are collected as dicts and the DataFrame is built once at the end.
# Concatenating onto an empty frame inside the loop emits a FutureWarning and is
# quadratic.
sp500_year_records = []

for year in years:
    year_sp500 = sp500_df.loc[sp500_df['Date'].dt.year == year].reset_index(drop=True)
    # Fail with a clear message instead of an opaque IndexError from .iloc[0]
    if year_sp500.empty:
        raise ValueError(f"No S&P 500 rows found for year {year} in sp500_df")

    sp500_year_records.append({
        "Year": year,
        "SP500_Open": year_sp500["Open"].iloc[0],     # first row of the year
        "SP500_High": year_sp500["High"].max(),
        "SP500_Low": year_sp500["Low"].min(),
        "SP500_Close": year_sp500["Close"].iloc[-1],  # last row of the year
    })

yearly_sp500_df = pd.DataFrame(
    sp500_year_records,
    columns=['Year', 'SP500_Open', 'SP500_High', 'SP500_Low', 'SP500_Close'],
)

  yearly_sp500_df = pd.concat([yearly_sp500_df, new_sp500_year_df], ignore_index=True)


In [19]:
# Calculate S&P500 CAGR
def _sp500_annual_growth(row):
    """Return the close/open growth rate of the index for one yearly row.

    The CAGR exponent is 1/1 because each row covers a single year.
    """
    growth_ratio = row['SP500_Close'] / row['SP500_Open']
    return growth_ratio ** (1/1) - 1

yearly_sp500_df['SP500_CAGR'] = yearly_sp500_df.apply(_sp500_annual_growth, axis=1)

In [20]:
# Put each ticker-year row next to the S&P 500 figures for the same year
# (left join keeps every stock row)
test_stock_and_sp500_df = yearly_stocks_df.merge(yearly_sp500_df, on='Year', how='left')

In [21]:
# Preview the combined stock + S&P 500 yearly figures
test_stock_and_sp500_df.head()

Unnamed: 0,Ticker,Year,Year_Open,Year_High,Year_Low,Year_Close,Year_Volume,Stock_CAGR,SP500_Open,SP500_High,SP500_Low,SP500_Close,SP500_CAGR
0,GCI,2015,23.67,25.77,13.96,19.459999,94600800,-0.177862,2058.9,2134.72,1867.01,2043.94,-0.007266
1,GCI,2016,19.059999,19.889999,13.95,15.99,101618100,-0.16107,2038.2,2277.53,1810.1,2238.83,0.098435
2,GCI,2017,16.049999,17.620001,11.87,16.780001,89112900,0.045483,2251.57,2694.97,2245.13,2673.61,0.187443
3,GCI,2018,16.799999,19.1,10.88,11.57,111696900,-0.31131,2683.73,2940.91,2346.58,2506.85,-0.065908
4,GCI,2019,11.43,14.11,5.85,6.38,288853400,-0.44182,2476.96,3247.93,2443.96,3230.78,0.304333


In [22]:
# Flag each ticker-year: 1 when the stock's growth strictly beat the S&P 500's, else 0
def _beat_sp500(row):
    """Return 1 if the row's Stock_CAGR is greater than its SP500_CAGR, otherwise 0."""
    if row['Stock_CAGR'] > row['SP500_CAGR']:
        return 1
    return 0

test_stock_and_sp500_df['Outperformed?'] = test_stock_and_sp500_df.apply(_beat_sp500, axis=1)

test_stock_and_sp500_df.head()

Unnamed: 0,Ticker,Year,Year_Open,Year_High,Year_Low,Year_Close,Year_Volume,Stock_CAGR,SP500_Open,SP500_High,SP500_Low,SP500_Close,SP500_CAGR,Outperformed?
0,GCI,2015,23.67,25.77,13.96,19.459999,94600800,-0.177862,2058.9,2134.72,1867.01,2043.94,-0.007266,0
1,GCI,2016,19.059999,19.889999,13.95,15.99,101618100,-0.16107,2038.2,2277.53,1810.1,2238.83,0.098435,0
2,GCI,2017,16.049999,17.620001,11.87,16.780001,89112900,0.045483,2251.57,2694.97,2245.13,2673.61,0.187443,0
3,GCI,2018,16.799999,19.1,10.88,11.57,111696900,-0.31131,2683.73,2940.91,2346.58,2506.85,-0.065908,0
4,GCI,2019,11.43,14.11,5.85,6.38,288853400,-0.44182,2476.96,3247.93,2443.96,3230.78,0.304333,0


## Preprocessing

In [23]:
# One-hot encode the categorical columns with `pd.get_dummies`, drop the
# 'Gender_M' indicator and the raw 'Salary' column, and move Ticker from the
# index back into a regular column
encoded_test_df = pd.get_dummies(test_stocks_metrics_df, dtype=int)
X_test = encoded_test_df.drop(columns=['Gender_M', 'Salary']).reset_index()
X_test.head()

Unnamed: 0,Ticker,Year,CEO Transition,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate,GICS Sector_Communication Services,GICS Sector_Consumer Discretionary,GICS Sector_Industrials,...,HQ_US_State_or_Country_Ohio,HQ_US_State_or_Country_Virginia,HQ_US_State_or_Country_Wisconsin,"Salary_Bucket_(0, 5000000]","Salary_Bucket_(5000000, 10000000]","Salary_Bucket_(10000000, 15000000]","Salary_Bucket_(15000000, 20000000]","Salary_Bucket_(20000000, 25000000]","Salary_Bucket_(25000000, 50000000]","Salary_Bucket_(50000000, 300000000]"
0,GCI,2015,1,0.11,0.1,5.3,2.9,1,0,0,...,0,1,0,0,1,0,0,0,0,0
1,GCI,2016,0,0.34,1.3,4.9,1.8,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,GCI,2017,0,0.65,2.1,4.4,2.5,1,0,0,...,0,1,0,0,1,0,0,0,0,0
3,GCI,2018,0,1.41,2.4,3.9,3.0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
4,GCI,2019,1,2.4,1.8,3.7,2.5,1,0,0,...,0,1,0,0,1,0,0,0,0,0


In [24]:
# List the encoded feature columns produced from the test data
X_test.columns

Index(['Ticker', 'Year', 'CEO Transition',
       'Interest Rate (Federal Funds Effective Rate)', 'CPI',
       'UnemploymentRate', 'GDPGrowthRate',
       'GICS Sector_Communication Services',
       'GICS Sector_Consumer Discretionary', 'GICS Sector_Industrials',
       'GICS Sector_Information Technology', 'Gender_F',
       'Tenure Bucket_10+ years', 'Tenure Bucket_2 to 5 years',
       'Tenure Bucket_5 to 10 years', 'Tenure Bucket_<= 2 years',
       'HQ_US_State_or_Country_Illinois', 'HQ_US_State_or_Country_New York',
       'HQ_US_State_or_Country_Ohio', 'HQ_US_State_or_Country_Virginia',
       'HQ_US_State_or_Country_Wisconsin', 'Salary_Bucket_(0, 5000000]',
       'Salary_Bucket_(5000000, 10000000]',
       'Salary_Bucket_(10000000, 15000000]',
       'Salary_Bucket_(15000000, 20000000]',
       'Salary_Bucket_(20000000, 25000000]',
       'Salary_Bucket_(25000000, 50000000]',
       'Salary_Bucket_(50000000, 300000000]'],
      dtype='object')

In [25]:
# Add missing columns to match model feature size.
# These indicator columns do not occur in the test data, so they are created
# here with the value 0, one row per year so they can be joined on 'Year'.
# NOTE(review): the names presumably have to match the training features
# exactly, so unusual entries ('Illinois; Pittsburgh', 'Ohio; Detroit', both
# 'UK' and 'United Kingdom') are kept verbatim. Confirm against the training
# notebook.
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
missing_feature_columns = [
       'GICS Sector_Consumer Staples',
       'GICS Sector_Energy', 'GICS Sector_Financials',
       'GICS Sector_Health Care', 'GICS Sector_Materials',
       'GICS Sector_Real Estate', 'GICS Sector_Utilities',
       'HQ_US_State_or_Country_Alabama', 'HQ_US_State_or_Country_Arizona',
       'HQ_US_State_or_Country_Arkansas', 'HQ_US_State_or_Country_Bermuda',
       'HQ_US_State_or_Country_Bristol', 'HQ_US_State_or_Country_California',
       'HQ_US_State_or_Country_Canada', 'HQ_US_State_or_Country_Colorado',
       'HQ_US_State_or_Country_Connecticut', 'HQ_US_State_or_Country_D.C.',
       'HQ_US_State_or_Country_Delaware', 'HQ_US_State_or_Country_Florida',
       'HQ_US_State_or_Country_Georgia', 'HQ_US_State_or_Country_Idaho',
       'HQ_US_State_or_Country_Illinois; Pittsburgh',
       'HQ_US_State_or_Country_Indiana', 'HQ_US_State_or_Country_Iowa',
       'HQ_US_State_or_Country_Ireland', 'HQ_US_State_or_Country_Kentucky',
       'HQ_US_State_or_Country_Louisiana', 'HQ_US_State_or_Country_Maine',
       'HQ_US_State_or_Country_Maryland',
       'HQ_US_State_or_Country_Massachusetts',
       'HQ_US_State_or_Country_Michigan', 'HQ_US_State_or_Country_Minnesota',
       'HQ_US_State_or_Country_Missouri', 'HQ_US_State_or_Country_Montana',
       'HQ_US_State_or_Country_Nebraska', 'HQ_US_State_or_Country_Netherlands',
       'HQ_US_State_or_Country_Nevada', 'HQ_US_State_or_Country_New Jersey',
       'HQ_US_State_or_Country_North Carolina',
       'HQ_US_State_or_Country_Ohio; Detroit',
       'HQ_US_State_or_Country_Oklahoma', 'HQ_US_State_or_Country_Oregon',
       'HQ_US_State_or_Country_Pennsylvania',
       'HQ_US_State_or_Country_Rhode Island',
       'HQ_US_State_or_Country_Switzerland',
       'HQ_US_State_or_Country_Tennessee', 'HQ_US_State_or_Country_Texas',
       'HQ_US_State_or_Country_UK', 'HQ_US_State_or_Country_United Kingdom',
       'HQ_US_State_or_Country_Utah',
       'HQ_US_State_or_Country_Washington']

# Build the frame already filled with integer zeros. Filling an empty
# object-dtype frame with fillna(..., inplace=True) relies on deprecated
# object downcasting.
missing_cols_df = pd.DataFrame(
    0,
    index=range(len(years)),
    columns=['Year'] + missing_feature_columns,
)
missing_cols_df['Year'] = years

missing_cols_df

Unnamed: 0,Year,GICS Sector_Consumer Staples,GICS Sector_Energy,GICS Sector_Financials,GICS Sector_Health Care,GICS Sector_Materials,GICS Sector_Real Estate,GICS Sector_Utilities,HQ_US_State_or_Country_Alabama,HQ_US_State_or_Country_Arizona,...,HQ_US_State_or_Country_Oregon,HQ_US_State_or_Country_Pennsylvania,HQ_US_State_or_Country_Rhode Island,HQ_US_State_or_Country_Switzerland,HQ_US_State_or_Country_Tennessee,HQ_US_State_or_Country_Texas,HQ_US_State_or_Country_UK,HQ_US_State_or_Country_United Kingdom,HQ_US_State_or_Country_Utah,HQ_US_State_or_Country_Washington
0,2015,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2016,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2017,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2019,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2023,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Attach the zero-filled placeholder features to every test row by year, then
# restore Ticker as the index.
# NOTE(review): the placeholder columns are appended after the existing ones.
# The model presumably expects features in the same order as during training;
# confirm the resulting column order against the training notebook.
X_test_padded = X_test.merge(missing_cols_df, on='Year', how='left')
X_test = X_test_padded.set_index('Ticker')

In [27]:
# Preview the test features after the placeholder columns were added
X_test.head()

Unnamed: 0_level_0,Year,CEO Transition,Interest Rate (Federal Funds Effective Rate),CPI,UnemploymentRate,GDPGrowthRate,GICS Sector_Communication Services,GICS Sector_Consumer Discretionary,GICS Sector_Industrials,GICS Sector_Information Technology,...,HQ_US_State_or_Country_Oregon,HQ_US_State_or_Country_Pennsylvania,HQ_US_State_or_Country_Rhode Island,HQ_US_State_or_Country_Switzerland,HQ_US_State_or_Country_Tennessee,HQ_US_State_or_Country_Texas,HQ_US_State_or_Country_UK,HQ_US_State_or_Country_United Kingdom,HQ_US_State_or_Country_Utah,HQ_US_State_or_Country_Washington
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCI,2015,1,0.11,0.1,5.3,2.9,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCI,2016,0,0.34,1.3,4.9,1.8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCI,2017,0,0.65,2.1,4.4,2.5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCI,2018,0,1.41,2.4,3.9,3.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCI,2019,1,2.4,1.8,3.7,2.5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# View test data shape as (rows, feature columns)
# NOTE(review): the column count presumably has to equal the model's input width — confirm
X_test.shape

(54, 78)

In [29]:
# Standardize the test features.
# NOTE(review): the scaler is fitted on the test features themselves. The model
# was presumably trained on data scaled with the training-set statistics, so
# the scaler fitted during training should be reused here if one was saved
# (joblib is imported but unused) — confirm.
# `fit` returns the estimator itself, so both names refer to the same fitted scaler.
X_scaler = scaler = StandardScaler()
X_test_scaled = scaler.fit(X_test).transform(X_test)

In [30]:
# Restore the saved Keras model from disk and print its layer summary
model_path = "best_optimized_model.keras"
loaded_model = load_model(model_path)
loaded_model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


## Load the model and make predictions

In [31]:
# Run the loaded model on the scaled test features
predictions = loaded_model.predict(X_test_scaled)

# Turn the model outputs into binary labels: 1 when strictly above the threshold
decision_threshold = 0.5
predicted_classes = (predictions > decision_threshold).astype("int32")

# Show the first ten predicted labels
print(predicted_classes[:10])

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[[0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]]


In [32]:
# Work on an independent copy so the merged test frame itself is left unchanged
test_results = test_stocks_metrics_df.copy(deep=True)

In [33]:
# Add a column with the prediction results, then move Ticker from the index
# back into a regular column so it can be used as a merge key
test_results = (
    test_results
    .assign(Predictions=predicted_classes)
    .reset_index()
)

In [34]:
# Join the predictions with the actual yearly outcomes per (Year, Ticker).
# The left join keeps every predicted row, even if no outcome row matches.
merged_results = test_results.merge(test_stock_and_sp500_df, on=['Year', 'Ticker'], how='left')
predictions_outcomes_df = merged_results.set_index('Ticker')

In [35]:
# List the columns available after joining predictions with actual outcomes
predictions_outcomes_df.columns

Index(['Year', 'GICS Sector', 'Gender', 'CEO Transition', 'Tenure Bucket',
       'HQ_US_State_or_Country', 'Salary', 'Salary_Bucket',
       'Interest Rate (Federal Funds Effective Rate)', 'CPI',
       'UnemploymentRate', 'GDPGrowthRate', 'Predictions', 'Year_Open',
       'Year_High', 'Year_Low', 'Year_Close', 'Year_Volume', 'Stock_CAGR',
       'SP500_Open', 'SP500_High', 'SP500_Low', 'SP500_Close', 'SP500_CAGR',
       'Outperformed?'],
      dtype='object')

In [36]:
# Keep only the identifying columns plus the predicted and actual labels
final_columns = ['Year', 'GICS Sector', 'Gender', 'Predictions', 'Outperformed?']
predictions_outcomes_df = predictions_outcomes_df.loc[:, final_columns]
predictions_outcomes_df.head()

Unnamed: 0_level_0,Year,GICS Sector,Gender,Predictions,Outperformed?
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCI,2015,Communication Services,M,0,0
GCI,2016,Communication Services,M,1,0
GCI,2017,Communication Services,M,1,0
GCI,2018,Communication Services,M,1,0
GCI,2019,Communication Services,F,0,0


## Model results

In [37]:
# Run confusion matrix and classification report.
# The label order is pinned to [0, 1] so the 2x2 confusion matrix and the two
# target names stay valid even when one class is missing from the actual or
# predicted values.
class_labels = [0, 1]
actual_outcomes = predictions_outcomes_df['Outperformed?']
predicted_outcomes = predictions_outcomes_df['Predictions']

# Rows are actual classes, columns are predicted classes
cm = confusion_matrix(actual_outcomes, predicted_outcomes, labels=class_labels)
cm_df = pd.DataFrame(cm, index=["Actual < SP500", "Actual > SP500"], columns=["Predicted < SP500", "Predicted > SP500"])
print("----------------------------------------------")
print("Confusion Matrix: NN model test")
display(cm_df)

target_names = ["Less than SP500 CAGR", "Outperformed SP500 CAGR"]

report_data = []
report = classification_report(
    actual_outcomes,
    predicted_outcomes,
    labels=class_labels,
    output_dict=True,
    target_names=target_names,
)
results = {
    "Less than SP500 CAGR": report["Less than SP500 CAGR"],
    "Outperformed SP500 CAGR": report["Outperformed SP500 CAGR"],
    "accuracy": report["accuracy"],
    "macro avg": report["macro avg"],
    "weighted avg": report["weighted avg"]
}
# Save report results and display as a dataframe
report_data.append({**results})
results_df = pd.DataFrame(results).transpose().round(3)
# Cast to object so the accuracy row can hold empty strings next to numbers
results_df = results_df.astype(object)
# Accuracy is a single number: show it under 'f1-score' only and blank the other cells
results_df.loc["accuracy", results_df.columns != 'f1-score'] = ""
print("Classification report: NN Model test")
display(results_df)

----------------------------------------------
Confusion Matrix: NN model test


Unnamed: 0,Predicted < SP500,Predicted > SP500
Actual < SP500,11,16
Actual > SP500,11,16


Classification report: NN Model test


Unnamed: 0,precision,recall,f1-score,support
Less than SP500 CAGR,0.5,0.407,0.449,27.0
Outperformed SP500 CAGR,0.5,0.593,0.542,27.0
accuracy,,,0.5,
macro avg,0.5,0.5,0.496,54.0
weighted avg,0.5,0.5,0.496,54.0
