### Using the K-Means algorithm to group the top 25 stocks in the S&P 500 into clusters based on their adjusted close prices

In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [14]:
# Load the top-25 S&P 500 stock dataset and reshape it into a wide DataFrame:
# one row per date, one column per ticker symbol, values = adjusted close.
csv_path = "../Resources/merged_top25_sp500_stock_data.csv"

# Only `date`, `symbol` and `adjclose` are needed for the pivot below, so read
# just those columns instead of loading everything and dropping the rest by
# name. `parse_dates=True` parses the index column; the deprecated
# `infer_datetime_format` flag is omitted (pandas >= 2.0 infers a consistent
# date format on its own and warns when the flag is passed).
stocks_df = pd.read_csv(
    csv_path,
    index_col='date',
    usecols=['date', 'symbol', 'adjclose'],
    parse_dates=True,
)

# The long-format file repeats each date once per symbol; pivot on the
# existing date index so every ticker becomes its own column.
stocks_df = stocks_df.pivot(columns='symbol', values='adjclose')

# Drop na values: keep only the dates on which every ticker has a price
stocks_df = stocks_df.dropna()

stocks_df

symbol,AAPL,ABBV,AMZN,BAC,BRK-B,CVX,GOOG,GOOGL,HD,JNJ,...,MRK,MSFT,NVDA,PEP,PFE,PG,TSLA,UNH,V,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,16.89,23.33,12.87,10.28,93.20,73.56,18.01,18.10,51.06,54.31,...,28.91,22.84,2.94,52.34,17.33,51.76,2.36,46.91,36.31,58.55
2013-01-03,16.68,23.13,12.92,10.22,93.62,73.25,18.02,18.11,50.91,54.23,...,29.61,22.53,2.94,52.36,17.29,51.43,2.32,44.72,36.34,58.45
2013-01-04,16.21,22.84,12.96,10.35,93.85,73.64,18.38,18.47,50.82,54.85,...,29.35,22.11,3.04,52.44,17.37,51.53,2.29,44.80,36.63,58.72
2013-01-07,16.12,22.89,13.42,10.33,93.45,73.14,18.30,18.39,50.54,54.74,...,29.46,22.07,2.95,52.43,17.38,51.18,2.29,44.80,36.89,58.04
2013-01-08,16.16,22.39,13.32,10.24,93.81,72.81,18.26,18.35,50.85,54.74,...,29.50,21.95,2.88,52.59,17.41,51.10,2.25,44.21,37.24,58.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-24,149.45,150.89,119.82,35.07,287.48,173.13,102.97,102.52,283.26,170.98,...,97.37,247.25,125.99,177.68,45.54,129.37,211.25,541.60,190.71,106.60
2022-10-25,152.34,149.82,120.60,35.39,289.24,174.93,104.93,104.48,290.26,170.71,...,97.71,250.66,132.61,178.27,45.59,130.86,222.42,540.22,194.38,105.88
2022-10-26,149.35,152.51,115.66,35.70,288.52,177.09,94.82,94.93,290.15,172.21,...,98.41,231.32,128.96,179.07,46.06,131.78,224.64,543.17,203.33,107.14
2022-10-27,144.80,153.50,110.96,35.87,289.88,177.90,92.60,92.22,291.06,172.31,...,99.74,226.75,131.76,178.88,45.74,131.88,225.09,541.80,204.29,107.55


In [15]:
# Quick visual check of the pivoted frame: one line per ticker column.
line_plot_options = dict(
    title='Performance of Top 25 SP500 Stocks over 10 years',
    width=800,
    height=400,
    rot=90,  # rotate the x-axis tick labels
)
stocks_df.hvplot.line(**line_plot_options)

In [8]:
# Normalize every ticker column of the price DataFrame with scikit-learn's
# `StandardScaler`; the result is a plain NumPy array (no index/column labels).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(stocks_df)

In [36]:
# Create a DataFrame with the scaled data, reusing the ticker columns and the
# date index of the unscaled frame. Attaching the index at construction keeps
# every scaled row paired with its date; assigning the dates as a column after
# a `dropna()` would fail with a length mismatch if any row had been dropped.
df_market_data_scaled = pd.DataFrame(
    scaled_data,
    index=stocks_df.index,
    columns=stocks_df.columns,
)

# Drop na values (done after the dates are attached, so rows and dates stay aligned)
df_market_data_scaled = df_market_data_scaled.dropna()

# Display sample data
df_market_data_scaled.head()

symbol,AAPL,ABBV,AMZN,BAC,BRK-B,CVX,GOOG,GOOGL,HD,JNJ,...,MRK,MSFT,NVDA,PEP,PFE,PG,TSLA,UNH,V,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,-0.861326,-1.381282,-1.107253,-1.265327,-1.530469,-0.820373,-1.134281,-1.154803,-1.286711,-1.765531,...,-1.618057,-1.007463,-0.805925,-1.545565,-1.40677,-1.233179,-0.668941,-1.208627,-1.317474,-0.223336
2013-01-03,-0.865661,-1.387419,-1.106324,-1.271451,-1.523516,-0.834205,-1.133998,-1.154515,-1.288392,-1.767938,...,-1.575823,-1.010892,-0.805925,-1.544953,-1.411239,-1.24396,-0.669335,-1.22472,-1.317009,-0.231901
2013-01-04,-0.875364,-1.396319,-1.105581,-1.258181,-1.519708,-0.816804,-1.123794,-1.144157,-1.289401,-1.749284,...,-1.59151,-1.015536,-0.804555,-1.542506,-1.402301,-1.240693,-0.669631,-1.224132,-1.312512,-0.208777
2013-01-07,-0.877222,-1.394784,-1.097033,-1.260223,-1.52633,-0.839113,-1.126062,-1.146459,-1.292539,-1.752594,...,-1.584873,-1.015979,-0.805788,-1.542812,-1.401184,-1.252128,-0.669631,-1.224132,-1.30848,-0.267015
2013-01-08,-0.876397,-1.410128,-1.098891,-1.26941,-1.52037,-0.853837,-1.127195,-1.14761,-1.289065,-1.752594,...,-1.58246,-1.017306,-0.806747,-1.537917,-1.397832,-1.254742,-0.670025,-1.228468,-1.303053,-0.236183


In [37]:
# Create a list with the k-values to try: 1 through 10
# (the upper bound of range() is exclusive)
k = list(range(1, 11))

# Create an empty list to store the inertia values
inertia = []

# Compute the inertia for each candidate k on the scaled data.
# `random_state` pins the centroid initialisation and `n_init` is set
# explicitly, so the curve is reproducible across runs and does not depend on
# the installed scikit-learn version's default.
for i in k:
    k_model = KMeans(n_clusters=i, n_init=10, random_state=1)
    k_model.fit(df_market_data_scaled)
    inertia.append(k_model.inertia_)

# Create a dictionary with the data to plot the Elbow curve
elbow_data = {'k': k, 'inertia': inertia}

# Create a DataFrame with the data to plot the Elbow curve
elbow_df = pd.DataFrame(elbow_data)

# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
elbow_df_1_plot = elbow_df.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Curve'
)
elbow_df_1_plot

In [38]:
# Initialize the K-Means model using the value of k chosen from the elbow curve.
# `random_state` and an explicit `n_init` make the cluster labels reproducible.
model = KMeans(n_clusters=3, n_init=10, random_state=1)

# Fit the model on the *scaled* data and get the cluster label of every row.
# The elbow analysis was run on `df_market_data_scaled`, so the final model
# must use the same input; fitting on the raw prices in `stocks_df` would let
# the highest-priced tickers dominate the distance computation.
k_3 = model.fit_predict(df_market_data_scaled)

# View the resulting array of cluster values.
k_3

array([1, 1, 1, ..., 2, 2, 2])

In [39]:
# Work on an independent (deep) copy so that columns added to the predictions
# frame leave `df_market_data_scaled` untouched.
market_data_predictions = df_market_data_scaled.copy(deep=True)

In [40]:
# Append the K-Means labels as a `market_clusters` column (one label per row).
market_data_predictions = market_data_predictions.assign(market_clusters=k_3)

# Show the first and last rows of the labelled frame.
display(market_data_predictions.head(), market_data_predictions.tail())

symbol,AAPL,ABBV,AMZN,BAC,BRK-B,CVX,GOOG,GOOGL,HD,JNJ,...,MSFT,NVDA,PEP,PFE,PG,TSLA,UNH,V,XOM,market_clusters
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,-0.861326,-1.381282,-1.107253,-1.265327,-1.530469,-0.820373,-1.134281,-1.154803,-1.286711,-1.765531,...,-1.007463,-0.805925,-1.545565,-1.40677,-1.233179,-0.668941,-1.208627,-1.317474,-0.223336,1
2013-01-03,-0.865661,-1.387419,-1.106324,-1.271451,-1.523516,-0.834205,-1.133998,-1.154515,-1.288392,-1.767938,...,-1.010892,-0.805925,-1.544953,-1.411239,-1.24396,-0.669335,-1.22472,-1.317009,-0.231901,1
2013-01-04,-0.875364,-1.396319,-1.105581,-1.258181,-1.519708,-0.816804,-1.123794,-1.144157,-1.289401,-1.749284,...,-1.015536,-0.804555,-1.542506,-1.402301,-1.240693,-0.669631,-1.224132,-1.312512,-0.208777,1
2013-01-07,-0.877222,-1.394784,-1.097033,-1.260223,-1.52633,-0.839113,-1.126062,-1.146459,-1.292539,-1.752594,...,-1.015979,-0.805788,-1.542812,-1.401184,-1.252128,-0.669631,-1.224132,-1.30848,-0.267015,1
2013-01-08,-0.876397,-1.410128,-1.098891,-1.26941,-1.52037,-0.853837,-1.127195,-1.14761,-1.289065,-1.752594,...,-1.017306,-0.806747,-1.537917,-1.397832,-1.254742,-0.670025,-1.228468,-1.303053,-0.236183,1


symbol,AAPL,ABBV,AMZN,BAC,BRK-B,CVX,GOOG,GOOGL,HD,JNJ,...,MSFT,NVDA,PEP,PFE,PG,TSLA,UNH,V,XOM,market_clusters
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-24,1.875294,2.533212,0.880206,1.265133,1.68578,3.622272,1.273856,1.274087,1.315739,1.744823,...,1.474251,0.879607,2.2888,1.744893,1.302468,1.390664,2.426695,1.076667,3.891854,2
2022-10-25,1.934956,2.500377,0.894701,1.297797,1.714916,3.702585,1.329411,1.330479,1.394194,1.7367,...,1.511961,0.970287,2.306849,1.750479,1.351149,1.500797,2.416553,1.133574,3.830191,2
2022-10-26,1.873229,2.582926,0.8029,1.329441,1.702997,3.798961,1.04285,1.055711,1.392961,1.781831,...,1.298083,0.92029,2.331322,1.802988,1.381207,1.522686,2.438232,1.272353,3.938102,2
2022-10-27,1.779297,2.613307,0.71556,1.346794,1.725511,3.835101,0.979925,0.977741,1.40316,1.78484,...,1.247544,0.958644,2.32551,1.767237,1.384474,1.527123,2.428164,1.287239,3.973216,2
2022-10-28,2.005147,2.432557,0.575258,1.378437,1.88692,3.927908,1.092736,1.094841,1.488227,1.861865,...,1.348401,1.048777,2.427992,1.956046,1.493597,1.560942,2.497536,1.365545,4.242995,2


In [56]:
# Heatmap of the labelled frame (scaled adjclose values plus `market_clusters`),
# grouped by the K-Means label through `by`, with `symbol` requested as a
# hover column.
# NOTE(review): the title says "Correlation", but the frame is passed to hvPlot
# as-is and no `.corr()` is computed here - confirm this is the intended chart.
# NOTE(review): `symbol` is the name of the column axis, not a data column -
# confirm hvPlot resolves it as a hover field.
heatmap_options = dict(
    title='KMeans Top 25 Stocks in SP500 Correlation Chart',
    hover_cols='symbol',
    by='market_clusters',
    height=450,
    width=1000,
    rot=90,
)

market_data_predictions_plot = market_data_predictions.hvplot.heatmap(**heatmap_options)
market_data_predictions_plot

In [64]:
# Ticker columns to plot: every column of the predictions frame except the
# cluster label. Deriving the list from the DataFrame keeps it in sync with the
# data instead of relying on a hand-typed list of 25 symbols.
columns = [
    column
    for column in market_data_predictions.columns
    if column != 'market_clusters'
]

# Histogram of the scaled adjclose values, one series per ticker.
# Note: this rebinds `market_data_predictions_plot`, replacing the heatmap object.
market_data_predictions_plot = market_data_predictions.hvplot.hist(
    title='KMeans Top 25 Stocks in SP500 Histogram Chart',
    hover_cols='symbol',
    y=columns,
    height=450,
    width=1000,
    rot=90
)
market_data_predictions_plot