In [3]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [4]:
# Import the top 25 sp500 stocks dataset into a Pandas Dataframe
stocks_df = pd.read_csv(
    Path("../Resources/merged_top25_sp500_stock_data.csv"), 
    index_col='symbol', 
    infer_datetime_format=True, 
    parse_dates=True
)

# stocks_df = stocks_df.sort_values(by=['date'], ascending = True)

# drop un-necessary columns
stocks_df = stocks_df.loc[:, ~stocks_df.columns.str.contains('^Unnamed')]
stocks_df.drop(['date', 'high', 'low', 'close', 'volume', 'date_utc'], axis=1, inplace=True)

stocks_df['adjclose'] = stocks_df['adjclose'].pct_change()
stocks_df['open'] = stocks_df['open'].pct_change()
stocks_df = stocks_df.dropna()

# Review the DataFrame
stocks_df.info()
stocks_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 62882 entries, AAPL to XOM
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   open      62882 non-null  float64
 1   adjclose  62882 non-null  float64
dtypes: float64(2)
memory usage: 1.4+ MB


Unnamed: 0_level_0,open,adjclose
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPL,0.005176,0.002194
AAPL,-0.003745,-0.033388
AAPL,-0.020677,0.01359
AAPL,0.011516,-0.002793
AAPL,-0.027989,-0.038095


In [5]:
# Plot your data to see what's in your DataFrame
stocks_df.hvplot.line(
    width=800,
    height=400,
    rot=90
)

In [6]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
scaled_data = StandardScaler().fit_transform(stocks_df)

In [7]:
# Create a DataFrame with the scaled data
df_market_data_scaled = pd.DataFrame(
    scaled_data,
    columns=stocks_df.columns
)

# Copy the crypto names from the original data
df_market_data_scaled["symbol"] = stocks_df.index

# Set the coinid column as index
df_market_data_scaled = df_market_data_scaled.set_index("symbol")

# Display sample data
df_market_data_scaled.head()

Unnamed: 0_level_0,open,adjclose
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPL,0.193942,0.065112
AAPL,-0.180872,-1.40161
AAPL,-0.892178,0.534856
AAPL,0.460286,-0.140475
AAPL,-1.19936,-1.595642


In [8]:
# Create a list with the number of k-values to try
# Use a range from 1 to 11
k = list(range(1,11))

# Create an empy list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
for i in k: 
    k_model = KMeans(n_clusters=i)
    k_model.fit(df_market_data_scaled)
    inertia.append(k_model.inertia_)

# Create a dictionary with the data to plot the Elbow curve
elbow_data = {'k' : k, 'inertia' : inertia}

# Create a DataFrame with the data to plot the Elbow curve
elbow_df = pd.DataFrame(elbow_data)

# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_df_1_plot = elbow_df.hvplot.line(
    x = 'k', 
    y = 'inertia',
    title = 'Elbow Curve'
)
elbow_df_1_plot

In [9]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=4)

# Fit the K-Means model using the scaled data
model.fit(stocks_df)

# Predict the clusters to group the cryptocurrencies using the scaled data
k_4 = model.predict(stocks_df)

# View the resulting array of cluster values.
k_4

array([0, 3, 0, ..., 0, 0, 2])

In [10]:
# Create a copy of the DataFrame
market_data_predictions = stocks_df.copy()

In [11]:
# Add a new column to the DataFrame with the predicted clusters
market_data_predictions['market_clusters'] = k_4

# Display sample data
display(market_data_predictions.head())
display(market_data_predictions.tail())

Unnamed: 0_level_0,open,adjclose,market_clusters
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,0.005176,0.002194,0
AAPL,-0.003745,-0.033388,3
AAPL,-0.020677,0.01359,0
AAPL,0.011516,-0.002793,0
AAPL,-0.027989,-0.038095,3


Unnamed: 0_level_0,open,adjclose,market_clusters
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XOM,0.011584,0.00699,0
XOM,0.007098,-0.006754,0
XOM,-0.00141,0.0119,0
XOM,0.020233,0.003827,0
XOM,0.009685,0.029289,2


In [13]:
# Created a scatter plot using hvPlot by grouping stocks based on open & close prices 
# Colored the graph points with the labels found using K-Means and added the stock symbol 
# in the `hover_cols` parameter to identify the stocks represented by each data point.

market_data_predictions_plot = market_data_predictions.hvplot.scatter(
    title = 'KMeans Top 25 Stocks in SP500 Cluster',
    x = 'open',
    y = 'adjclose', 
    hover_cols = 'symbol',
    by = 'market_clusters'
)
market_data_predictions_plot