# Smart_Stock_ML: Stock Clustering

In [1]:
# Import dependencies
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import hvplot.pandas

## Import company data

In [2]:
#  Import clean company dataset
company_df = pd.read_csv('data/company_clean.csv')
company_df.head()

Unnamed: 0,Ticker,Year,GICS Sector,GICS Sub-Industry,Founded,Gender,CEO Transition,Tenure Bucket,HQ_US_State_or_Country,Salary_Bucket
0,A,2015,Health Care,Life Sciences Tools & Services,1999,M,1,<= 2 years,California,"(15000000, 20000000]"
1,A,2016,Health Care,Life Sciences Tools & Services,1999,M,0,<= 2 years,California,"(15000000, 20000000]"
2,A,2017,Health Care,Life Sciences Tools & Services,1999,M,0,<= 2 years,California,"(15000000, 20000000]"
3,A,2018,Health Care,Life Sciences Tools & Services,1999,M,0,2 to 5 years,California,"(15000000, 20000000]"
4,A,2019,Health Care,Life Sciences Tools & Services,1999,M,0,2 to 5 years,California,"(15000000, 20000000]"


In [3]:
# View columns
company_df.columns

Index(['Ticker', 'Year', 'GICS Sector', 'GICS Sub-Industry', 'Founded',
       'Gender', 'CEO Transition', 'Tenure Bucket', 'HQ_US_State_or_Country',
       'Salary_Bucket'],
      dtype='object')

## Extract stock data

In [4]:
# Create unique tickers list
unique_tickers_df = pd.DataFrame(company_df['Ticker'].unique(), columns=["Ticker"])

In [5]:
## Extract full dataset for all tickers on daily interval for date range 2015 - 2024
# Collect a list of failed stock downloads
# Setup years and dataframe
bad_tickers = []
mean_stocks_df = pd.DataFrame()

# Iterate through unique tickers and extract daily data, concatenate to master dataframe
for index,row in unique_tickers_df.iterrows():
    try:
        ticker = row["Ticker"]
        stock_data = yf.download(ticker, start="2015-01-01", end="2024-01-01", interval="1d")
        stock_df = pd.DataFrame(stock_data).reset_index()
        stock_df['Ticker'] = ticker
        stock_df['Year'] =  stock_df["Date"].iloc[0].year
        mean_stocks_df = pd.concat([mean_stocks_df, stock_df], ignore_index=True)
        
    except Exception as e:
        print(f'Error occured in stock download: {e}')
        bad_tickers.append(ticker)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [6]:
# Print bad tickers
bad_tickers

[]

In [7]:
# View dataset
mean_stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1097023 entries, 0 to 1097022
Data columns (total 9 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   Date       1097023 non-null  datetime64[ns]
 1   Open       1097023 non-null  float64       
 2   High       1097023 non-null  float64       
 3   Low        1097023 non-null  float64       
 4   Close      1097023 non-null  float64       
 5   Adj Close  1097023 non-null  float64       
 6   Volume     1097023 non-null  int64         
 7   Ticker     1097023 non-null  object        
 8   Year       1097023 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(2), object(1)
memory usage: 75.3+ MB


## Calculate metrics

In [None]:
# Calculate mean values on full annual data
mean_stocks_df['Year'] = mean_stocks_df['Date'].dt.year
# Group data by ticker and year
grouped_df = mean_stocks_df.groupby(['Ticker', 'Year'])
mean_values = grouped_df[['Open', 'High', 'Low', 'Close', 'Volume']].mean().rename(
    columns={'Open': 'MeanOpen', 'High': 'MeanHigh', 'Low': 'MeanLow', 'Close': 'MeanClose', 'Volume': 'MeanVolume'}
)

# Calculate annual return and annual variance on grouped yearly data
annual_return = grouped_df['Close'].apply(lambda x: (x.iloc[-1] - x.iloc[0]) / x.iloc[0])
annual_variance = grouped_df['Close'].var()

# Create final mean_stats dataframe
mean_stats_df = mean_values.copy()
mean_stats_df['AnnualReturn'] = annual_return
mean_stats_df['AnnualVariance'] = annual_variance

In [None]:
# Display results
mean_stats_df

In [None]:
# Reset index
mean_stats_df = mean_stats_df.reset_index().set_index('Ticker')
mean_stats_df

## Unsupervised learning: K-Means

### Preprocessing

In [None]:
# Scale numeric values
mean_scaled = StandardScaler().fit_transform(
    mean_stats_df[['Year','MeanOpen', 'MeanHigh', 'MeanLow', 'MeanClose', 'MeanVolume',
       'AnnualReturn', 'AnnualVariance']]
)

In [None]:
# Create a DataFrame with the scaled data
mean_scaled_df = pd.DataFrame(
    mean_scaled,
    columns=['Year', 'MeanOpen', 'MeanHigh', 'MeanLow', 'MeanClose', 'MeanVolume',
       'AnnualReturn', 'AnnualVariance']
)

# Copy the tickers names from the original data
mean_scaled_df["Ticker"] = mean_stats_df.index
# Set the Ticker column as index
mean_scaled_df = mean_scaled_df.set_index("Ticker")

# Display the scaled data
mean_scaled_df

In [None]:
# Select catagorical variables of interest
catagorical_variables = company_df[['Ticker', 'GICS Sector',
       'HQ_US_State_or_Country', 'Gender']]

# Set ticker as index
catagorical_variables = catagorical_variables.set_index("Ticker")
catagorical_variables

In [None]:
# Encode catagorical variables
stock_dummies = pd.get_dummies(catagorical_variables, dtype=int)
stock_dummies

In [None]:
# Concatenate the dummy variables with the scaled data DataFrame
sp500_clustering_df = pd.merge(mean_scaled_df, stock_dummies, on='Ticker', how="left").drop('Gender_M', axis=1)

# Display data sample
sp500_clustering_df = sp500_clustering_df.dropna()
sp500_clustering_df

### Apply the Elbow method

In [None]:
# Create a list to store inertia values
inertia = []

# Create a list to store the values of k
k_values = list(range(1, 11))

# Create a for-loop where each value of k and evaluate using the K-means algorithm
for k in k_values:
    k_model = KMeans(n_clusters=k, random_state=78)
    k_model.fit(sp500_clustering_df)
    inertia.append(k_model.inertia_)

In [None]:
# Create a Dictionary that holds the list values for k and inertia
elbow_data = {"k": k_values, "inertia": inertia}

# Create a DataFrame using the elbow_data Dictionary
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

In [None]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)


### Create Clusters

In [None]:
# Initialize the K-Means model with 5 clusters
model = KMeans(n_clusters=5)

In [None]:
# Fit the model sp500_scaled_df DataFrame
model.fit(sp500_clustering_df)

In [None]:
# Predict the model segments
stock_clusters = model.predict(sp500_clustering_df)

# View the stock segments
stock_clusters[0:20]

In [None]:
# Create a new column in the DataFrame with the predicted clusters
sp500_clustering_df["StockCluster"] = stock_clusters

# Review the DataFrame
sp500_clustering_df.head()

### View Clusters

In [None]:
# Create a scatter plot - Annual Return vs Annual Variance - Color by Stock Cluster
sp500_clustering_df.hvplot.scatter(
    x="AnnualVariance",
    y="AnnualReturn",
    by="StockCluster",
    hover_cols = ["Ticker"], 
    title = "Stocks by Cluster - Annual Return vs Annual Variance"
)

In [None]:
# Create a scatter plot - Annual Return vs Annual Variance - Color by CEO Gender
sp500_clustering_df.hvplot.scatter(
    x="AnnualVariance",
    y="AnnualReturn",
    by="Gender_F",
    hover_cols = ["Ticker"], 
    title = "Stocks by CEO Gender - Annual Return vs Annual Variance"
)