K-Means Clustering with Yahoo Finance

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Download roughly five years of daily AAPL price data from Yahoo Finance.
# yfinance documents `end` as exclusive, so the last row returned is the final
# trading day strictly before end_date.
start_date = '2018-10-23'
end_date = '2023-10-23'

# auto_adjust is pinned to False on purpose: newer yfinance releases default it
# to True, which folds the adjustment into 'Close' and drops the 'Adj Close'
# column that the feature-engineering cell depends on.
aapl = yf.download('AAPL', start = start_date, end = end_date, auto_adjust = False)
display(aapl)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-23,53.957500,55.812500,53.674999,55.682499,53.057514,155071200
2018-10-24,55.650002,56.057499,53.634998,53.772499,51.237541,163702000
2018-10-25,54.427502,55.345001,54.187500,54.950001,52.359531,119423200
2018-10-26,53.974998,55.047501,53.167500,54.075001,51.525780,189033600
2018-10-29,54.797501,54.922501,51.522499,53.060001,50.558636,183742000
...,...,...,...,...,...,...
2023-10-16,176.750000,179.080002,176.509995,178.720001,177.810104,52517000
2023-10-17,176.649994,178.419998,174.800003,177.149994,176.248108,57549400
2023-10-18,175.580002,177.580002,175.110001,175.839996,174.944778,54764400
2023-10-19,176.039993,177.839996,175.190002,175.460007,174.566727,59302900


In [3]:
# Build the clustering features on an independent copy of the download.
# pd.DataFrame(aapl) does not copy DataFrame input by default, so an explicit
# .copy() is used to keep the raw `aapl` frame untouched and to make this cell
# safe to re-run.
df = aapl.copy()

# Adjusted close relative to its trailing 30-row mean (> 1 means the price is
# above its recent average). The first 29 rows are NaN until the window fills.
df['Close/30Day_MA'] = df['Adj Close'] / df['Adj Close'].rolling(window = 30).mean()

# Forward-looking return: pct_change(5) measures the change over the previous
# 5 rows, and shift(-5) moves it back so each row holds the return over the
# NEXT 5 rows. This uses future prices, and the last 5 rows are NaN.
df['Future_5day_Percent_Change'] = df['Adj Close'].pct_change(periods = 5).shift(-5)

# Standard deviation of one-row returns over a trailing 30-row window.
# The first 30 rows are NaN (one for pct_change plus 29 to fill the window).
df['30Day_Rolling_Volatility'] = df['Adj Close'].pct_change().rolling(window = 30).std()

In [4]:
# Keep only fully populated rows: any row with at least one missing value
# (e.g. where a rolling window or the forward shift had no data) is discarded.
df = df.dropna(axis = 0, how = 'any')

In [5]:
# Feature matrix for clustering: the three engineered columns, in this order.
feature_cols = [
    'Close/30Day_MA',
    'Future_5day_Percent_Change',
    '30Day_Rolling_Volatility',
]
X = df[feature_cols]

In [6]:
# Put every feature on a comparable scale before clustering: learn the
# per-column mean and standard deviation from X, then apply them to X.
# The fitted scaler is kept so results can be mapped back later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Partition the standardized observations into three clusters and store each
# row's cluster label on the frame.
# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# in scikit-learn 1.4 (with a FutureWarning in 1.2/1.3), so relying on the
# default would give different centroids depending on the installed version.
# random_state fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters = 3, n_init = 10, random_state = 0)
df['Cluster'] = kmeans.fit_predict(X_scaled)

found 0 physical cores < 1
  File "c:\Users\wodnj\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [8]:
# Show the cluster centres exactly as KMeans fitted them. The model was trained
# on X_scaled, so these coordinates are in standardized units, NOT in the
# original feature scale.
centroids = kmeans.cluster_centers_
dfcentroids = pd.DataFrame(
    data = centroids,
    columns = X.columns,
)
display(dfcentroids)

Unnamed: 0,Close/30Day_MA,Future_5day_Percent_Change,30Day_Rolling_Volatility
0,-0.606065,-0.95858,0.14907
1,-1.013309,1.024339,1.321958
2,0.6033,0.262265,-0.430937


In [9]:
# Undo the standardization so each centre can be read in the features' own
# units (ratio to the 30-row mean, 5-row forward return, rolling volatility).
original_scale_centroids = scaler.inverse_transform(centroids)
dforiginal_scale_centroids = pd.DataFrame(
    data = original_scale_centroids,
    columns = X.columns,
)
display(dforiginal_scale_centroids)

Unnamed: 0,Close/30Day_MA,Future_5day_Percent_Change,30Day_Rolling_Volatility
0,0.980709,-0.032836,0.020458
1,0.956208,0.049259,0.030357
2,1.053467,0.017708,0.015562
