In [1]:
from pathlib import Path
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings 
warnings.filterwarnings("ignore")

In [2]:
testData = pd.read_csv(
   Path("resources/foodPrice.csv"))

In [3]:
testData.tail(10)

Unnamed: 0,Cereals-and-bakery-products,"Meats,-poultry,-fish,-and-eggs",Dairy-and-related-products,Fruits-and-vegetables,Other-food-at-home,Date
516,349.294,322.737,272.04,351.029,264.746,2023-01-01
517,350.772,321.803,272.271,351.249,266.612,2023-02-01
518,353.866,318.306,271.384,345.814,268.567,2023-03-01
519,353.742,318.996,271.102,346.269,269.657,2023-04-01
520,354.195,315.968,269.573,350.986,269.999,2023-05-01
521,355.074,315.561,268.338,350.724,269.995,2023-06-01
522,356.377,317.445,269.143,351.646,271.276,2023-07-01
523,356.563,318.944,268.281,350.428,271.648,2023-08-01
524,355.576,320.14,268.377,351.02,271.561,2023-09-01
525,355.752,322.536,268.326,351.952,272.215,2023-10-01


In [4]:
testData.describe()

Unnamed: 0,Cereals-and-bakery-products,"Meats,-poultry,-fish,-and-eggs",Dairy-and-related-products,Fruits-and-vegetables,Other-food-at-home
count,526.0,526.0,526.0,526.0,526.0
mean,198.894451,177.208759,166.879658,213.989567,161.562375
std,69.116951,62.215261,49.43455,76.93223,44.473483
min,79.7,87.8,87.3,76.0,83.0
25%,142.875,130.475,125.4,150.275,125.725
50%,195.6,161.95,166.45,214.75,160.4
75%,268.351,232.4215,216.125,286.84025,204.61325
max,356.563,322.737,272.271,351.952,272.215


In [5]:
testData.hvplot.line(
    width=800,
    height=400,
    rot=90
)

In [18]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
testDataScaled = StandardScaler().fit_transform(testData[['Cereals-and-bakery-products', 'Meats,-poultry,-fish,-and-eggs',	'Dairy-and-related-products','Fruits-and-vegetables', 'Other-food-at-home']])
test_scaled[0:5]

# Create a DataFrame with the scaled data
testDataScaled = pd.DataFrame(
    testDataScaled,
    columns=['Cereals-and-bakery-products', 'Meats,-poultry,-fish,-and-eggs',	'Dairy-and-related-products','Fruits-and-vegetables', 'Other-food-at-home'])

testDataScaled['date'] = testData.Date

testDataScaled = testDataScaled.set_index('date')

testDataScaled.tail(10)

Unnamed: 0_level_0,Cereals-and-bakery-products,"Meats,-poultry,-fish,-and-eggs",Dairy-and-related-products,Fruits-and-vegetables,Other-food-at-home
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,2.178087,2.341335,2.129289,1.782996,2.322324
2023-02-01,2.199491,2.326308,2.133966,1.785859,2.364322
2023-03-01,2.244299,2.270047,2.116006,1.715145,2.408322
2023-04-01,2.242503,2.281148,2.110296,1.721065,2.432855
2023-05-01,2.249063,2.232432,2.079337,1.782437,2.440552
2023-06-01,2.261793,2.225884,2.054331,1.779028,2.440462
2023-07-01,2.280663,2.256195,2.070631,1.791024,2.469293
2023-08-01,2.283356,2.280311,2.053177,1.775177,2.477666
2023-09-01,2.269063,2.299553,2.055121,1.782879,2.475708
2023-10-01,2.271612,2.338101,2.054088,1.795005,2.490427


In [16]:
# Create a list with the number of k-values from 1 to 11
# Create an empty list to store the inertia values
inertia = []
k = list(range(1, 11))

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    kModel = KMeans(n_clusters=i, random_state=1)
    kModel.fit(testDataScaled)
    inertia.append(kModel.inertia_)

# Create a dictionary with the data to plot the Elbow curve
# Create a DataFrame with the data to plot the Elbow curve
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbowData = {'k': k, 'inertia': inertia}
elbowDataDF = pd.DataFrame(elbowData)
elbowPlot = elbowDataDF.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Curve',
    xticks=k
)
elbowPlot

In [17]:
# Initialize the K-Means model using the best value for k
# Fit the K-Means model using the scaled data
# Predict the clusters to group the scaled data
# Print the resulting array of cluster values.
model = KMeans(n_clusters=3)
modelClusters = model.fit_predict(testDataScaled)
print(modelClusters)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [19]:
# Create a copy of the DataFrame
# Add a new column to the DataFrame with the predicted clusters
# Display sample data
testDataScaledPredictions = testDataScaled.copy()
testDataScaledPredictions['predicted_clusters'] = modelClusters
testDataScaledPredictions.head()

Unnamed: 0_level_0,Cereals-and-bakery-products,"Meats,-poultry,-fish,-and-eggs",Dairy-and-related-products,Fruits-and-vegetables,Other-food-at-home,predicted_clusters
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-01-01,-1.726174,-1.396625,-1.611331,-1.788853,-1.768181,2
1980-02-01,-1.713141,-1.407887,-1.601207,-1.795358,-1.738922,2
1980-03-01,-1.704451,-1.398234,-1.595132,-1.777143,-1.711914,2
1980-04-01,-1.687073,-1.414322,-1.578934,-1.740713,-1.691658,2
1980-05-01,-1.675487,-1.435238,-1.546537,-1.715992,-1.671402,2


In [20]:
# Create a scatter plot using hvPlot by setting 
# Color the graph points with the labels found using K-Means and 
# add the date in the `hover_cols` parameter to identify 
testDataScaledPredictionsPlot = testDataScaledPredictions.hvplot.scatter(
    x='Cereals-and-bakery-products',
    y='Other-food-at-home',
    by='predicted_clusters',
    hover_cols=['date']
)
testDataScaledPredictionsPlot

In [26]:
# Create a PCA model instance and set `n_components=3`.
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
# View the first five rows of the DataFrame. 
pca = PCA(n_components=3)
pcaPrice = pca.fit_transform(testDataScaledPredictions)
pcaPrice[:5]

array([[-3.83878152, -0.16233271, -0.33644473],
       [-3.82409105, -0.15917138, -0.33102416],
       [-3.79475293, -0.1501056 , -0.32513132],
       [-3.76340924, -0.15335103, -0.28741836],
       [-3.73462334, -0.15691012, -0.25090504]])

In [27]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
import numpy as np
total_explained_var = np.sum(pca.explained_variance_ratio_)
total_explained_var

0.9968265668635926

In [29]:
# Create a new DataFrame with the PCA data.
# Creating a DataFrame with the PCA data
# Copy the crypto names from the original data
# Set the coinid column as index
# Display sample data
pcaPriceDf = pd.DataFrame(
    pcaPrice,
    columns=['PCA1', 'PCA2', 'PCA3'])

pcaPriceDf['date'] = testData.Date

pcaPriceDf = pcaPriceDf.set_index('date')

pcaPriceDf.head()

Unnamed: 0_level_0,PCA1,PCA2,PCA3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980-01-01,-3.838782,-0.162333,-0.336445
1980-02-01,-3.824091,-0.159171,-0.331024
1980-03-01,-3.794753,-0.150106,-0.325131
1980-04-01,-3.763409,-0.153351,-0.287418
1980-05-01,-3.734623,-0.15691,-0.250905


In [30]:
# Create a list with the number of k-values from 1 to 11
# Create an empty list to store the inertia values
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list

inertia = []
k = list(range(1, 11))

for i in k:
    kModelPCA = KMeans(n_clusters=i, random_state=0)
    kModelPCA.fit(price_pca_df)
    inertia.append(kModelPCA.inertia_)

# Create a dictionary with the data to plot the Elbow curve
# Create a DataFrame with the data to plot the Elbow curve

pcaElbowData = {'k': k, 'inertia': inertia}
pcaElbowDF = pd.DataFrame(pcaElbowData)
pcaElbowDF.head()

Unnamed: 0,k,inertia
0,1,2985.069169
1,2,774.784969
2,3,304.725174
3,4,197.252356
4,5,130.434147


In [31]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
pcaElbowDataPlot = pcaElbowDF.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Curve',
    xticks=k
)
pcaElbowDataPlot

In [32]:
# Initialize the K-Means model using the best value for k
# Fit the K-Means model using the PCA data
# Predict the clusters to group the PCA data
# Print the resulting array of cluster values.

model = KMeans(n_clusters=3, random_state=0)
pcaModelClusters = model.fit_predict(pcaPriceDf)
pcaModelClusters

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [33]:
# Create a copy of the DataFrame with the PCA data
# Add a new column to the DataFrame with the predicted clusters
# Display sample data

pcaPriceDfPredictions = pcaPriceDf.copy()
pcaPriceDfPredictions['predicted_clusters'] = pcaModelClusters
pcaPriceDfPredictions.head()

Unnamed: 0_level_0,PCA1,PCA2,PCA3,predicted_clusters
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-01-01,-3.838782,-0.162333,-0.336445,2
1980-02-01,-3.824091,-0.159171,-0.331024,2
1980-03-01,-3.794753,-0.150106,-0.325131,2
1980-04-01,-3.763409,-0.153351,-0.287418,2
1980-05-01,-3.734623,-0.15691,-0.250905,2


In [43]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`. 
# Color the graph points with the labels found using K-Means and 
# add the date name in the `hover_cols` parameter to identify 
pcaPriceDfPredictionsPlot = pcaPriceDfPredictions.hvplot.scatter(
    x='PCA1',
    y='PCA2',
    z='PCA3',
    by='predicted_clusters',
    hover_cols=['date']
)
pcaPriceDfPredictionsPlot



In [36]:
# alernative plot option after research
import plotly.express as px

pcaPriceDfPredictionsPlot = pcaPriceDfPredictions.reset_index()
fig = px.scatter_3d(pcaPriceDfPredictions, x='PCA1', y='PCA2', z='PCA3',
                    color='predicted_clusters')

fig.show()

In [37]:
# Composite plot to contrast the Elbow curves
elbowPlot + pcaElbowDataPlot

In [45]:
# Composite plot to contrast the clusters
testDataScaledPredictionsPlot + pcaPriceDfPredictionsPlot