In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Load the raw dataset from the CSV file that sits next to the notebook.
dataset_path = "./data/Dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Count the missing values in each column.
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Replace missing Cholesterol/100g and Sodium/100g entries with 0.
zero_fill_columns = ['Cholesterol/100g', 'Sodium/100g']
df[zero_fill_columns] = df[zero_fill_columns].fillna(0)

In [None]:
# Re-count missing values per column to confirm the imputation worked.
remaining_nulls = df.isna().sum()
print(remaining_nulls)

In [None]:
# Standardise the feature columns with StandardScaler. The text column
# "Food" is dropped first because it is not numeric; the remaining columns
# are assumed to be numeric (TODO confirm against the dataset).
features = df.drop(columns=['Food'])
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)


In [None]:
# Fit a PCA that keeps every component (n_components=None is the default)
# so the full explained-variance profile can be inspected below. The cell's
# last expression displays the transformed data.
pca = PCA(n_components=None)
pca.fit_transform(scaled_features)

In [None]:
# Fraction of total variance captured by each principal component.
explained_variance_ratio = pca.explained_variance_ratio_

# Running total of that fraction as components are added one by one.
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

In [None]:
# Plot cumulative explained variance against the number of components,
# with reference lines at 95% (red) and 99% (green).
component_counts = range(1, len(cumulative_explained_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='--')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Explained Variance by Number of Components')
ax.axhline(y=0.95, color='r', linestyle='-')
ax.axhline(y=0.99, color='g', linestyle='-')
plt.show()

In [None]:
# Report the share of variance explained by each component as a percentage.
for component_number, evr in enumerate(explained_variance_ratio, start=1):
    print(f"Principal Component {component_number} variance explained: {evr*100:.2f}%")

## The first large drop in the proportion of variance explained is between PC2 and PC3, so try keeping 2 principal components.

In [None]:
# Refit PCA keeping only the first two components and project the
# standardised data onto them.
n_components = 2
pca = PCA(n_components)
principal_components = pca.fit_transform(scaled_features)

In [None]:
# Put the component scores into a DataFrame (columns PC1, PC2, ...) and
# attach the food names alongside them.
pca_columns = [f'PC{k}' for k in range(1, n_components + 1)]
component_scores = pd.DataFrame(principal_components, columns=pca_columns)
pca_df = pd.concat([df[['Food']], component_scores], axis=1)

In [None]:
#pca_df.to_csv('data_with_principal')

In [None]:
# Scatter the foods in the PC1/PC2 plane and label every point with its name.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(pca_df['PC1'], pca_df['PC2'], c='blue')
for pc1, pc2, food in zip(pca_df['PC1'], pca_df['PC2'], pca_df['Food']):
    ax.text(pc1, pc2, food)

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA of Food Nutritional Data')
ax.grid()
plt.show()

In [None]:
# Each row of pca.components_ holds one component's weights on the original
# (standardised) features; show them indexed by feature name.
loadings_pc1, loadings_pc2 = pca.components_[0], pca.components_[1]

loadings_df_pc1 = pd.DataFrame({'PC1': loadings_pc1}, index=features.columns)
loadings_df_pc2 = pd.DataFrame({'PC2': loadings_pc2}, index=features.columns)

print("Loadings for PC1:")
print(loadings_df_pc1)
print("\n")
print("Loadings for PC2:")
print(loadings_df_pc2)


In [None]:
# Display the table of food names with their principal component scores.
pca_df

In [None]:
# K-Means with 3 clusters on the 2-D PCA scores; random_state pins the seed.
n_clusters = 3
kmeans1 = KMeans(n_clusters=n_clusters, random_state=23).fit(principal_components)
pca_df['Cluster_3'] = kmeans1.labels_

In [None]:
# Scatter the PCA scores coloured by 3-cluster assignment (point labels are
# left off).
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster_3'], cmap='viridis', marker='o')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA and K-Means 3 Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid(True)
plt.show()

In [None]:
# Print the foods in each of the 3 clusters (shown with 1-based numbering).
for cluster_id in range(n_clusters):
    in_cluster = pca_df['Cluster_3'] == cluster_id
    food_list = ', '.join(pca_df.loc[in_cluster, 'Food'])
    print(f"Records in Cluster {cluster_id + 1}: {food_list}\n")

In [None]:
# Fit 3-cluster K-Means on the PCA scores (same settings as the earlier
# 3-cluster cell) and store the labels in 'Cluster_3'.
kmeans1 = KMeans(n_clusters=3, random_state=23).fit(principal_components)
pca_df['Cluster_3'] = kmeans1.labels_

In [None]:
# Print the foods in each cluster of the 3-cluster solution, using the raw
# 0-based K-Means labels.
# This cell previously read 'Cluster_4', a column that is only created in a
# later cell, so a top-to-bottom run failed with KeyError. It now reads the
# 'Cluster_3' column fitted just above, and iterates over the labels that
# are actually present rather than over the mutable n_clusters variable.
cluster_column = 'Cluster_3'
for cluster_id in sorted(pca_df[cluster_column].unique()):
    cluster_records = pca_df.loc[pca_df[cluster_column] == cluster_id, 'Food']
    print(f"Records in Cluster {cluster_id}: {', '.join(cluster_records)}")

In [None]:
# K-Means with 5 clusters on the 2-D PCA scores; random_state pins the seed.
n_clusters = 5
kmeans3 = KMeans(n_clusters=n_clusters, random_state=23).fit(principal_components)
pca_df['Cluster_5'] = kmeans3.labels_

In [None]:
# Scatter the PCA scores coloured by 5-cluster assignment (point labels are
# left off).
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster_5'], cmap='viridis', marker='o')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA and K-Means 5 Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid(True)
plt.show()

In [None]:
# Print the foods in each of the 5 clusters (shown with 1-based numbering).
for cluster_id in range(n_clusters):
    in_cluster = pca_df['Cluster_5'] == cluster_id
    food_list = ', '.join(pca_df.loc[in_cluster, 'Food'])
    print(f"Records in Cluster {cluster_id+1}: {food_list}\n")

In [None]:
# K-Means with 4 clusters on the 2-D PCA scores; random_state pins the seed.
# Note: this rebinds kmeans1, replacing the earlier 3-cluster model object.
n_clusters = 4
kmeans1 = KMeans(n_clusters=n_clusters, random_state=23).fit(principal_components)
pca_df['Cluster_4'] = kmeans1.labels_

In [None]:
# Scatter the PCA scores coloured by 4-cluster assignment (point labels are
# left off).
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster_4'], cmap='viridis', marker='o')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA and K-Means 4 Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid(True)
plt.show()

In [None]:
# K-Means with 6 clusters on the 2-D PCA scores; random_state pins the seed.
# n_clusters is updated here (it was previously left at the value set by an
# earlier cell) so that later cells looping over range(n_clusters) cover all
# six clusters instead of silently dropping some.
n_clusters = 6
kmeans4 = KMeans(n_clusters=n_clusters, random_state=23)
kmeans4.fit(principal_components)
pca_df['Cluster_6'] = kmeans4.labels_

In [None]:
# Scatter the PCA scores coloured by 6-cluster assignment (point labels are
# left off).
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster_6'], cmap='viridis', marker='o')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA and K-Means 6 Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid(True)
plt.show()

In [None]:
# Print the foods in each cluster of the 4-cluster solution (shown with
# 1-based numbering).
# This cell previously read 'Cluster_7', a column that is only created in a
# later cell, so a top-to-bottom run failed with KeyError. The 4-cluster
# labels already exist at this point and had no listing of their own.
# NOTE(review): confirm 'Cluster_4' is the intended column here.
# Iterating over the labels present in the column keeps the listing complete
# regardless of the current value of n_clusters.
cluster_column = 'Cluster_4'
for cluster_id in sorted(pca_df[cluster_column].unique()):
    cluster_records = pca_df.loc[pca_df[cluster_column] == cluster_id, 'Food']
    print(f"Records in Cluster {cluster_id + 1}: {', '.join(cluster_records)}\n")

In [None]:
# Print the foods in each cluster of the 6-cluster solution, using the raw
# 0-based K-Means labels.
# The loop runs over the labels actually present in 'Cluster_6' instead of
# range(n_clusters): n_clusters can still hold the value from an earlier
# fit with fewer clusters, which would leave the last clusters unprinted.
cluster_column = 'Cluster_6'
for cluster_id in sorted(pca_df[cluster_column].unique()):
    cluster_records = pca_df.loc[pca_df[cluster_column] == cluster_id, 'Food']
    print(f"Records in Cluster {cluster_id}: {', '.join(cluster_records)}")

In [None]:
# K-Means with 7 clusters on the 2-D PCA scores; random_state pins the seed.
n_clusters = 7
kmeans5 = KMeans(n_clusters=n_clusters, random_state=23).fit(principal_components)
pca_df['Cluster_7'] = kmeans5.labels_

In [None]:
# Scatter the PCA scores coloured by 7-cluster assignment (point labels are
# left off).
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster_7'], cmap='viridis', marker='o')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA and K-Means 7 Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid(True)
plt.show()

In [None]:
# Print the foods in each of the 7 clusters (shown with 1-based numbering).
for cluster_id in range(n_clusters):
    in_cluster = pca_df['Cluster_7'] == cluster_id
    food_list = ', '.join(pca_df.loc[in_cluster, 'Food'])
    print(f"Records in Cluster {cluster_id + 1}: {food_list}\n")

In [None]:
# K-Means with 8 clusters on the 2-D PCA scores; random_state pins the seed.
n_clusters = 8
kmeans6 = KMeans(n_clusters=n_clusters, random_state=23).fit(principal_components)
pca_df['Cluster_8'] = kmeans6.labels_

In [None]:
# Scatter the PCA scores coloured by 8-cluster assignment (point labels are
# left off).
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster_8'], cmap='viridis', marker='o')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA and K-Means 8 Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid(True)
plt.show()

In [None]:
# Print the foods in each of the 8 clusters (shown with 1-based numbering).
for cluster_id in range(n_clusters):
    in_cluster = pca_df['Cluster_8'] == cluster_id
    food_list = ', '.join(pca_df.loc[in_cluster, 'Food'])
    print(f"Records in Cluster {cluster_id + 1}: {food_list}\n")