In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt


In [None]:
# Load dataset
# The relative path is resolved against the notebook's current working
# directory, so the CSV has to be located there.
df = pd.read_csv("kaggle_Interests_group.csv")


In [None]:
# Drop columns with many missing values
def drop_missing_values_columns(df, threshold):
    """Return ``df`` without the columns that contain too many missing values.

    The frame passed in is left untouched; a new DataFrame is returned, so
    re-running the calling cell on the raw data always gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    threshold : int
        A column is dropped when its number of NaN values is strictly
        greater than this count.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns whose NaN count is <= threshold.
    """
    # One vectorised pass gives the NaN count of every column at once.
    missing_counts = df.isna().sum()
    cols_to_drop = missing_counts[missing_counts > threshold].index
    return df.drop(columns=cols_to_drop)


# Remove every column that has more than 5000 missing values.
df = drop_missing_values_columns(df, 5000)


In [None]:
# Fill missing values with 1 - median
# Every column except the "group" label is imputed with (1 - column median).
# NOTE(review): presumably the interest columns are 0/1 style flags, so this
# fills gaps with the opposite of the typical value - confirm with the data.
columns = df.columns.tolist()
columns.remove("group")
for col in columns:
    median = df[col].median()
    # Assign the filled Series back to the frame. Calling
    # df[col].fillna(..., inplace=True) works on an intermediate object:
    # pandas warns about it and, with Copy-on-Write, df is never updated.
    df[col] = df[col].fillna(1 - median)


In [None]:
# Encode 'group' column
# LabelEncoder replaces each distinct value in "group" with an integer code;
# the fitted encoder stays available as `le` afterwards.
le = LabelEncoder()
df["group"] = le.fit_transform(df["group"])


In [None]:
# Select relevant features
# .copy() makes the selection an independent frame, so the column assignment
# below modifies it directly instead of raising SettingWithCopyWarning.
df = df[["group", "grand_tot_interests", "interest47"]].copy()

# Scale 'grand_tot_interests' using MinMaxScaler
scaler = MinMaxScaler()
# fit_transform returns an (n, 1) array; ravel() flattens it to one value
# per row before it is stored as a single column.
df["grand_tot_interests"] = scaler.fit_transform(
    df[["grand_tot_interests"]]).ravel()

# Extract features and labels
X = df.drop("group", axis=1)  # the two feature columns
y = df["group"]               # encoded group label


In [None]:
# Standardize features before applying PCA
# NOTE: `scaler` is rebound here, replacing the MinMaxScaler fitted earlier.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA to reduce dimensions
# NOTE(review): X holds two feature columns (grand_tot_interests, interest47),
# so n_components=2 keeps every component - the data is rotated onto its
# principal axes rather than reduced. Confirm this is intended.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)


In [None]:
# Plot the share of variance captured by each principal component
component_numbers = range(1, len(pca.explained_variance_ratio_) + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_numbers, pca.explained_variance_ratio_, marker='o')
ax.set_title('Explained Variance by Principal Components')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
plt.show()


In [None]:
# KMeans Clustering and Elbow Method
# The sweep runs on X_pca, the same matrix the final KMeans model is fitted
# on, so the elbow describes the feature space that is actually clustered.
# random_state and n_init are pinned so that re-running the cell (or running
# it on another scikit-learn version) reproduces the same curve.
cluster_counts = range(2, 12)
inertias = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42).fit(X_pca)
    # inertia_ is the within-cluster sum of squared distances for this k
    inertias.append(kmeans.inertia_)
plt.plot(cluster_counts, inertias, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()


In [None]:
# KMeans with optimal clusters
# k=3 is the cluster count chosen for this analysis. random_state and n_init
# are fixed so the labels, the scatter plot and the silhouette score are the
# same on every run instead of depending on the random initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X_pca)

# Cluster index assigned to each row of X_pca
kmeans_labels = kmeans.labels_


In [None]:
# Scatter the PCA-projected samples coloured by KMeans label, then overlay
# the cluster centres in red.
fig, ax = plt.subplots()
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis')
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c="red")
ax.set_title('KMeans Clustering with PCA')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()


In [None]:
# Calculate and print the silhouette score for KMeans
# The score is computed on X_pca, the same matrix the KMeans model was
# fitted on, using the labels that model produced.
sil_score_kmeans = silhouette_score(X_pca, kmeans_labels)
print(f"Silhouette Score for KMeans: {sil_score_kmeans}")


In [None]:
# Hierarchical Clustering
# Standardize every column after the first one, build the Ward linkage
# matrix and draw the resulting dendrogram.
feature_columns = df.iloc[:, 1:]
X_scaled = StandardScaler().fit_transform(feature_columns)
linked = linkage(X_scaled, method='ward')
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(linked, ax=ax)
ax.set_title('Dendrogram for Hierarchical Clustering')
ax.set_xlabel('Samples')
ax.set_ylabel('Euclidean distances')
plt.show()


In [None]:
# Fit AgglomerativeClustering
n_clusters = 3  # Assuming 3 clusters from dendrogram
# The distance keyword is deliberately left out: `affinity` was removed from
# scikit-learn (renamed to `metric`), and Ward linkage only supports
# Euclidean distance, which is already the default. Omitting it keeps the
# cell working on both old and new scikit-learn releases.
agg_clust = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
# Cluster index assigned to each row of X_scaled
labels = agg_clust.fit_predict(X_scaled)


In [None]:
# Scatter the first two standardized feature columns, coloured by the
# agglomerative cluster label of each sample.
fig, ax = plt.subplots()
ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels, cmap='viridis')
ax.set_title('Hierarchical Clustering')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
plt.show()


In [None]:
# Calculate and print the silhouette score
# Scored on X_scaled, the standardized features the agglomerative model was
# fitted on, using that model's cluster labels.
sil_score_hierarchical = silhouette_score(X_scaled, labels)
print(
    f"Silhouette Score for Hierarchical Clustering: {sil_score_hierarchical}")
