# Dimensionality Reduction Analysis

## Environment Setup

In [None]:
# Install the third-party packages this notebook imports, from the conda-forge
# channel; `-y` answers the confirmation prompt automatically.
# NOTE(review): no versions are pinned, so a fresh install may not reproduce the
# recorded outputs exactly.
%conda install -c conda-forge numpy pandas scikit-learn matplotlib seaborn -y

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import time

## Load Data

In [3]:
# Load the precomputed feature matrix together with the movieId of each row.
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open, so both arrays are materialized inside a `with` block and the handle is
# released afterwards (`data` is closed once the block exits).
with np.load('data/features.npz') as data:
    movieIds = data['movieId']
    features = data['features']

# Movie metadata: the train and test CSVs stacked into a single frame.
movies = pd.concat([pd.read_csv('data/train_complete.csv'), pd.read_csv('data/test_complete.csv')], ignore_index=True)

# Later cells look up metadata by `movieIds` and mask rows of the feature
# matrix with the result, so the two arrays must describe the same rows.
assert len(movieIds) == len(features), (
    f"movieId/feature row mismatch: {len(movieIds)} ids vs {len(features)} feature rows"
)

print(f"Loaded features: {features.shape}")
print(f"Loaded movies: {len(movies)}")

Loaded features: (9337, 3345)
Loaded movies: 9337


## Standardize Features

In [4]:
# Target dimensionality shared by the PCA and SVD cells (the LDA cell uses it
# as an upper bound).
n_components = 100

# Fit the scaler on the full feature matrix, then apply it to the same matrix.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

## PCA

In [5]:
# Reduce the standardized features to `n_components` principal components and
# record how long the fit + projection takes.
# perf_counter() is used for the interval because it is monotonic and
# high-resolution; time.time() follows the wall clock, which can be adjusted
# while the cell is running.
start = time.perf_counter()
pca = PCA(n_components=n_components, random_state=42)
features_pca = pca.fit_transform(features_scaled)
pca_time = time.perf_counter() - start

print(f"Time: {pca_time:.2f}s")
print(f"Shape: {features_pca.shape}")
# Sum over the kept components = share of total variance retained.
print(f"Explained Variance Ratio: {pca.explained_variance_ratio_.sum():.4f}")

Time: 1.23s
Shape: (9337, 100)
Explained Variance Ratio: 0.4363


## SVD

In [6]:
# Reduce the standardized features to `n_components` dimensions with truncated
# SVD and record how long the fit + projection takes.
# NOTE(review): `features_scaled` comes from a default StandardScaler, so it is
# presumably already centered; on centered input this is expected to track PCA
# closely (the recorded explained-variance outputs are nearly identical).
# perf_counter() is used for the interval because it is monotonic and
# high-resolution; time.time() follows the wall clock, which can be adjusted
# while the cell is running.
start = time.perf_counter()
svd = TruncatedSVD(n_components=n_components, random_state=42)
features_svd = svd.fit_transform(features_scaled)
svd_time = time.perf_counter() - start

print(f"Time: {svd_time:.2f}s")
print(f"Shape: {features_svd.shape}")
print(f"Explained Variance Ratio: {svd.explained_variance_ratio_.sum():.4f}")

Time: 0.75s
Shape: (9337, 100)
Explained Variance Ratio: 0.4355


## LDA

In [7]:
# Supervised reduction: LDA needs one class label per row, so each movie is
# labelled with its first listed genre and rare genres are dropped.

# Minimum number of movies a primary genre needs in order to be kept as a class.
min_movies_per_genre = 20

# Genre string for every feature row, in the same order as `movieIds`
# (NaN where reindexing finds no genre value for that movieId).
genres_series = movies.set_index('movieId')['genres'].reindex(movieIds)
genres_available = genres_series.notna()

# The first '|'-separated entry is used as the movie's single class label.
genres_split = genres_series[genres_available].str.split('|')
primary_genre = genres_split.str[0]

unique_genres = primary_genre.value_counts()
top_genres = unique_genres[unique_genres >= min_movies_per_genre].index.tolist()

genre_mask = primary_genre.isin(top_genres)

# Combine both filters into one positional row index. This avoids the chained
# boolean indexing, which copied the whole scaled matrix once per filter and
# indexed an ndarray with pandas Series, and it lets the surviving movieIds be
# kept aligned with the LDA rows.
available_rows = np.flatnonzero(genres_available.to_numpy())
lda_rows = available_rows[genre_mask.to_numpy()]

features_for_lda = features_scaled[lda_rows]
labels_for_lda = primary_genre[genre_mask]
movieIds_lda = movieIds[lda_rows]  # movieId of each row in features_lda
assert len(features_for_lda) == len(labels_for_lda) == len(movieIds_lda)

n_classes = labels_for_lda.nunique()
if n_classes < 2:
    raise ValueError(
        f"LDA needs at least 2 genre classes, found {n_classes} "
        f"with >= {min_movies_per_genre} movies"
    )
# LDA can produce at most n_classes - 1 discriminant directions.
n_components_lda = min(n_classes - 1, features_for_lda.shape[1], n_components)

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
lda = LDA(n_components=n_components_lda)
features_lda = lda.fit_transform(features_for_lda, labels_for_lda)
lda_time = time.perf_counter() - start

print(f"Time: {lda_time:.2f}s")
print(f"Shape: {features_lda.shape}")
# NOTE(review): the recorded run keeps 15 components and this sums to 1.0000,
# so the figure is not comparable to the PCA/SVD explained-variance numbers.
print(f"Explained variance: {lda.explained_variance_ratio_.sum():.4f}")
print(f"Movies used: {len(features_lda)}/{len(movieIds)}")

Time: 8.33s
Shape: (9321, 15)
Explained variance: 1.0000
Movies used: 9321/9337


## Evaluate with Clustering Metrics

In [11]:
# Score each reduced representation by clustering it with k-means and
# computing three internal cluster-quality metrics on the resulting labels.
# NOTE(review): the recorded shapes show LDA with 9321 rows x 15 dims versus
# 9337 x 100 for PCA/SVD, and LDA was fit using genre labels, so the scores are
# not a like-for-like comparison.
methods = {'PCA': features_pca, 'SVD': features_svd, 'LDA': features_lda}

results = []

for method_name, features_reduced in methods.items():
    print(f"Evaluating {method_name}...")

    # Same k, seed and number of restarts for every method.
    kmeans = KMeans(n_clusters=10, random_state=42, n_init=10).fit(features_reduced)
    labels = kmeans.labels_

    silhouette = silhouette_score(features_reduced, labels)
    davies_bouldin = davies_bouldin_score(features_reduced, labels)
    calinski_harabasz = calinski_harabasz_score(features_reduced, labels)

    row = {
        'Method': method_name,
        'Silhouette': silhouette,
        'Davies-Bouldin': davies_bouldin,
        'Calinski-Harabasz': calinski_harabasz
    }
    results.append(row)

    print(
        f"\tSilhouette: {silhouette:.4f}\n"
        f"\tDavies-Bouldin: {davies_bouldin:.4f}\n"
        f"\tCalinski-Harabasz: {calinski_harabasz:.2f}"
    )

# One row per method, columns in the order the dict keys were written.
results_df = pd.DataFrame(results)
display(results_df)

Evaluating PCA...
	Silhouette: 0.0062
	Davies-Bouldin: 4.3339
	Calinski-Harabasz: 171.93
Evaluating SVD...
	Silhouette: 0.0054
	Davies-Bouldin: 4.3600
	Calinski-Harabasz: 172.25
Evaluating LDA...
	Silhouette: 0.0842
	Davies-Bouldin: 2.0051
	Calinski-Harabasz: 417.96


Unnamed: 0,Method,Silhouette,Davies-Bouldin,Calinski-Harabasz
0,PCA,0.006236,4.333916,171.925201
1,SVD,0.005371,4.360016,172.246597
2,LDA,0.084182,2.005097,417.959045


## Rank Methods

In [12]:
# Rank the methods on each metric (rank 1 = best): the largest value ranks
# first for Silhouette and Calinski-Harabasz, the smallest for Davies-Bouldin.
rank_columns = ['Silhouette_rank', 'DB_rank', 'CH_rank']
results_df = results_df.assign(
    Silhouette_rank=results_df['Silhouette'].rank(ascending=False),
    DB_rank=results_df['Davies-Bouldin'].rank(ascending=True),
    CH_rank=results_df['Calinski-Harabasz'].rank(ascending=False),
)
# Overall score is the plain average of the three per-metric ranks.
results_df['Average_rank'] = results_df[rank_columns].sum(axis=1, skipna=False) / 3

results_df_sorted = results_df.sort_values(by='Average_rank')

print("Ranking:")
summary_columns = ['Method', 'Average_rank', 'Silhouette', 'Davies-Bouldin', 'Calinski-Harabasz']
display(results_df_sorted[summary_columns])

# The two best-ranked methods, best first.
top_2_methods = results_df_sorted['Method'].head(2).tolist()

print(f"Top 2 Methods: {top_2_methods}")

Ranking:


Unnamed: 0,Method,Average_rank,Silhouette,Davies-Bouldin,Calinski-Harabasz
2,LDA,1.0,0.084182,2.005097,417.959045
0,PCA,2.333333,0.006236,4.333916,171.925201
1,SVD,2.666667,0.005371,4.360016,172.246597


Top 2 Methods: ['LDA', 'PCA']
