# Dimensionality Reduction Analysis

## Environment Setup

In [None]:
# Install the packages this notebook imports from the conda-forge channel;
# -y answers the confirmation prompt automatically.
# NOTE(review): no versions are pinned, so a fresh install may not reproduce
# the recorded outputs exactly — consider pinning.
%conda install -c conda-forge numpy pandas scikit-learn matplotlib seaborn -y

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import time

## Load Data

In [4]:
# Read the precomputed feature matrix and the movie ids stored alongside it.
# np.load returns a lazy NpzFile for .npz archives; the context manager closes
# the underlying file once both arrays have been read into memory, so `data`
# must not be indexed after this block.
with np.load('data/features.npz') as data:
    movieIds = data['movieId']
    features = data['features']

# Movie metadata is split across the train and test CSVs; stack them into one frame.
movies = pd.concat([pd.read_csv('data/train_complete.csv'), pd.read_csv('data/test_complete.csv')], ignore_index=True)

# Later cells build row masks for the feature matrix from `movieIds`, which only
# works if there is exactly one id per feature row.
assert len(movieIds) == features.shape[0], "movieId and features have different row counts"

print(f"Loaded features: {features.shape}")
print(f"Loaded movies: {len(movies)}")

Loaded features: (9337, 3345)
Loaded movies: 9337


## Standardize Features

In [5]:
# Target dimensionality shared by the PCA, SVD and LDA cells.
n_components = 100

# Learn the per-column scaling statistics from `features`, then apply them to
# that same matrix; the fitted `scaler` is kept for later reuse.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

## PCA

In [11]:
# Project the standardised features onto `n_components` principal components
# and record how long construction + fitting takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
pca = PCA(n_components=n_components, random_state=42)
features_pca = pca.fit_transform(features_scaled)
end = time.perf_counter()
pca_time = end - start

print(f"Time: {pca_time:.2f}s")
print(f"Shape: {features_pca.shape}")
# Sum over the retained components = share of total variance kept by the projection.
print(f"Explained Variance Ratio: {pca.explained_variance_ratio_.sum():.4f}")

Time: 1.08s
Shape: (9337, 100)
Explained Variance Ratio: 0.4363


## SVD

In [12]:
# Reduce the standardised features with truncated SVD and record how long
# construction + fitting takes.
# NOTE(review): TruncatedSVD does not centre its input itself; the input here
# comes from StandardScaler, so it should already be centred — confirm if the
# scaler configuration changes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
svd = TruncatedSVD(n_components=n_components, random_state=42)
features_svd = svd.fit_transform(features_scaled)
end = time.perf_counter()
svd_time = end - start

print(f"Time: {svd_time:.2f}s")
print(f"Shape: {features_svd.shape}")
# Sum over the retained components = share of total variance kept by the projection.
print(f"Explained Variance Ratio: {svd.explained_variance_ratio_.sum():.4f}")

Time: 0.90s
Shape: (9337, 100)
Explained Variance Ratio: 0.4355


## LDA

In [13]:
# LDA is supervised, so each feature row needs a class label. Look up the genre
# string for every id in `movieIds`; ids with no match in `movies` become NaN.
genres_series = movies.set_index('movieId')['genres'].reindex(movieIds)
genres_available = genres_series.notna()

# Genre strings are '|'-separated; the first entry is used as the single class label.
genres_split = genres_series[genres_available].str.split('|')
primary_genre = genres_split.str[0]

# Keep only genres that appear as the primary genre of at least 50 movies.
unique_genres = primary_genre.value_counts()
top_genres = unique_genres[unique_genres >= 50].index.tolist()

genre_mask = primary_genre.isin(top_genres)

# Two-stage positional filter on the feature matrix: first the rows that have a
# genre, then (within those) the rows whose primary genre is frequent enough.
# The pandas masks are converted to plain boolean arrays explicitly because
# `features_scaled` is indexed by position, not by label.
features_for_lda = features_scaled[genres_available.to_numpy()][genre_mask.to_numpy()]
labels_for_lda = primary_genre[genre_mask]

# LDA can produce at most (n_classes - 1) discriminant axes; also cap by the
# feature count and by the shared `n_components` target.
n_classes = labels_for_lda.nunique()
n_components_lda = min(n_classes - 1, features_for_lda.shape[1], n_components)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
lda = LDA(n_components=n_components_lda)
features_lda = lda.fit_transform(features_for_lda, labels_for_lda)
end = time.perf_counter()
lda_time = end - start

print(f"Time: {lda_time:.2f}s")
print(f"Shape: {features_lda.shape}")
# NOTE(review): when n_components_lda equals n_classes - 1 every discriminant
# axis is kept, so this sum is 1.0 by construction and is not comparable to the
# PCA/SVD explained-variance figures.
print(f"Explained Variance Ratio: {np.sum(lda.explained_variance_ratio_):.4f}")
print(f"Movies used: {len(features_for_lda)}/{len(movieIds)}")

Time: 7.48s
Shape: (9153, 10)
Explained Variance Ratio: 1.0000
Movies used: 9153/9337
