# Clustering

## Setup

In [None]:
%conda install -c conda-forge numpy pandas matplotlib seaborn scipy

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy.spatial.distance import cdist
from scipy.stats import multivariate_normal
import time

In [None]:
# Poster feature matrices, one archive per dimensionality-reduction method.
# Each archive stores the feature array together with the aligned movie ids.
data_pca = np.load('data/features_pca.npz')
features_pca, movieIds_pca = data_pca['features_pca'], data_pca['movieIds_pca']

data_svd = np.load('data/features_svd.npz')
features_svd, movieIds_svd = data_svd['features_svd'], data_svd['movieIds_svd']

data_lda = np.load('data/features_lda.npz')
features_lda, movieIds_lda = data_lda['features_lda'], data_lda['movieIds_lda']

# Movie metadata: train rows followed by test rows, re-indexed from 0.
movies = pd.concat(
    [
        pd.read_csv('data/train_complete.csv'),
        pd.read_csv('data/test_complete.csv'),
    ],
    ignore_index=True,
)

## Selected Algorithm

1. K-means (Partitioning method)
2. Gaussian Mixture Model - GMM (Distribution-based method)

### K-means

- Industry standard for image clustering
- Fast and scalable for large datasets
- Works well when clusters are spherical and similar in size
- Easy to interpret: each movie belongs to exactly one cluster
- Suitable for poster features where visual styles form distinct groups

In [None]:
class KMeans:
    """K-means clustering (Lloyd's algorithm) with k-means++ seeding.

    Parameters
    ----------
    n_clusters : int
        Number of clusters / centroids.
    max_iters : int
        Maximum number of assignment/update iterations.
    tol : float
        Absolute tolerance passed to ``np.allclose`` when comparing the
        centroids of two consecutive iterations.
    random_state : int or None
        Seed handed to ``np.random.seed`` at the start of ``fit``.

    Attributes (set by ``fit``)
    ---------------------------
    centroids : ndarray of shape (n_clusters, n_features)
    labels_ : ndarray of shape (n_samples,)
        Index of the nearest centroid for every training sample.
    inertia_ : float
        Sum of squared distances of samples to their assigned centroid.
    """

    def __init__(self, n_clusters=10, max_iters=100, tol=1e-4, random_state=42):
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        self.tol = tol
        self.random_state = random_state

    def fit(self, X):
        """Cluster ``X`` (2-D array, one row per sample) and return ``self``."""
        np.random.seed(self.random_state)
        n_samples = X.shape[0]

        self.centroids = self._kmeans_plus_plus(X)
        self.labels_ = np.zeros(n_samples, dtype=int)
        self.inertia_ = 0.0

        for iteration in range(self.max_iters):
            self.labels_ = self.predict(X)
            new_centroids = self._update_centroids(X)

            if np.allclose(self.centroids, new_centroids, atol=self.tol):
                print(f"Converged at iteration {iteration + 1}")
                break

            self.centroids = new_centroids
        else:
            # Iteration budget exhausted (or zero): the centroids moved after
            # the last assignment, so refresh the labels to keep labels_,
            # inertia_ and predict(X) consistent with the final centroids.
            self.labels_ = self.predict(X)

        self.inertia_ = np.sum((X - self.centroids[self.labels_])**2)

        return self

    def _update_centroids(self, X):
        """Return the mean of each cluster's members under ``self.labels_``.

        A cluster without members keeps its previous centroid; taking the
        mean of an empty slice would yield NaN and poison later iterations.
        """
        updated = []
        for k in range(self.n_clusters):
            members = X[self.labels_ == k]
            if len(members):
                updated.append(members.mean(axis=0))
            else:
                updated.append(self.centroids[k])
        return np.array(updated)

    def _kmeans_plus_plus(self, X):
        """Pick ``n_clusters`` initial centroids from the rows of ``X``.

        The first centroid is drawn uniformly; each subsequent one is drawn
        with probability proportional to the squared distance to the nearest
        centroid chosen so far.
        """
        n_samples = X.shape[0]
        centroids = []

        first_idx = np.random.randint(n_samples)
        centroids.append(X[first_idx])

        for _ in range(1, self.n_clusters):
            distances = cdist(X, np.array(centroids), metric='euclidean')
            min_distances = np.min(distances, axis=1)
            probabilities = min_distances ** 2
            total = probabilities.sum()
            if total > 0:
                probabilities /= total
            else:
                # Every sample coincides with an already chosen centroid
                # (duplicates, or more clusters than distinct points):
                # fall back to a uniform draw instead of dividing by zero.
                probabilities = np.full(n_samples, 1.0 / n_samples)

            next_idx = np.random.choice(n_samples, p=probabilities)
            centroids.append(X[next_idx])

        return np.array(centroids)

    def predict(self, X):
        """Return the index of the nearest centroid for every row of ``X``."""
        distances = cdist(X, self.centroids, metric='euclidean')
        return np.argmin(distances, axis=1)

    def fit_predict(self, X):
        """Fit on ``X`` and return the training labels."""
        self.fit(X)
        return self.labels_

### GMM

- Probabilistic assignments: captures uncertainty in cluster membership
- Flexible cluster shapes: can model elliptical clusters
- Better for overlapping styles: a poster can have mixed characteristics
- Natural for movie posters: genres often blend (action-comedy, sci-fi-drama, etc.)
- Provides probability scores useful for recommendations

In [None]:
class GaussianMixtureModel:
    """Full-covariance Gaussian mixture model fitted with EM.

    Densities are evaluated in log space so that samples far from every
    component (or high-dimensional data) do not underflow to zero.

    Parameters
    ----------
    n_components : int
        Number of Gaussian components.
    max_iters : int
        Maximum number of EM iterations.
    tol : float
        EM stops when the mean per-sample log-likelihood changes by less
        than this amount between two consecutive iterations.
    random_state : int or None
        Seed handed to ``np.random.seed`` at the start of ``fit``.

    Attributes (set by ``fit``)
    ---------------------------
    weights_ : ndarray of shape (n_components,)
    means_ : ndarray of shape (n_components, n_features)
    covariances_ : ndarray of shape (n_components, n_features, n_features)
    labels_ : ndarray of shape (n_samples,)
        Most responsible component for every training sample.
    """

    def __init__(self, n_components=10, max_iters=100, tol=1e-4, random_state=42):
        self.n_components = n_components
        self.max_iters = max_iters
        self.tol = tol
        self.random_state = random_state

    def fit(self, X):
        """Run EM on ``X`` (2-D array, one row per sample) and return ``self``."""
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape

        # Uniform mixing weights, means at distinct random samples,
        # identity covariances.
        self.weights_ = np.ones(self.n_components) / self.n_components

        indices = np.random.choice(n_samples, self.n_components, replace=False)
        self.means_ = X[indices]

        self.covariances_ = np.array([np.eye(n_features) for _ in range(self.n_components)])

        # -inf (not 0) so the first iteration can never look converged just
        # because the first log-likelihood happens to be close to zero.
        log_likelihood_old = -np.inf

        for iteration in range(self.max_iters):
            responsibilities = self._e_step(X)
            self._m_step(X, responsibilities)
            log_likelihood = self._compute_log_likelihood(X)

            if abs(log_likelihood - log_likelihood_old) < self.tol:
                print(f"Converged at iteration {iteration + 1}")
                break

            log_likelihood_old = log_likelihood

        self.labels_ = self.predict(X)

        return self

    def _weighted_log_prob(self, X):
        """Return ``log(w_k) + log N(x_i | mu_k, Sigma_k)`` as an (n, K) array.

        A component whose density cannot be evaluated (e.g. a covariance
        that is not positive semi-definite or contains NaN) is treated as
        contributing no density: its column stays at ``-inf``.
        """
        n_samples = X.shape[0]
        log_prob = np.full((n_samples, self.n_components), -np.inf)

        with np.errstate(divide='ignore'):
            # A zero weight legitimately maps to -inf.
            log_weights = np.log(self.weights_)

        for k in range(self.n_components):
            try:
                log_pdf = multivariate_normal.logpdf(
                    X, mean=self.means_[k], cov=self.covariances_[k], allow_singular=True
                )
            except (ValueError, np.linalg.LinAlgError):
                continue
            log_prob[:, k] = log_weights[k] + log_pdf

        return log_prob

    @staticmethod
    def _logsumexp_rows(log_prob):
        """Return ``log(sum_k exp(log_prob[i, k]))`` as an (n, 1) array."""
        row_max = log_prob.max(axis=1, keepdims=True)
        # Shift by the row maximum for stability; rows that are entirely
        # -inf are shifted by 0 so the subtraction does not produce NaN.
        shift = np.where(np.isfinite(row_max), row_max, 0.0)
        with np.errstate(divide='ignore'):
            return shift + np.log(np.exp(log_prob - shift).sum(axis=1, keepdims=True))

    def _e_step(self, X):
        """Return the (n_samples, n_components) responsibility matrix.

        Every row sums to 1. A sample with no finite density under any
        component gets uniform responsibilities.
        """
        log_prob = self._weighted_log_prob(X)
        log_norm = self._logsumexp_rows(log_prob)

        with np.errstate(invalid='ignore'):
            responsibilities = np.exp(log_prob - log_norm)

        undefined = ~np.isfinite(log_norm[:, 0])
        responsibilities[undefined] = 1.0 / self.n_components

        return responsibilities

    def _m_step(self, X, responsibilities):
        """Re-estimate weights, means and covariances from responsibilities."""
        n_samples, n_features = X.shape

        Nk = responsibilities.sum(axis=0)
        self.weights_ = Nk / n_samples

        # Guard only against an exactly empty component (Nk == 0), which
        # would otherwise turn its mean and covariance into NaN.
        safe_Nk = np.maximum(Nk, np.finfo(float).tiny)

        self.means_ = (responsibilities.T @ X) / safe_Nk[:, np.newaxis]

        for k in range(self.n_components):
            diff = X - self.means_[k]
            weighted_diff = responsibilities[:, k][:, np.newaxis] * diff
            self.covariances_[k] = (weighted_diff.T @ diff) / safe_Nk[k]

            # Small ridge keeps the covariance well-conditioned.
            self.covariances_[k] += np.eye(n_features) * 1e-6

    def _compute_log_likelihood(self, X):
        """Return the mean per-sample log-likelihood of ``X``.

        For each sample this is ``log(sum_k w_k * N(x | mu_k, Sigma_k))``,
        i.e. the log of the mixture density, averaged over samples.
        """
        log_norm = self._logsumexp_rows(self._weighted_log_prob(X))
        return float(np.mean(log_norm))

    def predict(self, X):
        """Return the most responsible component for every row of ``X``."""
        responsibilities = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

    def fit_predict(self, X):
        """Fit on ``X`` and return the training labels."""
        self.fit(X)
        return self.labels_

    def predict_proba(self, X):
        """Return the responsibility matrix for ``X`` (rows sum to 1)."""
        return self._e_step(X)