In [None]:
import pandas as pd

# Load the overview vectors table from parquet (presumably one embedding per
# movie overview, going by the file name — confirm against the file's producer).
vectors = pd.read_parquet('../data/vectors/movies_overview_vectors.parquet')

In [None]:
# Show the freshly loaded frame as this cell's output for a quick visual check.
vectors

In [None]:
# Load the transformed movie metadata; the next cell joins it onto `vectors` by `id`.
metadata = pd.read_parquet('../data/transformed/movies_metadata.parquet')

In [None]:
# Give the two columns of the vectors table the names used from here on.
# This assignment raises unless the frame has exactly two columns.
vectors.columns = ["id", "vector"]

# Join each vector with its metadata row through the shared `id` key
# (default join type, so ids present in only one table are dropped).
vectors = pd.merge(vectors, metadata, on="id")

# The metadata frame is not needed on its own any more; drop the name.
# NOTE: because of this `del` and the rename above, this cell cannot be
# re-run on a live kernel without first re-running the two loading cells.
del metadata

In [None]:
# Keep only the identifier, the title and the embedding column.
vectors = vectors.loc[:, ['id', 'title', 'vector']]

In [None]:
from sklearn.cluster import KMeans, DBSCAN

In [None]:
import numpy as np
# Drop every row that has no vector so the remaining ones can be stacked.
vectors = vectors[vectors['vector'].notna()]

In [None]:
def kmeans_clustering(data, n_clusters=4, random_state=42):
    """
    Fit a KMeans model on ``data`` and return the cluster of every sample.

    :param data: 2-D array with one row per sample. The notebook documents
        300 features per row, but nothing in this function depends on that width.
    :param n_clusters: Number of clusters KMeans is asked to form.
    :param random_state: Seed forwarded to KMeans for reproducibility.
    :return: Tuple ``(labels, model)``: the per-sample cluster labels and the
        fitted KMeans estimator.
    """
    model = KMeans(random_state=random_state, n_clusters=n_clusters)
    # fit_predict fits the model and returns the label of each input row.
    assignments = model.fit_predict(data)
    return assignments, model

In [None]:
# Stack the per-row vectors into one 2-D matrix and cluster it with the
# function's defaults; both the labels and the fitted model are kept because
# later cells display the model and plot/report the labels.
# NOTE(review): assumes every entry of `vectors['vector']` is a 1-D array of
# the same length — otherwise np.vstack raises. Confirm against the data.
labels, kmeans = kmeans_clustering(data=np.vstack(vectors['vector']))

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt


def plot_kmeans_clusters(data, labels, random_state=42):
    """
    Visualize cluster assignments on a 2-D PCA projection of the data.

    :param data: 2-D array with one row per sample (the notebook passes the
        stacked overview vectors).
    :param labels: Cluster label of each row of ``data``; used to colour the points.
    :param random_state: Seed forwarded to PCA. Depending on the input size and
        the scikit-learn version, PCA may pick a randomized SVD solver; seeding
        it keeps the projection (and therefore the figure) identical between runs.
    :return: None. The figure is rendered with ``plt.show()``.
    """
    # Reduce to 2D for visualization; seeded so repeated runs give the same picture.
    pca = PCA(n_components=2, random_state=random_state)
    reduced_data = pca.fit_transform(data)

    # Explicit figure/axes objects instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(
        reduced_data[:, 0],
        reduced_data[:, 1],
        c=labels,
        cmap='viridis',
        alpha=0.7,
    )
    fig.colorbar(scatter, ax=ax, label="Cluster")
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    ax.set_title("KMeans Clustering Visualization (PCA Reduced)")
    plt.show()

In [None]:
# Display the fitted estimator returned by kmeans_clustering.
kmeans

In [None]:
# Re-stack the vectors and plot them coloured by the labels computed earlier.
# The matrix matches the one that was clustered only as long as `vectors`
# has not been modified since the clustering cell ran.
plot_kmeans_clusters(np.vstack(vectors['vector']), labels=labels)

In [None]:
def report_cluster(cluster_labels=None):
    """
    Print the number of points assigned to each cluster.

    :param cluster_labels: Array-like of cluster assignments. When omitted,
        the notebook-level ``labels`` variable is used, which is what the
        original zero-argument call relied on.
    :return: None. The report is written to standard output.
    """
    if cluster_labels is None:
        # Backward-compatible fallback to the notebook's global clustering result.
        cluster_labels = labels

    # One pass yields both the distinct cluster ids (sorted) and their sizes,
    # instead of rescanning the whole label array once per cluster.
    unique_clusters, counts = np.unique(cluster_labels, return_counts=True)

    print("\nKMeans Cluster Report:")
    for cluster, count in zip(unique_clusters, counts):
        print(f"Cluster {cluster}: {count} points")

In [None]:
# Print the per-cluster point counts for the notebook-level `labels`.
report_cluster()