In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
# Load the cleaned dataset; `data` keeps every column (including `title`) for lookups later.
data = pd.read_csv('cleaned_data.csv')

# Feature frame: drop the identifier columns (`errors='ignore'` tolerates their absence).
# NOTE(review): the `Unnamed: 0` column shown by df.head() looks like a saved index
# that is being used as a feature — confirm whether it should be dropped as well.
df = data.drop(columns=["title", "imdb_id"], errors='ignore')

# Coerce to numeric FIRST: `errors='coerce'` turns unparsable cells into NaN, so
# imputation has to happen after this step, not before it.
df = df.apply(pd.to_numeric, errors='coerce')

# `fillna` returns a new frame; the result must be assigned or the call is a no-op.
df = df.fillna(df.mean())
# A column that is entirely NaN has a NaN mean and would survive the line above.
df = df.fillna(0.0)

# Min-max scale every column to [0, 1]. A constant column has a zero range, which
# would produce 0/0 = NaN; dividing by 1 instead maps such a column to all zeros.
col_min = df.min()
col_range = (df.max() - col_min).replace(0, 1)
df = (df - col_min) / col_range

In [39]:
# Preview the first rows of the min-max scaled feature frame.
df.head()

Unnamed: 0,drama,comedy,thriller,action,adventure,romance,crime,science fiction,horror,fantasy,...,way.1,weapon,welcome,woman.1,won,world.1,worlds,wrong.1,year.1,years.1
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.607659,0.0,0.0,0.460148,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Rows and columns of the feature frame.
n = df.shape[0]
m = df.shape[1]

# Number of centroids, and a fixed seed so the draw below is repeatable.
k = 7
np.random.seed(42)

# One uniformly random value in [0, 1) per (centroid, feature) pair.
centroid_shape = (k, m)
clusters = np.random.rand(*centroid_shape)
print("Size of the cluster:- ", clusters.shape)


Size of the cluster:-  (7, 800)


In [41]:
# n = number of rows, m = number of feature columns in the scaled frame.
n = df.shape[0]
m = df.shape[1]
# Number of clusters to fit.
k = 7
# Re-seed the global NumPy RNG so the initializers below draw a repeatable sequence.
np.random.seed(42)

# Method 1: K-means++ like initialization
def initialize_clusters(df, k):
    """Pick `k` starting centroids from the rows of `df`, k-means++ style.

    The first centroid is a uniformly random row. Each later centroid is a row
    drawn with probability proportional to its squared Euclidean distance to the
    nearest centroid chosen so far, which favours well-spread centroids.

    Randomness comes from the global NumPy RNG (`np.random.randint` followed by
    `np.random.choice`), so call `np.random.seed` beforehand for repeatable output.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame, one row per sample.
    k : int
        Number of centroids to pick (at least 1).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, df.shape[1])`` whose rows are copies of rows of `df`.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or `df` has no rows.
    """
    points = df.to_numpy(dtype=float)
    n_samples, n_features = points.shape
    if k < 1:
        raise ValueError("k must be at least 1")
    if n_samples == 0:
        raise ValueError("cannot initialize clusters from an empty DataFrame")

    clusters = np.zeros((k, n_features))

    # Choose the first centroid randomly
    first_centroid_idx = np.random.randint(0, n_samples)
    clusters[0] = points[first_centroid_idx]

    # Squared distance from every row to its nearest chosen centroid. Keeping a
    # running minimum means each new centroid costs one vectorised pass instead
    # of recomputing the distance to every earlier centroid for every row.
    closest_sq = np.sum((points - clusters[0]) ** 2, axis=1)

    # Choose remaining centroids
    for i in range(1, k):
        total = closest_sq.sum()
        if np.isfinite(total) and total > 0:
            probabilities = closest_sq / total
        else:
            # Every row coincides with a chosen centroid (or distances are not
            # finite): 0/0 would give NaN weights and np.random.choice would
            # raise, so fall back to a uniform draw.
            probabilities = np.full(n_samples, 1.0 / n_samples)

        next_centroid_idx = np.random.choice(n_samples, p=probabilities)
        clusters[i] = points[next_centroid_idx]

        new_sq = np.sum((points - clusters[i]) ** 2, axis=1)
        closest_sq = np.minimum(closest_sq, new_sq)

    return clusters

# Seed the centroids with the k-means++-style initializer defined above.
# The call draws from the global NumPy RNG, so it advances the random state.
clusters = initialize_clusters(df, k)
print("Size of the cluster:", clusters.shape)

# Alternative Method 2: Pick points far from each other
def initialize_clusters_distant(df, k):
    """Pick `k` starting centroids by greedy farthest-point selection.

    The first centroid is a uniformly random row (global NumPy RNG). Each later
    centroid is the not-yet-selected row whose distance to its nearest chosen
    centroid is largest; ties go to the row with the lowest position.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame, one row per sample.
    k : int
        Number of centroids to pick; must satisfy ``1 <= k <= len(df)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, df.shape[1])`` holding `k` distinct rows of `df`.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or larger than the number of rows. Without this
        check, running out of rows would silently reuse the last row.
    """
    points = df.to_numpy(dtype=float)
    n_samples, n_features = points.shape
    if k < 1:
        raise ValueError("k must be at least 1")
    if k > n_samples:
        raise ValueError(
            f"cannot pick {k} distinct centroids from {n_samples} rows"
        )

    clusters = np.zeros((k, n_features))

    # Pick a random point as the first centroid
    first_centroid_idx = np.random.randint(0, n_samples)
    clusters[0] = points[first_centroid_idx]

    # Rows already used as centroids are masked out of later searches.
    available = np.ones(n_samples, dtype=bool)
    available[first_centroid_idx] = False

    # Distance from every row to its nearest chosen centroid, kept as a running
    # minimum so each new centroid needs only one vectorised pass.
    nearest = np.linalg.norm(points - clusters[0], axis=1)

    for i in range(1, k):
        # argmax returns the first maximum, matching a left-to-right scan that
        # only replaces the best candidate on a strictly larger distance.
        farthest_idx = int(np.argmax(np.where(available, nearest, -np.inf)))

        clusters[i] = points[farthest_idx]
        available[farthest_idx] = False
        nearest = np.minimum(nearest, np.linalg.norm(points - clusters[i], axis=1))

    return clusters

# This call is active (not commented out): it overwrites the centroids produced by
# initialize_clusters above, so the distant-point initialization is the one the
# rest of the notebook uses. Comment these two lines out to keep the k-means++ result.
clusters = initialize_clusters_distant(df, k)
print("Size of the cluster:", clusters.shape)

Size of the cluster: (7, 800)
Size of the cluster: (7, 800)


In [42]:
# Row position -> cluster id; -1 marks a row that has not been assigned yet.
index_to_cluster = dict.fromkeys(range(n), -1)

# Cluster id -> list of member row positions; every cluster starts out empty.
cluster_to_index = {cluster_index: [] for cluster_index in range(k)}

In [48]:
# Upper bound on assignment/update passes; the training loop stops earlier once
# update_cluster_centers() reports that no centroid changed.
epochs = 30

In [44]:
def find_distance_of_point_from_cluster(point_index, cluster_index):
    """Euclidean distance between one row of `df` and one centroid.

    Reads the module-level `df` (row selected by position) and `clusters`.
    Returns ``float('inf')`` when either vector contains a NaN, so such a pair
    can never be the closest match.
    """
    row = df.iloc[point_index].values
    centroid = clusters[cluster_index]
    has_nan = np.isnan(row).any() or np.isnan(centroid).any()
    return float('inf') if has_nan else np.linalg.norm(row - centroid)

In [45]:
def find_nearest_cluster_from_point(point_index):
    """Return the id of the centroid closest to row `point_index`.

    Scans cluster ids ``0 .. k-1`` in order and keeps the first one with the
    strictly smallest distance. Raises ValueError when no centroid has a finite
    distance (for example when every distance came back as infinity).
    """
    best_cluster, best_distance = None, float('inf')
    for candidate in range(k):
        candidate_distance = find_distance_of_point_from_cluster(point_index, candidate)
        if not candidate_distance < best_distance:
            continue
        best_cluster, best_distance = candidate, candidate_distance

    if best_cluster is not None:
        return best_cluster
    raise ValueError(f"No nearest cluster found for point {point_index}")


In [46]:
def update_cluster_centers():
    """Move every non-empty cluster's centroid to the mean of its members.

    Reads the module-level `df`, `k` and `cluster_to_index`, and rebinds the
    module-level `clusters` to a fresh array. Clusters without members keep
    their previous centroid.

    Returns True if at least one centroid differs from its previous value
    (per ``np.allclose``), False otherwise.
    """
    global clusters
    refreshed = np.copy(clusters)
    moved = False

    for cluster_index in range(k):
        members = cluster_to_index[cluster_index]
        if members:
            centroid = np.mean(df.iloc[members], axis=0)
            # Compare against the centroid from before this pass.
            moved = moved or not np.allclose(clusters[cluster_index], centroid)
            refreshed[cluster_index] = centroid

    clusters = refreshed
    return moved


In [49]:
for epoch in range(1, epochs + 1):
    print(f"\nEpoch {epoch}/{epochs} ---------------------------")

    # Membership lists are rebuilt from scratch on every pass.
    cluster_to_index = {cluster_id: [] for cluster_id in range(k)}

    # Assignment step: attach each row to its closest centroid.
    for point_index in range(n):
        nearest_cluster = find_nearest_cluster_from_point(point_index)
        index_to_cluster[point_index] = nearest_cluster
        cluster_to_index[nearest_cluster].append(point_index)

    # Report the cluster sizes for this pass.
    for cluster_id, members in cluster_to_index.items():
        print(f"Cluster {cluster_id}: {len(members)} points")

    # Update step: stop as soon as no centroid changes.
    updated = update_cluster_centers()
    if updated:
        print("Cluster centers updated.")
        continue

    print("Converged. No changes in cluster centers.")
    break



Epoch 1/30 ---------------------------
Cluster 0: 1584 points
Cluster 1: 736 points
Cluster 2: 333 points
Cluster 3: 802 points
Cluster 4: 855 points
Cluster 5: 215 points
Cluster 6: 475 points
Cluster centers updated.

Epoch 2/30 ---------------------------
Cluster 0: 1582 points
Cluster 1: 737 points
Cluster 2: 333 points
Cluster 3: 804 points
Cluster 4: 855 points
Cluster 5: 215 points
Cluster 6: 474 points
Cluster centers updated.

Epoch 3/30 ---------------------------
Cluster 0: 1581 points
Cluster 1: 738 points
Cluster 2: 333 points
Cluster 3: 804 points
Cluster 4: 855 points
Cluster 5: 215 points
Cluster 6: 474 points
Cluster centers updated.

Epoch 4/30 ---------------------------
Cluster 0: 1581 points
Cluster 1: 738 points
Cluster 2: 333 points
Cluster 3: 804 points
Cluster 4: 855 points
Cluster 5: 215 points
Cluster 6: 474 points
Converged. No changes in cluster centers.


In [50]:
def predict_movie_from_title(title, data, df, index_to_cluster, cluster_to_index, top_n=10, centroids=None):
    """Recommend the titles nearest to `title` among the members of its cluster.

    Parameters
    ----------
    title : str
        Title to look up; matched exactly against ``data['title']``. The first
        matching row is used.
    data : pandas.DataFrame
        Frame holding the ``'title'`` column. Its rows are addressed by position
        and are assumed to line up with the rows of `df`.
    df : pandas.DataFrame
        Numeric feature frame used for the distance computations.
    index_to_cluster : dict
        Row position -> cluster id.
    cluster_to_index : dict
        Cluster id -> list of member row positions.
    top_n : int, default 10
        Maximum number of recommendations; fewer are returned when fewer
        candidates exist.
    centroids : array-like, optional
        Cluster centres indexed by cluster id. Only consulted when the movie's
        own cluster holds no other member; defaults to the module-level
        `clusters`.

    Returns
    -------
    list of str
        Recommended titles ordered from nearest to farthest (Euclidean distance,
        ties keep their order within the cluster list). A one-element list with
        an explanatory message is returned when the title is not found; an empty
        list when no candidate exists.
    """
    not_found = ["Movie title not found in the dataset. Please check the spelling."]

    # Resolve the title to a row POSITION: df.iloc and the cluster maps are
    # positional, so an index label would be wrong for a non-default index.
    try:
        matches = np.flatnonzero((data['title'] == title).to_numpy())
    except KeyError:
        return not_found
    if matches.size == 0:
        return not_found
    movie_idx = int(matches[0])

    try:
        movie_cluster = index_to_cluster[movie_idx]
    except KeyError:
        # Row was never assigned; fall back to the module-level nearest-centroid search.
        movie_cluster = find_nearest_cluster_from_point(movie_idx)

    movie_vector = df.iloc[movie_idx].to_numpy(dtype=float)
    candidates = [idx for idx in cluster_to_index[movie_cluster] if idx != movie_idx]

    if not candidates:
        # The movie is alone in its cluster: borrow the members of the closest
        # other cluster, skipping clusters that are empty themselves.
        centres = clusters if centroids is None else centroids
        nearest_other = None
        nearest_distance = float('inf')
        for cluster_idx, members in cluster_to_index.items():
            if cluster_idx == movie_cluster or not members:
                continue
            distance = np.linalg.norm(movie_vector - np.asarray(centres[cluster_idx], dtype=float))
            if distance < nearest_distance:
                nearest_distance = distance
                nearest_other = cluster_idx

        if nearest_other is None:
            return []
        candidates = [idx for idx in cluster_to_index[nearest_other] if idx != movie_idx]

    # Honour the caller's limit in every path, including the fallback above.
    limit = max(int(top_n), 0)
    if not candidates or limit == 0:
        return []

    # One vectorised distance computation instead of one df.iloc lookup per candidate.
    candidate_matrix = df.iloc[candidates].to_numpy(dtype=float)
    distances = np.linalg.norm(candidate_matrix - movie_vector, axis=1)
    # Stable sort keeps the cluster-list order among equally distant candidates.
    order = np.argsort(distances, kind="stable")[:limit]
    recommended_indices = [candidates[pos] for pos in order]

    # Get the titles of the recommended movies
    return data.iloc[recommended_indices]['title'].tolist()


def predict_movie_from_title_simplified(title, top_n=10):
    """Convenience wrapper around predict_movie_from_title.

    Supplies the module-level `data`, `df`, `index_to_cluster` and
    `cluster_to_index`, so the caller only passes the title and, optionally,
    the number of recommendations.
    """
    return predict_movie_from_title(
        title,
        data,
        df,
        index_to_cluster,
        cluster_to_index,
        top_n,
    )

In [51]:
# Example query: print the recommendations (default top_n=10) for one title.
print(predict_movie_from_title_simplified("Southpaw"))

['G.I. Jane', 'Absolute Power', 'Over the Top', 'Only the Brave', 'Free State of Jones', 'Enemy of the State', 'Seeking Justice', 'The Fan', 'Superman: Red Son', 'Heart of Stone']


In [53]:
import pickle
import pandas as pd
import numpy as np


# Bundle the fitted centroids, both assignment maps and the frames they refer to,
# so recommendations can be served without re-running the clustering.
model_package = {
    'clusters': clusters,
    'k': k,
    'index_to_cluster': index_to_cluster,
    'cluster_to_index': cluster_to_index,
    'data': data,
    'df': df,
    'data_columns': df.columns.tolist()
}

# Single source of truth for the output path, so the confirmation message below
# always names the file that was actually written.
MODEL_PATH = 'movie_recommendation_system_clustering.pkl'

# NOTE: pickle files can execute arbitrary code when loaded; only unpickle this
# file from a trusted location.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model_package, f)

print(f"Complete recommendation system saved to {MODEL_PATH}")

Complete recommendation system saved to movie_recommendation_system.pkl
