In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

# Load data
movie_data = pd.read_csv('C:/Users/user/Desktop/New folder (3)/ml-latest-small/movies.csv')
ratings_data = pd.read_csv('C:/Users/user/Desktop/New folder (3)/ml-latest-small/ratings.csv')
tags_data = pd.read_csv('C:/Users/user/Desktop/New folder (3)/ml-latest-small/tags.csv')

# Merge the Data
merged_data = pd.merge(movie_data, ratings_data, on='movieId', how='left')
merged_data = pd.merge(merged_data, tags_data, on=['movieId', 'userId'], how='left', suffixes=('_rating', '_tag'))

# Preprocessing
merged_data['tag'].fillna("", inplace=True)
merged_data['rating'].fillna(merged_data['rating'].mean(), inplace=True)
merged_data.drop(columns=['timestamp_rating', 'timestamp_tag'], inplace=True)

# Feature Engineering
# Average rating per movie
avg_rating_per_movie = merged_data.groupby('movieId')['rating'].mean().rename('avg_rating').reset_index()
merged_data = pd.merge(merged_data, avg_rating_per_movie, on='movieId', how='left')

# Number of ratings per movie
num_ratings_per_movie = merged_data.groupby('movieId')['rating'].count().rename('num_ratings').reset_index()
merged_data = pd.merge(merged_data, num_ratings_per_movie, on='movieId', how='left')

# Most common tag per movie
most_common_tag = tags_data['tag'].value_counts().idxmax()
merged_data['tag'] = merged_data['tag'].apply(lambda x: most_common_tag if x == "" else x)

# Data Encoding
# For genres
vect_genres = TfidfVectorizer(stop_words='english')
X_genres = vect_genres.fit_transform(merged_data['genres'])

# For tags
vect_tags = TfidfVectorizer(stop_words='english')
X_tags = vect_tags.fit_transform(merged_data['tag'])
# Clustering
#Choosing the number of clusters for KMeans
kmeans_genres = KMeans(n_clusters=10, random_state=42)
merged_data['genres_cluster'] = kmeans_genres.fit_predict(X_genres)

kmeans_tags = KMeans(n_clusters=5, random_state=42)
merged_data['tags_cluster'] = kmeans_tags.fit_predict(X_tags)

# Evaluate
# Silhouette score
genres_silhouette_score = silhouette_score(X_genres, merged_data['genres_cluster'])
tags_silhouette_score = silhouette_score(X_tags, merged_data['tags_cluster'])
print(f'Genres Silhouette Score: {genres_silhouette_score}')
print(f'Tags Silhouette Score: {tags_silhouette_score}')

# Interpret
order_centroids_genres = kmeans_genres.cluster_centers_.argsort()[:, ::-1]
terms_genres = vect_genres.get_feature_names_out()

order_centroids_tags = kmeans_tags.cluster_centers_.argsort()[:, ::-1]
terms_tags = vect_tags.get_feature_names_out()

for i in range(5):
    print(f"Genres Cluster {i}:")
    for ind in order_centroids_genres[i, :min(10, len(terms_genres))]:
        print(f' {terms_genres[ind]}')
    print()

for i in range(5):
    print(f"Tags Cluster {i}:")
    for ind in order_centroids_tags[i, :10]:
        print(f' {terms_tags[ind]}')
    print()




In [16]:
# Create a list of all genres
all_genres = [genre for sublist in list(movie_data['genres'].apply(lambda x: x.split('|')).values) for genre in sublist]

# Get unique genres and their counts
unique_genres, counts_genres = np.unique(all_genres, return_counts=True)

# Print out the unique genres and their counts
for genre, count in zip(unique_genres, counts_genres):
    print(f"{genre}: {count}")


(no genres listed): 34
Action: 1828
Adventure: 1263
Animation: 611
Children: 664
Comedy: 3756
Crime: 1199
Documentary: 440
Drama: 4361
Fantasy: 779
Film-Noir: 87
Horror: 978
IMAX: 158
Musical: 334
Mystery: 573
Romance: 1596
Sci-Fi: 980
Thriller: 1894
War: 382
Western: 167


In [11]:
# Randomly pick a movie
import random
# merged_data['cluster'] = KMeans.labels_  # Incorrect
merged_data['cluster'] = kmeans_genres.labels_  # Correct
random_movie = random.choice(merged_data['title'].unique())
movie_cluster = merged_data.loc[merged_data['title'] == random_movie, 'cluster'].values[0]

print(f"Because you watched {random_movie}, you might like:")

# Recommend other movies from the same cluster
recommendations = merged_data[merged_data['cluster'] == movie_cluster]['title'].unique()

# Print the recommendations
for rec in recommendations[:5]:
    print(rec)


Because you watched Midnight Meat Train, The (2008), you might like:
Dracula: Dead and Loving It (1995)
From Dusk Till Dawn (1996)
Mary Reilly (1996)
Vampire in Brooklyn (1995)
Addiction, The (1995)


In [12]:
from sklearn.metrics.cluster import calinski_harabasz_score, davies_bouldin_score

# For genres
genres_CH_score = calinski_harabasz_score(X_genres.toarray(), merged_data['genres_cluster'])
genres_DB_score = davies_bouldin_score(X_genres.toarray(), merged_data['genres_cluster'])

# For tags
tags_CH_score = calinski_harabasz_score(X_tags.toarray(), merged_data['tags_cluster'])
tags_DB_score = davies_bouldin_score(X_tags.toarray(), merged_data['tags_cluster'])

print(f'Genres Calinski-Harabasz Score: {genres_CH_score}')
print(f'Genres Davies-Bouldin Score: {genres_DB_score}')
print(f'Tags Calinski-Harabasz Score: {tags_CH_score}')
print(f'Tags Davies-Bouldin Score: {tags_DB_score}')


Genres Calinski-Harabasz Score: 14647.495660329982
Genres Davies-Bouldin Score: 1.4494124000797854
Tags Calinski-Harabasz Score: 26167.834246084905
Tags Davies-Bouldin Score: 1.0796048697849698
