In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from scipy.spatial import distance


In [47]:
dataset = pd.read_csv('merged_title_data_copy.csv', low_memory=False).head(4000)

dataset['runtimeMinutes'] = pd.to_numeric(dataset['runtimeMinutes'], errors='coerce')
dataset_cleaned = dataset.dropna(subset=['runtimeMinutes', 'averageRating', 'numVotes'])
dataset_cleaned = dataset_cleaned.drop_duplicates(subset=['tconst', 'primaryTitle']).dropna(subset=['tconst', 'primaryTitle'])

In [48]:
# One-hot encoding
title_encoded = pd.get_dummies(dataset_cleaned['titleType'], prefix='type')

# Fit and apply scalers on relevant columns
sc_runtime = StandardScaler().fit(dataset_cleaned['runtimeMinutes'].values.reshape(-1, 1))
sc_avgRating = StandardScaler().fit(dataset_cleaned['averageRating'].values.reshape(-1, 1))
sc_numVotes = StandardScaler().fit(dataset_cleaned['numVotes'].values.reshape(-1, 1))

dataset_cleaned['runtimeMinutes_normalized'] = sc_runtime.transform(dataset_cleaned['runtimeMinutes'].values.reshape(-1, 1))

# Split genres and apply MultiLabelBinarizer
X_genres = dataset_cleaned['genres'].str.split(',')
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(X_genres), columns=mlb.classes_, index=dataset_cleaned.index)

# Drop the original 'genres' column and concatenate scaled features
dataset_cleaned = dataset_cleaned.drop(['genres'], axis=1)
X_encoded = pd.concat([dataset_cleaned, title_encoded, genre_encoded], axis=1)
y = X_encoded['primaryTitle']


In [49]:
columns_to_drop = ['tconst', 'primaryTitle', 'titleType', 'startYear', 'averageRating', 'runtimeMinutes', 'numVotes']
columns_to_drop = [col for col in columns_to_drop if col in X_encoded.columns]
X_encoded_final = X_encoded.drop(columns=columns_to_drop, axis=1)

In [50]:
X = X_encoded_final
y = X_encoded['primaryTitle']
X_features = X_encoded_final.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [51]:
def create_input_vector(preferred_genres, preferred_runtime, isAdult, title_types):
    input_genres_encoded = pd.DataFrame(mlb.transform([preferred_genres]), columns=mlb.classes_) * 8
    runtime_normalized = sc_runtime.transform([[preferred_runtime]])
    input_vector = np.concatenate([input_genres_encoded.values.flatten(), runtime_normalized.flatten(), np.array([[isAdult]]).flatten(), title_types])
    print("Input vector shape:", input_vector.shape)
    return input_vector

In [52]:
title_types = [
    1 if title_type == 'movie' else 0 for title_type in 
    ['movie', 'tvMiniSeries', 'tvMovie', 'tvSeries', 'tvSpecial']
]


from scipy.spatial import distance

def get_closest_movies(input_vector, movie_vectors, movie_titles, top_n=200):
    distances = []

    for index, movie_vector in enumerate(movie_vectors):
        dist = distance.euclidean(input_vector, movie_vector)
        distances.append((dist, movie_titles.iloc[index]))

    distances = sorted(distances, key=lambda x: x[0])

    # Get the top N movies
    top_movies = [title for _, title in distances[:top_n]]
    return top_movies

In [53]:
def rank_recommendations(recommendations, dataset_cleaned):
    valid_recommendations = [title for title in recommendations if title in dataset_cleaned['primaryTitle'].values]
    
    if not valid_recommendations:
        return []

    # Filter dataset_cleaned to only keep the valid recommendations
    recommended_movies_df = dataset_cleaned[dataset_cleaned['primaryTitle'].isin(valid_recommendations)]

    recommended_movies_df = recommended_movies_df.drop_duplicates(subset=['tconst', 'primaryTitle'])
    ranked_movies = recommended_movies_df.sort_values(by=['averageRating', 'numVotes'], ascending=[False, False])

    # Select the top 20 recommendations
    top_movies = ranked_movies.head(20)['primaryTitle'].tolist()

    return top_movies



In [54]:
preferred_genres = ['Action', 'Adventure']
preferred_runtime = 136
isAdult = 0 
title_types = [1, 0, 0, 0, 0] 

# Create the input test vector
input_vector = create_input_vector(preferred_genres, preferred_runtime, isAdult, title_types)

# Get the top 200 movie recommendations
recommended_movies = get_closest_movies(input_vector, X_features, X_encoded['primaryTitle'], top_n=500)
ranked_movies = rank_recommendations(recommended_movies, dataset_cleaned)

print("Top ranked movie recommendations:", ranked_movies)



Input vector shape: (32,)
Top ranked movie recommendations: ['The Shawshank Redemption', 'The Godfather', 'The Dark Knight', 'The Lord of the Rings: The Return of the King', "Schindler's List", 'The Godfather Part II', 'Pulp Fiction', 'The Lord of the Rings: The Fellowship of the Ring', '12th Fail', 'Inception', 'Fight Club', 'Forrest Gump', 'The Lord of the Rings: The Two Towers', 'The Good, the Bad and the Ugly', 'Interstellar', 'The Matrix', 'Goodfellas', "One Flew Over the Cuckoo's Nest", 'Jai Bhim', 'Soorarai Pottru']
