In [154]:
# Data Collection and Preprocessing

import pandas as pd
import difflib

# Define the file paths
movies_file = 'ml-1m/movies.dat'
ratings_file = 'ml-1m/ratings.dat'
users_file = 'ml-1m/users.dat'

# Load the datasets
movies = pd.read_csv(movies_file, delimiter='::', engine='python', names=['MovieID', 'Title', 'Genres'], encoding='latin1')
ratings = pd.read_csv(ratings_file, delimiter='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='latin1')
users = pd.read_csv(users_file, delimiter='::', engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='latin1')


name_basics = pd.read_csv('name.basics.tsv', sep='\t', low_memory=False)
title_basics = pd.read_csv('title.basics.tsv', sep='\t', low_memory=False)

# Display the first few rows of each dataset
print(movies.head())
print(ratings.head())
print(users.head())
print(name_basics.head())
print(title_basics.head())

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
      nconst      primaryName birthYear deathYear  \

In [156]:
# Feature Engineering

# Extract the year from the MovieLens title and standardize titles
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)', expand=False)
movies['Title'] = movies['Title'].str.replace(r'\(\d{4}\)', '').str.strip().str.lower()

# Standardize IMDb titles and remove NaNs
title_basics['primaryTitle'] = title_basics['primaryTitle'].str.lower().fillna('')
title_basics['originalTitle'] = title_basics['originalTitle'].str.lower().fillna('')

merged_data_primary = pd.merge(movies, title_basics, left_on='Title', right_on='primaryTitle', how='inner')

if merged_data_primary.empty:
    merged_data_original = pd.merge(movies, title_basics, left_on='Title', right_on='originalTitle', how='inner')
else:
    merged_data_original = pd.DataFrame()

merged_data = pd.concat([merged_data_primary, merged_data_original]).drop_duplicates()

# Merge with ratings
merged_ratings = pd.merge(ratings, merged_data, on='MovieID', how='inner')

# Remove duplicates
aggregated_ratings = merged_ratings.groupby(['UserID', 'MovieID']).agg({'Rating': 'mean'}).reset_index()

aggregated_ratings.to_csv('aggregated_ratings.csv', index=False)

print(aggregated_ratings.head())
print(f"Number of unique ratings: {len(aggregated_ratings)}")

   UserID  MovieID  Rating
0       1      260     4.0
1       1      588     4.0
2       1      595     5.0
3       1     1022     5.0
4       1     1028     5.0
Number of unique ratings: 223646


In [161]:
# Feature Engineering

# Handle NaNs
merged_data['Genres'] = merged_data['Genres'].fillna('').astype(str)

#Genres
merged_data['Genres'] = merged_data['Genres'].str.split('|')
mlb = pd.get_dummies(merged_data['Genres'].apply(pd.Series).stack()).groupby(level=0).sum()
merged_data_with_genres = pd.concat([merged_data, mlb], axis=1)

# Generate movie features
movie_features = merged_data_with_genres.drop(['Title', 'Year', 'Genres', 'primaryTitle', 'originalTitle', 'tconst', 'titleType', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres'], axis=1)

numeric_movie_features = movie_features.select_dtypes(include=[np.number])

numeric_movie_features = numeric_movie_features[numeric_movie_features['MovieID'].isin(aggregated_ratings['MovieID'])]

# Aggregate user features
user_features = aggregated_ratings.groupby('UserID').mean().reset_index()

# Ensure only numeric columns are selected for scaling
numeric_user_features = user_features.select_dtypes(include=[np.number])

# Normalize the features
scaler_movie = StandardScaler()
movie_features_scaled = scaler_movie.fit_transform(numeric_movie_features.drop('MovieID', axis=1))

scaler_user = StandardScaler()
user_features_scaled = scaler_user.fit_transform(numeric_user_features.drop('UserID', axis=1))

# Scale the ratings
scaler_rating = MinMaxScaler((-1, 1))
aggregated_ratings['Rating'] = scaler_rating.fit_transform(aggregated_ratings['Rating'].values.reshape(-1, 1))

print("Movie features scaled shape:", movie_features_scaled.shape)

Movie features scaled shape: (519, 1)


In [162]:
# Model Development

import tensorflow as tf
from tensorflow.keras.layers import Layer

class L2Normalization(Layer):
    def call(self, inputs):
        return tf.linalg.l2_normalize(inputs, axis=1)

# Create user and movie models with the custom L2 normalization layer
num_outputs = 32

# Movie model
item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(1,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs),
    L2Normalization()
])

# Create the movie input and point to the base network
input_item = tf.keras.layers.Input(shape=(1,))
vm = item_NN(input_item)

input_item2 = tf.keras.layers.Input(shape=(1,))
vm2 = item_NN(input_item2)

output = tf.keras.layers.Dot(axes=1)([vm, vm2])

model = tf.keras.Model([input_item, input_item2], output)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='mean_squared_error')

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [163]:
# Find common MovieIDs
common_movie_ids = set(numeric_movie_features['MovieID']) & set(aggregated_ratings['MovieID'])
print(f"Number of common MovieIDs: {len(common_movie_ids)}")

# Filter both dataframes to include only these common MovieIDs
movie_features_filtered = numeric_movie_features[numeric_movie_features['MovieID'].isin(common_movie_ids)]
aggregated_ratings_filtered = aggregated_ratings[aggregated_ratings['MovieID'].isin(common_movie_ids)]

print(f"Shape of movie_features_filtered before handling duplicates: {movie_features_filtered.shape}")
print(f"Shape of aggregated_ratings_filtered before handling duplicates: {aggregated_ratings_filtered.shape}")

# Check for duplicates
print(f"Duplicate MovieIDs in movie_features_filtered: {movie_features_filtered['MovieID'].duplicated().sum()}")
print(f"Duplicate MovieIDs in aggregated_ratings_filtered: {aggregated_ratings_filtered['MovieID'].duplicated().sum()}")

movie_features_filtered = movie_features_filtered.groupby('MovieID').first().reset_index()

aggregated_ratings_filtered = aggregated_ratings_filtered.groupby('MovieID')['Rating'].mean().reset_index()

print(f"Shape of movie_features_filtered after handling duplicates: {movie_features_filtered.shape}")
print(f"Shape of aggregated_ratings_filtered after handling duplicates: {aggregated_ratings_filtered.shape}")

# Sort dataframes by MovieID
movie_features_filtered = movie_features_filtered.sort_values('MovieID').reset_index(drop=True)
aggregated_ratings_filtered = aggregated_ratings_filtered.sort_values('MovieID').reset_index(drop=True)

print(f"MovieIDs aligned: {(movie_features_filtered['MovieID'] == aggregated_ratings_filtered['MovieID']).all()}")

# Now create X_movie and y
X_movie = movie_features_filtered.drop('MovieID', axis=1).values
y = aggregated_ratings_filtered['Rating'].values

print(f"Shape of X_movie: {X_movie.shape}")
print(f"Shape of y: {y.shape}")

try:
    assert X_movie.shape[0] == len(y), "Inconsistent number of samples between X_movie and y"
    print("Assertion passed: X_movie and y have the same number of samples.")
except AssertionError as e:
    print(f"Assertion failed: {e}")
    print(f"X_movie shape: {X_movie.shape}, y length: {len(y)}")

Number of common MovieIDs: 330
Shape of movie_features_filtered before handling duplicates: (519, 2)
Shape of aggregated_ratings_filtered before handling duplicates: (223646, 3)
Duplicate MovieIDs in movie_features_filtered: 189
Duplicate MovieIDs in aggregated_ratings_filtered: 223316
Shape of movie_features_filtered after handling duplicates: (330, 2)
Shape of aggregated_ratings_filtered after handling duplicates: (330, 2)
MovieIDs aligned: True
Shape of X_movie: (330, 1)
Shape of y: (330,)
Assertion passed: X_movie and y have the same number of samples.


In [164]:
# Split into training and testing sets
X_movie_train, X_movie_test, y_train, y_test = train_test_split(X_movie, y, test_size=0.2, random_state=42)

# Fit the model
model.fit([X_movie_train, X_movie_train], y_train, epochs=2, batch_size=64, validation_data=([X_movie_test, X_movie_test], y_test))

Epoch 1/2
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 259ms/step - loss: 0.8635 - val_loss: 0.9274
Epoch 2/2
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.9064 - val_loss: 0.9274


<keras.src.callbacks.history.History at 0x7912d6965bd0>

In [165]:
from sklearn.metrics import mean_squared_error

# Evaluate the model
y_pred_train = model.predict([X_movie_train, X_movie_train])
y_pred_test = model.predict([X_movie_test, X_movie_test])

# Convert scaled ratings back to original scale
y_train_original = scaler_rating.inverse_transform(y_train.reshape(-1, 1))
y_test_original = scaler_rating.inverse_transform(y_test.reshape(-1, 1))
y_pred_train_original = scaler_rating.inverse_transform(y_pred_train)
y_pred_test_original = scaler_rating.inverse_transform(y_pred_test)

# Calculate RMSE
rmse_train = np.sqrt(mean_squared_error(y_train_original, y_pred_train_original))
rmse_test = np.sqrt(mean_squared_error(y_test_original, y_pred_test_original))

print(f"Train RMSE: {rmse_train}")
print(f"Test RMSE: {rmse_test}")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Train RMSE: 1.8566486289616995
Test RMSE: 1.926040433254923


In [166]:
# Select example MovieIDs for testing
valid_movie_ids_for_testing = list(movie_id_to_index_map.keys())

print("Valid MovieIDs for Testing:", valid_movie_ids_for_testing)

Valid MovieIDs for Testing: [2, 10, 16, 18, 34, 39, 44, 69, 70, 76, 95, 111, 112, 145, 153, 160, 172, 173, 181, 193, 208, 216, 223, 235, 253, 260, 288, 292, 316, 330, 356, 379, 380, 390, 434, 441, 442, 456, 464, 480, 485, 504, 519, 532, 541, 586, 587, 588, 589, 592, 595, 596, 600, 648, 653, 673, 714, 798, 810, 837, 849, 891, 899, 903, 904, 910, 918, 920, 923, 931, 951, 953, 968, 973, 1022, 1028, 1029, 1030, 1032, 1036, 1080, 1089, 1097, 1101, 1126, 1129, 1193, 1196, 1198, 1199, 1200, 1203, 1207, 1210, 1213, 1214, 1219, 1225, 1227, 1252, 1255, 1256, 1259, 1265, 1270, 1274, 1275, 1276, 1278, 1282, 1287, 1288, 1291, 1304, 1311, 1336, 1339, 1340, 1341, 1342, 1345, 1359, 1367, 1372, 1376, 1377, 1387, 1388, 1389, 1394, 1407, 1438, 1483, 1518, 1552, 1562, 1566, 1573, 1580, 1587, 1591, 1599, 1655, 1673, 1681, 1688, 1704, 1707, 1721, 1748, 1754, 1760, 1762, 1805, 1809, 1831, 1835, 1882, 1884, 1895, 1907, 1945, 1954, 1955, 1956, 1958, 1961, 1964, 1973, 1974, 1975, 1977, 1978, 1979, 1982, 1983, 1

In [167]:
# Recommendation Algorithm / Evaluation

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Function to get the index of a movie from its MovieID
def get_movie_index(movie_id, movie_features_merged):
    movie_row = movie_features_merged[movie_features_merged['MovieID'] == movie_id]
    if movie_row.empty:
        print(f"MovieID {movie_id} not found in the dataset")
        return None
    return movie_row.index[0]

def recommend_movies_based_on_two_liked(movie_id1, movie_id2, model, movie_features_merged, movie_features_scaled):
    index1 = get_movie_index(movie_id1, movie_features_merged)
    index2 = get_movie_index(movie_id2, movie_features_merged)
    
    if index1 is None or index2 is None:
        return pd.DataFrame()

    movie1_features = movie_features_scaled[index1].reshape(1, -1)
    movie2_features = movie_features_scaled[index2].reshape(1, -1)
    
    # Aggregate the features of the two liked movies
    combined_features = (movie1_features + movie2_features) / 2
    
    # Predict ratings for all movies
    all_predictions_scaled = model.predict([np.tile(combined_features, (movie_features_scaled.shape[0], 1)), movie_features_scaled])
    
    print("Raw predictions (first 5):", all_predictions_scaled[:5])
    print("Min prediction:", np.min(all_predictions_scaled))
    print("Max prediction:", np.max(all_predictions_scaled))
    
    # If using MinMaxScaler, adjust the inverse_transform
    if isinstance(scaler_rating, MinMaxScaler):
        all_predictions = scaler_rating.inverse_transform(all_predictions_scaled.reshape(-1, 1)).flatten()
    else:
        all_predictions = scaler_rating.inverse_transform(all_predictions_scaled)
    
    return all_predictions


# After getting the recommendations
if len(predicted_ratings) > 0:
    print("Predicted ratings range:")
    print("Min:", np.min(predicted_ratings))
    print("Max:", np.max(predicted_ratings))
    print("Mean:", np.mean(predicted_ratings))
    print("Median:", np.median(predicted_ratings))

    # Create a DataFrame for the predicted ratings
    predicted_ratings_df = pd.DataFrame({'MovieID': movie_features_merged['MovieID'], 'PredictedRating': predicted_ratings})

    # Exclude the movies that are already liked
    predicted_ratings_df = predicted_ratings_df[~predicted_ratings_df['MovieID'].isin([movie_id1, movie_id2])]

    # Get the top 10 recommended movies
    top_recommendations = predicted_ratings_df.sort_values(by='PredictedRating', ascending=False).head(10)

    # Merge with the movies DataFrame to get movie titles
    top_recommendations = pd.merge(top_recommendations, movies[['MovieID', 'Title']], on='MovieID')

    print("\nTop 10 Recommended Movies:")
    print(top_recommendations[['Title', 'PredictedRating']])

    # Print bottom 5 recommendations for comparison
    bottom_recommendations = predicted_ratings_df.sort_values(by='PredictedRating', ascending=True).head(5)
    bottom_recommendations = pd.merge(bottom_recommendations, movies[['MovieID', 'Title']], on='MovieID')
    print("\nBottom 5 Recommended Movies:")
    print(bottom_recommendations[['Title', 'PredictedRating']])

Predicted ratings range:
Min: 5.0
Max: 5.0
Mean: 5.0
Median: 5.0

Top 10 Recommended Movies:
               Title  PredictedRating
0      casino (1995)              5.0
1  four rooms (1995)              5.0
2        babe (1995)              5.0
3    clueless (1995)              5.0

Bottom 5 Recommended Movies:
Empty DataFrame
Columns: [Title, PredictedRating]
Index: []
