# Imports

In [63]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Data Import

In [64]:
# Define file paths
movielens_ratings_path = "/kaggle/input/dataset/ml-1m/ml-1m/ratings.dat"
movielens_movies_path = "/kaggle/input/dataset/ml-1m/ml-1m/movies.dat"
movielens_users_path = "/kaggle/input/dataset/ml-1m/ml-1m/users.dat"
imdb_title_path = "/kaggle/input/dataset/title.basics.tsv"

# Load datasets
ratings_copy = pd.read_csv(movielens_ratings_path, delimiter="::", engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')
movies_copy = pd.read_csv(movielens_movies_path, delimiter="::", engine='python', names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')
users_copy = pd.read_csv(movielens_users_path, delimiter="::", engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1')
imdb_title_copy = pd.read_csv(imdb_title_path, delimiter='\t', low_memory=False)

# Data Preprocessing

In [65]:
imdb_title_copy = imdb_title_copy[imdb_title_copy['titleType'] == 'movie']

ratings_copy.drop(columns=['Timestamp'], inplace=True)
users_copy.drop(columns=['Zip-code'], inplace=True)
imdb_title_copy.drop(columns=['primaryTitle', 'endYear', 'genres'], inplace=True)

In [66]:
# Step 1: Data Cleaning
# Convert relevant columns to appropriate types and handle missing values
imdb_title_copy.replace('\\N', pd.NA, inplace=True)

# Create 'Year' and Clean 'Title' columns in movies_copy
movies_copy['Year'] = movies_copy['Title'].str[-5:-1]
movies_copy['Title'] = movies_copy['Title'].str[:-7]

# Convert year columns to numeric
imdb_title_copy['startYear'] = pd.to_numeric(imdb_title_copy['startYear'], errors='coerce')
movies_copy['Year'] = pd.to_numeric(movies_copy['Year'], errors='coerce')

In [67]:
ratings_copy['MovieID'] = ratings_copy['MovieID'].astype(str)
movies_copy['MovieID'] = movies_copy['MovieID'].astype(str)
imdb_title_copy['tconst'] = imdb_title_copy['tconst'].astype(str)
imdb_title_copy['originalTitle'] = imdb_title_copy['originalTitle'].astype(str)

In [68]:
# Step 2: Merge Datasets
# Merge ratings with movies to get movie details with ratings
ratings_movies = pd.merge(ratings_copy, movies_copy, on='MovieID', how='left')
del ratings_copy
del movies_copy

print(ratings_movies.describe)

<bound method NDFrame.describe of          UserID MovieID  Rating                            Title  \
0             1    1193       5  One Flew Over the Cuckoo's Nest   
1             1     661       3        James and the Giant Peach   
2             1     914       3                     My Fair Lady   
3             1    3408       4                  Erin Brockovich   
4             1    2355       5                    Bug's Life, A   
...         ...     ...     ...                              ...   
1000204    6040    1091       1              Weekend at Bernie's   
1000205    6040    1094       5                 Crying Game, The   
1000206    6040     562       5         Welcome to the Dollhouse   
1000207    6040    1096       4                  Sophie's Choice   
1000208    6040    1097       4       E.T. the Extra-Terrestrial   

                                  Genres  Year  
0                                  Drama  1975  
1           Animation|Children's|Musical  1996  
2 

In [69]:
# Convert to lowercase and strip whitespace for better matching
ratings_movies['Title'] = ratings_movies['Title'].str.lower().str.strip()
imdb_title_copy['originalTitle'] = imdb_title_copy['originalTitle'].str.lower().str.strip()

# Remove any unwanted characters and normalize titles (optional, but can improve matching)
def normalize_title(title):
    title = re.sub(r'\([^)]*\)', '', title)  # Remove anything in parentheses
    title = re.sub(r'[^a-z0-9\s]', '', title)  # Remove any special characters
    title = re.sub(r'\s+', ' ', title)  # Replace multiple spaces with a single space
    return title.strip()

ratings_movies['normalizedTitle'] = ratings_movies['Title'].apply(normalize_title)
imdb_title_copy['normalizedTitle'] = imdb_title_copy['originalTitle'].apply(normalize_title)

# Convert 'Year' and 'startYear' to strings to ensure matching
ratings_movies['Year'] = ratings_movies['Year'].astype(str)
imdb_title_copy['startYear'] = imdb_title_copy['startYear'].fillna(0).astype(int).astype(str)

# Merge the datasets on the normalized title and year
ratings_movies_imdb = pd.merge(
    ratings_movies, 
    imdb_title_copy, 
    left_on=['normalizedTitle', 'Year'], 
    right_on=['normalizedTitle', 'startYear'], 
    how='inner'
)
del imdb_title_copy
del ratings_movies

# Drop unnecessary columns if no longer needed
ratings_movies_imdb.drop(columns=['normalizedTitle', 'startYear'], inplace=True)

# Display the merged dataset
print(ratings_movies_imdb.describe)

<bound method NDFrame.describe of         UserID MovieID  Rating                            Title  \
0            1    1193       5  one flew over the cuckoo's nest   
1            1     661       3        james and the giant peach   
2            1     914       3                     my fair lady   
3            1    3408       4                  erin brockovich   
4            1    1287       5                          ben-hur   
...        ...     ...     ...                              ...   
705971    6040    1090       3                          platoon   
705972    6040    1091       1              weekend at bernie's   
705973    6040     562       5         welcome to the dollhouse   
705974    6040    1096       4                  sophie's choice   
705975    6040    1097       4       e.t. the extra-terrestrial   

                                 Genres  Year     tconst titleType  \
0                                 Drama  1975  tt0073486     movie   
1          Animation|

In [70]:
# Merge with users to get user details with ratings and movies
ratings_movies_users = pd.merge(ratings_movies_imdb, users_copy, on='UserID', how='left')
del users_copy
del ratings_movies_imdb

ratings_movies_users.drop(columns=['MovieID', 'originalTitle', 'tconst'], inplace=True)

In [71]:
# Display the merged dataset
print(ratings_movies_users.head())

   UserID  Rating                            Title  \
0       1       5  one flew over the cuckoo's nest   
1       1       3        james and the giant peach   
2       1       3                     my fair lady   
3       1       4                  erin brockovich   
4       1       5                          ben-hur   

                         Genres  Year titleType isAdult runtimeMinutes Gender  \
0                         Drama  1975     movie       0            133      F   
1  Animation|Children's|Musical  1996     movie       0             79      F   
2               Musical|Romance  1964     movie       0            170      F   
3                         Drama  2000     movie       0            131      F   
4        Action|Adventure|Drama  1959     movie       0            212      F   

   Age  Occupation  
0    1          10  
1    1          10  
2    1          10  
3    1          10  
4    1          10  


In [72]:
all_data = ratings_movies_users[['Rating', 'Gender', 'Age', 'Occupation', 'Genres', 'Year', 'runtimeMinutes', 'isAdult']]

# Display the filtered DataFrame
print(all_data)

        Rating Gender  Age  Occupation                           Genres  Year  \
0            5      F    1          10                            Drama  1975   
1            3      F    1          10     Animation|Children's|Musical  1996   
2            3      F    1          10                  Musical|Romance  1964   
3            4      F    1          10                            Drama  2000   
4            5      F    1          10           Action|Adventure|Drama  1959   
...        ...    ...  ...         ...                              ...   ...   
705971       3      M   25           6                        Drama|War  1986   
705972       1      M   25           6                           Comedy  1989   
705973       5      M   25           6                     Comedy|Drama  1995   
705974       4      M   25           6                            Drama  1982   
705975       4      M   25           6  Children's|Drama|Fantasy|Sci-Fi  1982   

       runtimeMinutes isAdu

# User-Item interaction matrix

In [73]:
# Group by UserID and Title to handle duplicates by averaging the ratings
ratings_aggregated = ratings_movies_users.groupby(['UserID', 'Title'], as_index=False)['Rating'].mean()

# Pivot the DataFrame to create the user-item interaction matrix
interaction_matrix = ratings_aggregated.pivot(index='UserID', columns='Title', values='Rating').fillna(0)

print(interaction_matrix)

Title   'night mother  'til there was you  ...and justice for all  \
UserID                                                              
1                 0.0                 0.0                     0.0   
2                 0.0                 0.0                     0.0   
3                 0.0                 0.0                     0.0   
4                 0.0                 0.0                     0.0   
5                 0.0                 0.0                     0.0   
...               ...                 ...                     ...   
6036              3.0                 0.0                     0.0   
6037              0.0                 0.0                     0.0   
6038              0.0                 0.0                     0.0   
6039              0.0                 0.0                     0.0   
6040              0.0                 0.0                     0.0   

Title   10 things i hate about you  101 dalmatians  12 angry men  \
UserID                            

# Create Prediction Model

In [74]:
def preprocess_data(all_data):
    # Create label encoders
    gender_encoder = LabelEncoder()
    genres_encoder = LabelEncoder()

    # Fill missing values (example: fill with median for Age and Occupation)
    all_data['Age'] = pd.to_numeric(all_data['Age'], errors='coerce')
    all_data['Occupation'] = pd.to_numeric(all_data['Occupation'], errors='coerce')
    all_data['Year'] = pd.to_numeric(all_data['Year'], errors='coerce')
    all_data['runtimeMinutes'] = pd.to_numeric(all_data['runtimeMinutes'], errors='coerce')
    all_data['isAdult'] = pd.to_numeric(all_data['isAdult'], errors='coerce')
    
    # Fill NaN values with 0 in numeric columns
    numeric_columns = ['Age', 'Occupation', 'Year', 'runtimeMinutes', 'isAdult']
    all_data[numeric_columns] = all_data[numeric_columns].fillna(0)

    # Ensure that 'Genres' are strings before encoding
    all_data['Genres'] = all_data['Genres'].astype(str)

    # Label encode 'Gender'
    all_data['Gender'] = gender_encoder.fit_transform(all_data['Gender'])

    # Label encode 'Genres'
    all_data['Genres'] = genres_encoder.fit_transform(all_data['Genres'])

    # Standardize 'Age', 'Occupation', 'Year', and 'runtimeMinutes'
    scaler = StandardScaler()
    all_data[['Age', 'Occupation', 'Year', 'runtimeMinutes', 'isAdult']] = scaler.fit_transform(all_data[['Age', 'Occupation', 'Year', 'runtimeMinutes', 'isAdult']])

    # Combine all features
    X = np.concatenate([all_data[['Gender', 'Genres', 'Age', 'Occupation', 'Year', 'runtimeMinutes', 'isAdult']].values], axis=1)
    
    return X

In [75]:
X = preprocess_data(ratings_movies_users)
y = all_data['Rating'].values

In [76]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict ratings
predictions = model.predict(X_test)

# Recommendation

In [78]:
def get_user_info(user_id, ratings_movies_users):
    user_info = ratings_movies_users[ratings_movies_users['UserID'] == user_id]
    return user_info[['Gender', 'Age', 'Occupation']].drop_duplicates()

def get_movie_info(title, ratings_movies_users):
    movie_info = ratings_movies_users[ratings_movies_users['Title'] == title]
    return movie_info[['Genres', 'Year', 'runtimeMinutes', 'isAdult']].drop_duplicates()

In [79]:
# Main process to get and preprocess movie information
def get_unrated_movie_features(user_info, interaction_matrix, ratings_movies_users, user_id):
    # Get the list of unrated movies for the user
    unrated_movies = interaction_matrix.loc[user_id][interaction_matrix.loc[user_id] == 0].index.tolist()
    
    # Initialize an empty list to store features
    movie_features_list = []

    for movie in tqdm(unrated_movies, desc="Processing unrated movies"):  # Add tqdm here
        # Get movie information
        movie_info = get_movie_info(movie, ratings_movies_users)
        
        # Add user demographic information (for preprocessing purposes)
        all_data = pd.concat([user_info, movie_info], axis=1)
        
        # Preprocess the data
        movie_features = preprocess_data(all_data)
        
        # Append the features to the list
        movie_features_list.append(movie_features)
    
    # Combine all features into a single numpy array
    X = np.concatenate(movie_features_list, axis=0)
    
    return X, unrated_movies

In [80]:
def get_top_recommendations(user1_id, user2_id, model, interaction_matrix, ratings_movies_users):
    # Get user information
    user1_info = get_user_info(user1_id, ratings_movies_users)
    user2_info = get_user_info(user2_id, ratings_movies_users)

    # Get combination of user information and movie features
    user1_combination, user1_unrated_movies = get_unrated_movie_features(user1_info, interaction_matrix, ratings_movies_users, user1_id)
    user2_combination, user2_unrated_movies = get_unrated_movie_features(user2_info, interaction_matrix, ratings_movies_users, user2_id)

    # Get predictions
    user1_predictions = model.predict(user1_combination)
    user2_predictions = model.predict(user2_combination)

    # Find common movies
    common_movies = set(user1_unrated_movies).intersection(set(user2_unrated_movies))

    # Initialize list to store averaged predictions
    averaged_predictions = []

    # Calculate average predictions for common movies
    for movie in common_movies:
        idx1 = user1_unrated_movies.index(movie)
        idx2 = user2_unrated_movies.index(movie)

        avg_prediction = (user1_predictions[idx1] + user2_predictions[idx2]) / 2
        averaged_predictions.append((movie, avg_prediction))

    # Convert to DataFrame for better readability
    averaged_predictions_df = pd.DataFrame(averaged_predictions, columns=['Movie', 'Average_Prediction'])

    # Sort the averaged predictions and get the top
    top_prediction = averaged_predictions_df.sort_values(by='Average_Prediction', ascending=False).head(1)

    return top_prediction

In [81]:
# Usage example
user1_id = 1
user2_id = 2
top_5_recommendations = get_top_recommendations(user1_id, user2_id, model, interaction_matrix, ratings_movies_users)
print(top_5_recommendations)

Processing unrated movies: 100%|██████████| 2282/2282 [05:41<00:00,  6.68it/s]
Processing unrated movies: 100%|██████████| 2231/2231 [05:37<00:00,  6.62it/s]


                         Movie  Average_Prediction
805                 bloodsport            3.691474
917                 flatliners            3.632665
2150  someone to watch over me            3.515424
642                    déjà vu            3.491369
306             one false move            3.491369
