In [213]:
import pandas as pd
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict, Counter

In [214]:
#pip install surprise pandas

# **Data Collection and Preprocessing** #

## Import data ##

### MovieLens Dataset  ###

In [215]:
%%bash

if [ ! -d "../data/movielens_complete" ]; 
then    
    wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
    mkdir -p ../data/movielens_complete
    unzip -o ml-1m.zip -d ../data/movielens_complete;
    rm ml-1m.zip;
else
    echo "Data already downloaded";
fi

Data already downloaded


### IMDb Dataset ###

In [216]:
%%bash

if [ ! -d "../data/imbws_complete" ]; 
then    
    wget https://datasets.imdbws.com/name.basics.tsv.gz
    wget https://datasets.imdbws.com/title.akas.tsv.gz
    wget https://datasets.imdbws.com/title.basics.tsv.gz
    wget https://datasets.imdbws.com/title.crew.tsv.gz
    wget https://datasets.imdbws.com/title.episode.tsv.gz
    wget https://datasets.imdbws.com/title.principals.tsv.gz
    wget https://datasets.imdbws.com/title.ratings.tsv.gz
    mkdir -p ../data/imbws_complete
    unzip -o * -d ../data/imbws_complete;
    rm -rf *.gz;
else
    echo "Data already downloaded";
fi

Data already downloaded


## Loading data ##

### IMDb Dataset ###

In [217]:
title_basics = pd.read_csv(
    "../data/imbws_complete/title.basics.tsv",
    sep="\t",
    encoding="utf-8",
    on_bad_lines='skip'
)

title_crew = pd.read_csv(
    "../data/imbws_complete/title.crew.tsv", 
    sep="\t", 
    encoding="utf-8", 
    on_bad_lines='skip'
)

  title_basics = pd.read_csv(


### MovieLens ###

In [218]:
ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

ratings = pd.read_csv('../data/movielens_complete/ml-1m/ratings.dat',
    sep='::',
    names=ratings_columns, 
    engine='python'
)

movies_columns = ['MovieID', 'Title', 'Genres']

movies = pd.read_csv('../data/movielens_complete/ml-1m/movies.dat', 
    sep='::', 
    names=movies_columns,
    engine='python', 
    encoding='ISO-8859-1'
)

## Feature Engineering and Merging ##

In [219]:
title_basics = title_basics[['tconst', 'primaryTitle', 'startYear', 'genres']]
title_basics.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres
0,tt0000001,Carmencita,1894,"Documentary,Short"
1,tt0000002,Le clown et ses chiens,1892,"Animation,Short"
2,tt0000003,Pauvre Pierrot,1892,"Animation,Comedy,Romance"
3,tt0000004,Un bon bock,1892,"Animation,Short"
4,tt0000005,Blacksmith Scene,1893,"Comedy,Short"


In [220]:
title_crew = title_crew[['tconst', 'directors']]
title_crew.head()

Unnamed: 0,tconst,directors
0,tt0000001,nm0005690
1,tt0000002,nm0721526
2,tt0000003,nm0721526
3,tt0000004,nm0721526
4,tt0000005,nm0005690


In [221]:
# Split Title ... (year) to Title and Year Columns in order to merge Title
movies[['Title', 'Year']] = movies['Title'].str.extract(r'^(.*) \((\d{4})\)$')
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [222]:
title_data = pd.merge(title_basics, title_crew, on='tconst', how='left')
title_data.rename(columns={'primaryTitle': 'Title', 'startYear': 'Year', 'directors': 'Directors'}, inplace=True)
title_data.head()

Unnamed: 0,tconst,Title,Year,genres,Directors
0,tt0000001,Carmencita,1894,"Documentary,Short",nm0005690
1,tt0000002,Le clown et ses chiens,1892,"Animation,Short",nm0721526
2,tt0000003,Pauvre Pierrot,1892,"Animation,Comedy,Romance",nm0721526
3,tt0000004,Un bon bock,1892,"Animation,Short",nm0721526
4,tt0000005,Blacksmith Scene,1893,"Comedy,Short",nm0005690


In [223]:
merged_data = pd.merge(movies, title_data, on=['Title', 'Year'], how='inner')
merged_data = merged_data[['MovieID', 'Title', 'Year', 'Genres', 'Directors', 'genres']]
merged_data.head()

Unnamed: 0,MovieID,Title,Year,Genres,Directors,genres
0,1,Toy Story,1995,Animation|Children's|Comedy,nm0005124,"Adventure,Animation,Comedy"
1,2,Jumanji,1995,Adventure|Children's|Fantasy,nm0002653,"Adventure,Comedy,Family"
2,3,Grumpier Old Men,1995,Comedy|Romance,nm0222043,"Comedy,Romance"
3,4,Waiting to Exhale,1995,Comedy|Drama,nm0001845,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II,1995,Comedy,nm0796124,"Comedy,Family,Romance"


### Genres concatenation ###

In [224]:
def add_imbwGenres_to_Genres(row):
    imbw_genres = str(row['genres']).split(',') if pd.notna(row['genres']) else []
    genres = str(row['Genres']).split('|') if pd.notna(row['Genres']) else []
    
    # Add genres from IMDb Genres that are not already in MovieLens Genres
    for genre in imbw_genres:
        if genre.strip() not in genres:
            genres.append(genre.strip())
    
    # Combine genres by '|'
    return '|'.join(genres)

In [225]:

merged_data['Genres'] = merged_data.apply(add_imbwGenres_to_Genres, axis=1)
merged_data = merged_data.drop(['genres'], axis=1)
merged_data.head()

Unnamed: 0,MovieID,Title,Year,Genres,Directors
0,1,Toy Story,1995,Animation|Children's|Comedy|Adventure,nm0005124
1,2,Jumanji,1995,Adventure|Children's|Fantasy|Comedy|Family,nm0002653
2,3,Grumpier Old Men,1995,Comedy|Romance,nm0222043
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance,nm0001845
4,5,Father of the Bride Part II,1995,Comedy|Family|Romance,nm0796124


# **Model Development** #

## SVD model ##

In [226]:
model = SVD(n_factors=59, biased=True, verbose=True)

# **Recommendation Algorithm** #

## Build Users Preferences

In [227]:
# Preference on Genres and Directors preferences
# But We can add other features like actors, Years etc ...
def build_user_genre_and_director_preferences(ratings, movies):
    # Merging for Genres and Directors ratings
    ratings = pd.merge(ratings, movies[['MovieID', 'Genres', 'Directors']], on='MovieID', how='left')
    
    # Dictionaries to store preferences of genre and director
    user_genre_counts = defaultdict(Counter)
    user_director_counts = defaultdict(Counter)
    total_ratings = defaultdict(int)
    
    # Iterate through each rating record
    for _, row in ratings.iterrows():
        user_id = row['UserID']

        if isinstance(row['Genres'], str):
            genres = row['Genres'].split('|')  
        else:
            genres = []

        if isinstance(row['Directors'], str):
            directors = row['Directors'].split(',')
        else:
            directors = []

        rating = row['Rating']
        
        # Add genre and directors preferences if rating is >= 4
        if rating >= 4.0:
            user_genre_counts[user_id].update(genres)

        if rating >= 4.0:
            user_director_counts[user_id].update(directors)
        
        # Count total ratings per user
        total_ratings[user_id] += 1

    user_genre_preferences = {user_id: [genre for genre, _ in genres.items() if total_ratings[user_id] >= 20 and genre != '\\N'] for user_id, genres in user_genre_counts.items()}
    user_director_preferences = {user_id: [director for director, _ in directors.items() if total_ratings[user_id] >= 20 and director != '\\N'] for user_id, directors in user_director_counts.items()}
    
    top_genres_preferences = {}
    top_directors_preferences = {}
    
    # Calculate the top 5 genres and 5 directors for each user
    for user_id, genres in user_genre_counts.items():
        if total_ratings[user_id] >= 20:
            top_genres = genres.most_common(5)
            top_genres_preferences[user_id] = [genre for genre, _ in top_genres]

    for user_id, directors in user_director_counts.items():
        if total_ratings[user_id] >= 20:
            top_directors = directors.most_common(5)
            top_directors_preferences[user_id] = [director for director, _ in top_directors if director != '\\N']

    return user_genre_preferences, user_director_preferences, top_genres_preferences, top_directors_preferences

# Get couple preferences
def combine_user_preferences(user_preferences, user1_id, user2_id):
    user1_preferences = user_preferences[user1_id]
    user2_preferences = user_preferences[user2_id]
    combined_preferences = list(set(user1_preferences).intersection(user2_preferences))
    return combined_preferences

## Couple Recommendations

In [228]:
def get_couple_recommendations(model, user_genre_preferences, user_director_preferences, ratings, movies, user1_id, user2_id, n=3):
    # Combine user preferences
    combined_genres = combine_user_preferences(user_genre_preferences, user1_id, user2_id)
    combined_directors = combine_user_preferences(user_director_preferences, user1_id, user2_id)
    
    # Get movies already rated by users
    rated_movies_user1 = ratings[ratings['UserID'] == user1_id]['MovieID'].tolist()
    rated_movies_user2 = ratings[ratings['UserID'] == user2_id]['MovieID'].tolist()
    rated_movies = rated_movies_user1 + rated_movies_user2
    
    # Get predictions for all movies
    all_movies = list(movies['MovieID'])
    predictions = [model.predict(user1_id, movie_id, r_ui=0, clip=False) for movie_id in all_movies if movie_id not in rated_movies]
    
    # Sort predictions by estimated rating in descending order
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Select top recommendations that match combined genre and director preferences
    top_recommendations = []
    added_movie_ids = set()  # To track added movies and avoid duplicates
    
    for prediction in predictions:
        movie_id = prediction.iid
        movie_info = movies[movies['MovieID'] == movie_id].iloc[0]
        movie_genres = movie_info['Genres'].split('|') if isinstance(movie_info['Genres'], str) else []
        movie_directors = movie_info['Directors'].split(',') if isinstance(movie_info['Directors'], str) else []
        
        # Check if the movie matches the combined genre and director preferences
        if (any(genre in combined_genres for genre in movie_genres) and
            any(director in combined_directors for director in movie_directors) and
            movie_id not in added_movie_ids):
            
            top_recommendations.append((movie_id, prediction.est))
            added_movie_ids.add(movie_id)  # Track the added movie ID
        
        # Exit once we have enough recommendations
        if len(top_recommendations) >= n:
            break

    return top_recommendations

# **Evaluation** #

### Training ###

In [229]:
# Use Surprise to train SVD model
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['UserID', 'MovieID', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

model.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x5cdfc8fe0>

### Build preferences ###

In [230]:
user_genre_preferences, user_director_preferences, top_genres_preferences, top_directors_preferences = build_user_genre_and_director_preferences(ratings, merged_data)
print(top_genres_preferences)
print(top_directors_preferences)

{1: ["Children's", 'Animation', 'Drama', 'Comedy', 'Adventure'], 2: ['Drama', 'Action', 'Adventure', 'Romance', 'Thriller'], 3: ['Adventure', 'Action', 'Comedy', 'Drama', 'Sci-Fi'], 4: ['Action', 'Adventure', 'Sci-Fi', 'Drama', 'Thriller'], 5: ['Drama', 'Comedy', 'Romance', 'Crime', 'Thriller'], 6: ['Comedy', 'Drama', 'Romance', 'Musical', "Children's"], 7: ['Action', 'Adventure', 'Thriller', 'Sci-Fi', 'Drama'], 8: ['Drama', 'Action', 'Romance', 'Adventure', 'Thriller'], 9: ['Drama', 'Comedy', 'Action', 'Adventure', 'Sci-Fi'], 10: ['Drama', 'Comedy', 'Adventure', 'Action', 'Romance'], 11: ['Comedy', 'Drama', 'Adventure', 'Action', 'Crime'], 12: ['Drama', 'Adventure', 'Action', 'Crime', 'Family'], 13: ['Adventure', 'Action', 'Sci-Fi', 'Drama', 'Thriller'], 14: ['Drama', 'Comedy', 'Action', 'Adventure', 'Fantasy'], 15: ['Drama', 'Action', 'Thriller', 'Adventure', 'Crime'], 16: ['Comedy', 'Romance', 'Drama', 'Thriller', 'Action'], 17: ['Sci-Fi', 'Drama', 'Action', 'Adventure', 'Thriller']

### Choice of couple ###

In [238]:
import random

unique_user_ids = ratings['UserID'].unique()
user1_id = random.choice(unique_user_ids)
user2_id = random.choice(unique_user_ids)

def get_top_5_genres(user_genre_preferences, user_id):
    if user_id in user_genre_preferences:
        genres = user_genre_preferences[user_id]
        return genres
    else:
        return []
    
three_genres_user1 = get_top_5_genres(top_genres_preferences, user1_id)
three_genres_user2 = get_top_5_genres(top_genres_preferences, user2_id)

print(f"5 genres recommanded for User {user1_id}: {three_genres_user1}")
print(f"5 genres recommanded for User {user2_id}: {three_genres_user2}")

5 genres recommanded for User 3503: ['Drama', 'Comedy', 'Adventure', 'Action', 'Thriller']
5 genres recommanded for User 3669: ['Drama', 'Action', 'Comedy', 'Adventure', 'Crime']


### Movie Recommendations ###

In [239]:
predictions = model.test(testset)
recommendations = get_couple_recommendations(model, user_genre_preferences, user_director_preferences,ratings, merged_data, user1_id, user2_id)
for movie_id, predicted_rating in recommendations:
    movie_title = movies[movies['MovieID'] == movie_id]['Title'].iloc[0]
    movie_genres = movies[movies['MovieID'] == movie_id]['Genres'].iloc[0]
    print(f"Movie Title: {movie_title}, Predicted Rating: {predicted_rating}, MovieID: {movie_id}, Genres: {movie_genres}")

Movie Title: Paths of Glory, Predicted Rating: 4.778726014241611, MovieID: 1178, Genres: Drama|War
Movie Title: Diner, Predicted Rating: 4.757275702386452, MovieID: 3543, Genres: Comedy|Drama
Movie Title: Touch of Evil, Predicted Rating: 4.74180720629701, MovieID: 1248, Genres: Crime|Film-Noir|Thriller


It is normal to get "Predicted Rating" a little higher than 5, because it's rating for 5

In [233]:
accuracy.rmse(predictions)

RMSE: 0.8720


0.8720373567664572