## Import libraries

In [1]:
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
import os
from warnings import filterwarnings
import random
import joblib

In [2]:
# Suppress every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this is a blanket filter and also hides pandas/deprecation warnings —
# consider narrowing it to specific categories.
filterwarnings('ignore')

## Import data

In [3]:
# List the files in the dataset directory (path is relative to the current working directory).
os.listdir('movie_data_collaborative')

['movies.csv', 'ratings.csv']

In [4]:
# Read the movie metadata and the user ratings, in that order, into two DataFrames.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        './movie_data_collaborative/movies.csv',
        './movie_data_collaborative/ratings.csv',
    )
)

In [5]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# (rows, columns) of the movies table.
movies.shape

(10329, 3)

In [7]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [8]:
# (rows, columns) of the ratings table.
ratings.shape

(105339, 4)

## Data preprocessing

In [9]:
# Missing-value count for each column of the movies table.
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
# Missing-value count for each column of the ratings table.
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [11]:
# Number of fully duplicated rows in the movies table.
movies.duplicated().sum()

0

In [12]:
# Number of fully duplicated rows in the ratings table.
ratings.duplicated().sum()

0

## Merging two dataframes 

In [13]:
# Inner join on movieId: only movies that have been rated by at least one user are kept.
data = pd.merge(movies, ratings, how='inner', on='movieId')
data.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286


In [14]:
# Remove the columns the recommender does not use. Rebinding instead of inplace=True,
# together with errors='ignore', lets this cell be re-run without a KeyError once the
# columns are already gone.
data = data.drop(columns=['genres', 'timestamp'], errors='ignore')

In [15]:
# Number of fully duplicated rows left in the merged table.
data.duplicated().sum()

0

In [16]:
# Boolean Series indexed by userId: True for users who have rated at least 50 movies.
popular_users_userid = data.groupby('userId').size().ge(50)
# Keep just the userIds whose flag is True.
popular_users = popular_users_userid.index[popular_users_userid.to_numpy()]
len(popular_users)   # Number of users that pass the threshold

426

In [17]:
# Keep only the ratings made by the popular users. The explicit .copy() gives `data`
# its own memory, so later modifications do not operate on a slice of the merged frame
# (which would raise SettingWithCopyWarning on pandas versions without copy-on-write).
data = data[data['userId'].isin(popular_users)].copy()

In [18]:
# Preview the filtered ratings table.
data.head()

Unnamed: 0,movieId,title,userId,rating
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),8,5.0
3,1,Toy Story (1995),11,4.0
5,1,Toy Story (1995),17,5.0
6,1,Toy Story (1995),28,3.0


In [19]:
# Index the movie table by movieId so titles can be looked up with movies.loc[ids, 'title'].
# The guard keeps the cell re-runnable: set_index raises KeyError once movieId has
# already been moved from the columns into the index.
if movies.index.name != 'movieId':
    movies = movies.set_index('movieId')

- Make `movieId` the index and drop the `title` column

In [20]:
# Make movieId the row index and drop the title column (titles are looked up from `movies`).
# A chained, non-inplace form builds a new frame instead of mutating the filtered one.
data = data.set_index("movieId").drop(columns=['title'])

- Changing the representation of the data by pivoting
- `Movie X User` matrix

In [21]:
# Reshape to a Movie x User matrix: rows come from the existing movieId index, one column
# per userId, cells hold the rating (NaN where the user has not rated the movie).
movie_user = pd.pivot(data, columns='userId', values='rating')
movie_user.head()

userId,1,3,4,5,6,7,8,9,11,15,...,656,657,659,661,662,664,665,666,667,668
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,4.0,,,5.0,,4.0,,...,,,,4.0,5.0,,,,,3.0
2,,,,,,,,,,,...,3.0,,,,5.0,,,,,3.0
3,,,,,,,4.0,3.0,,,...,,,,3.0,,,,,,2.0
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,3.0,,,,...,3.0,,,3.0,,,,,,2.5


In [22]:
# Persist the Movie x User matrix next to the notebook so it can be reloaded later.
joblib.dump(value=movie_user, filename="./movie_user_matrix.pkl")

['./movie_user_matrix.pkl']

## **`Recommender function`**

In [23]:
def movie_recommend_user(user_id, min_common=30, min_similarity=0.75, min_rating=4, top_n=5):
    """Recommend unseen movies to a user from the tastes of similar users.

    Relies on the notebook-level ``movie_user`` (Movie x User rating matrix) and
    ``movies`` (indexed by movieId, with a ``title`` column).

    Parameters
    ----------
    user_id : int
        Column *position* of the target user in ``movie_user`` (used with ``iloc``),
        not the userId label.
    min_common : int, default 30
        Another user is only compared when both users rated strictly more than this
        many of the same movies.
    min_similarity : float, default 0.75
        Another user counts as similar when the cosine similarity of the commonly
        rated movies is strictly greater than this value.
    min_rating : float, default 4
        Only ratings ``>= min_rating`` given by similar users are considered.
    top_n : int, default 5
        Maximum number of recommendations returned.

    Returns
    -------
    tuple
        ``(user_id, {movieId: title})`` with the dict ordered from the highest to the
        lowest mean rating among similar users.

    Raises
    ------
    IndexError
        If ``user_id`` is not a valid column position of ``movie_user``.
    """
    n_users = movie_user.shape[1]
    if not -n_users <= user_id < n_users:
        raise IndexError(
            f"user_id {user_id} is out of range for a matrix with {n_users} users"
        )
    # iloc accepts negative positions; normalise so the target can be recognised below.
    target_pos = user_id % n_users

    similar_users = []
    for other in range(n_users):
        if other == target_pos:
            # Comparing the user with themself always matches but can never add a
            # recommendation (their unseen movies are unrated by definition).
            continue
        common = movie_user.iloc[:, [user_id, other]].dropna(how='any')  # movies rated by both
        if len(common) > min_common:
            # Rows of the transposed block are the two users' rating vectors.
            similarity = cosine_similarity(common.T.to_numpy())[0][1]
            if similarity > min_similarity:
                similar_users.append(other)

    # Candidate movies: those the target user has not rated yet.
    unseen = movie_user.loc[movie_user.iloc[:, user_id].isna()]

    # Accumulate sum and count per movie so the score is the true mean over all similar
    # users; averaging pairwise as each user arrives would depend on user order.
    rating_sum = {}
    rating_count = {}
    for other in similar_users:
        other_ratings = unseen.iloc[:, other]
        liked = other_ratings[other_ratings >= min_rating]
        for movie_id, rating in liked.items():
            rating_sum[movie_id] = rating_sum.get(movie_id, 0.0) + rating
            rating_count[movie_id] = rating_count.get(movie_id, 0) + 1

    mean_rating = {
        movie_id: rating_sum[movie_id] / rating_count[movie_id] for movie_id in rating_sum
    }
    # Stable sort: movies with equal means keep the order in which they were first seen.
    best_movies = sorted(mean_rating, key=mean_rating.get, reverse=True)[:top_n]
    if not best_movies:
        return user_id, {}

    best_titles = movies.loc[best_movies, 'title'].to_numpy()
    return user_id, dict(zip(best_movies, best_titles))

    

### Test for a random user

In [34]:
# Draw a random column position. randrange excludes its upper bound, so the draw is always
# a valid position in movie_user; random.randint(0, 426) could return 426, one past the
# last of the 426 user columns, and raise IndexError.
user_id, top_movies = movie_recommend_user(random.randrange(movie_user.shape[1]))

styler = ""

# user_id is the column position in movie_user, not the userId label.
print(f"For User {user_id}, recommended movies are:")
print(f"{styler:.<50}")
for c, movie in enumerate(top_movies.values(), start=1):
    print(f"Movie{c}: {movie}")
    print(f"{styler:.<50}")

For User 345, recommended movies are:
..................................................
Movie1: Plague Dogs, The (1982)
..................................................
Movie2: United 93 (2006)
..................................................
Movie3: Quiet Man, The (1952)
..................................................
Movie4: Paper, The (1994)
..................................................
Movie5: Endless Summer 2, The (1994)
..................................................
