One of the most popular methods for making recommendations is collaborative filtering. In collaborative filtering, you use the collective record of user-item interactions (for example, the ratings users have given to items) to assist in making new recommendations.

There are two main methods of performing collaborative filtering:

1. **Neighborhood-Based Collaborative Filtering**, which is based on the idea that we can either correlate items that are similar to provide recommendations or we can correlate users to one another to provide recommendations.


2. **Model-Based Collaborative Filtering**, which is based on the idea that we can use machine learning and other mathematical models to understand the relationships that exist amongst items and users to predict ratings and provide recommendations.

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import progressbar

%matplotlib inline

# Load the cleaned movie and review tables from disk
movies = pd.read_csv('Data/movies_clean.csv')
reviews = pd.read_csv('Data/reviews_clean.csv')

# Both CSVs carry an 'Unnamed: 0' column that is not needed; drop it in place
movies.drop(columns='Unnamed: 0', inplace=True)
reviews.drop(columns='Unnamed: 0', inplace=True)

# Quick look at the first few reviews
print(reviews.head())

   user_id  movie_id  rating   timestamp                 date
0        1    114508       8  1381006850  2013-10-06 02:30:50
1        2    499549       9  1376753198  2013-08-17 20:56:38
2        2   1305591       8  1376742507  2013-08-17 17:58:27
3        2   1428538       1  1371307089  2013-06-15 20:08:09
4        3     75314       1  1595468524  2020-07-23 07:12:04


In order to calculate the similarities, it is common to put values in a matrix. In this matrix, users are identified by each row, and items are represented by columns.

In [3]:
# Keep only the three columns needed for the user-by-movie rating matrix
user_items = reviews.loc[:, ['user_id', 'movie_id', 'rating']]
user_items.head()

Unnamed: 0,user_id,movie_id,rating
0,1,114508,8
1,2,499549,9
2,2,1305591,8
3,2,1428538,1
4,3,75314,1


In [4]:
user_items.shape

(902957, 3)

In [5]:
# Build the user-by-movie rating matrix: one row per user_id, one column per
# movie_id, NaN where the user has not rated the movie. If a (user, movie) pair
# appears more than once, the highest rating is kept.
# Only the first 700000 review rows are used -- presumably to keep the unstacked
# frame small enough to build (see the commented-out overflow experiments in the
# cells below); TODO confirm.
user_by_movie = (
    user_items.iloc[:700000]
    .groupby(['user_id', 'movie_id'])['rating']
    .max()
    .unstack()
)

In [6]:
user_by_movie

movie_id,8,25,91,417,439,443,628,833,1223,1740,...,14152756,14156926,14164234,14219522,14220316,14237412,14318270,14318430,14372240,14404280
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54735,,,,,,,,,,,...,,,,,,,,,,
54736,,,,,,,,,,,...,,,,,,,,,,
54737,,,,,,,,,,,...,,,,,,,,,,
54738,,,,,,,,,,,...,,,,,,,,,,


In [None]:
 # user_items.pivot_table(index=['user_id'],columns=['movie_id'],values='rating', aggfunc='max')

In [None]:
# def test_overflow(num_rows, num_columns):
#     num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)
#     if num_rows > 0 and num_columns > 0 and num_cells <= 0:
#         print("Unstacked DataFrame is too big, causing int32 overflow (np.multiply)")
#     if num_rows > 0 and num_columns > 0 and num_rows * num_columns > 2**31 - 1:
#         print("Unstacked DataFrame is too big, causing int32 overflow (python)")

In [None]:
# test_overflow(4000000, 10000)

In [None]:
# from tqdm import tqdm

# chunk_size = 50000
# chunks = [x for x in range(0, user_items.shape[0], chunk_size)]
# chunks

In [None]:
# user_by_movie = pd.DataFrame()

# for i in tqdm(range(0, len(chunks)-1)):
#    chunk_df = user_items.iloc[chunks[i]:chunks[i+1]-1]
#    interactions = (chunk_df.groupby(['user_id', 'movie_id'])['rating'].max().unstack())
#    print(interactions.shape)
#    user_by_movie = user_by_movie.append(interactions, sort=False)

In [14]:
# Helper used to build a dictionary of users and the corresponding movies they have seen

def movies_watched(user_id):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    OUTPUT:
    movies - an array of the movie_ids (column labels of user_by_movie)
             for which this user has a non-null rating
    '''
    # Pull the user's row once, then keep only the entries that hold a rating
    user_ratings = user_by_movie.loc[user_id]
    has_rating = user_ratings.notnull()

    return user_ratings[has_rating].index.values

In [16]:
from tqdm import tqdm

def create_user_movie_dict():
    '''
    INPUT: None
    OUTPUT: movies_seen - a dictionary where each key is a user_id and the value is an array of movie_ids

    Creates the movies_seen dictionary with one entry per row of user_by_movie.
    '''
    movies_seen = dict()

    # Walk the user_ids that are actually present in the matrix index rather
    # than assuming they run 1..n_users with no gaps; a missing id would
    # otherwise raise a KeyError inside movies_watched.
    for user in tqdm(user_by_movie.index):
        movies_seen[user] = movies_watched(user)

    return movies_seen

movies_seen = create_user_movie_dict()

100%|██████████████████████████████████████████████████████████████████████████| 54739/54739 [00:42<00:00, 1294.39it/s]


In [22]:
def create_movies_to_analyze(movies_seen, lower_bound=2):
    '''
    INPUT:
    movies_seen - a dictionary where each key is a user_id and the value is an array of movie_ids
    lower_bound - (an int) a user must have more movies seen than the lower bound to be added to the movies_to_analyze dictionary

    OUTPUT:
    movies_to_analyze - a dictionary where each key is a user_id and the value is an array of movie_ids

    The movies_seen and movies_to_analyze dictionaries are the same except that the output dictionary
    has removed every user whose number of movies seen is less than or equal to lower_bound.
    '''
    # Strictly greater than: a user with exactly lower_bound movies is dropped
    return {
        user_id: watched
        for user_id, watched in movies_seen.items()
        if len(watched) > lower_bound
    }
    
# Keep only the users who have seen more than the default lower_bound (2) movies
movies_to_analyze = create_movies_to_analyze(movies_seen)

In [36]:
def compute_correlation(user1, user2):
    '''
    INPUT
    user1 - int user_id (must be a key of movies_to_analyze)
    user2 - int user_id (must be a key of movies_to_analyze)
    OUTPUT
    the correlation between the matching ratings between the two users;
    pandas yields NaN when the users share too few rated movies for a
    correlation to be defined
    '''
    # Movies rated by both users. assume_unique=True relies on each user's
    # movie array holding no duplicate movie_ids.
    shared_movies = np.intersect1d(
        movies_to_analyze[user1],
        movies_to_analyze[user2],
        assume_unique=True,
    )

    # Select both users' rows with a *list* of labels: a tuple row key is what
    # .loc uses for a single MultiIndex key, so a list is the unambiguous form.
    pair_ratings = user_by_movie.loc[[user1, user2], shared_movies]

    # DataFrame.corr correlates columns, so transpose to put one user per
    # column; entry [0, 1] is then the user1-vs-user2 correlation.
    return pair_ratings.T.corr().iloc[0, 1]

In [41]:
# Which movies did both user 2 and user 66 see?
# NOTE(review): despite its name, `set_104` holds the movies of user 66 (the
# key looked up below) -- confirm which user was intended.
set_2 = set(movies_to_analyze[2])
set_104 = set(movies_to_analyze[66])
set_2.intersection(set_104)

set()