# Model-Based Collaborative Filtering System

### This model will try to find the most correlated movie for each movie by generalization of user ratins


In [54]:
import pandas as pd
import numpy as np
import os
from sklearn.decomposition import TruncatedSVD

base_path = "datasets\\MovieLens-100k"


In [55]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
frame = pd.read_csv(os.path.join(base_path, 'u.data'), sep='\t', names=columns)
frame.head()
# The dataset contains the users, and the rating for a moview that reviewed


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [56]:
columns = ['item_id', 'movie_title', 'release_data', 'video_release_date', 'IMDB_url', 'unkown', 'action', 'adventure', 'animation', 'childrens', 'comdey',
           'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']
movies = pd.read_csv(os.path.join(base_path, 'u.item'),
                     sep='|', names=columns, encoding='latin-1')
movie_names = movies[['item_id', 'movie_title']]
movie_names.head()


Unnamed: 0,item_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [57]:
combined_movies_data = pd.merge(frame, movie_names, on='item_id')
combined_movies_data.head()


Unnamed: 0,user_id,item_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [58]:
# Sorting movies by ther rating count
combined_movies_data.groupby(
    'item_id')['rating'].count().sort_values(ascending=False).head()


item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [59]:
# Checking what is the name of the most popular movie
Filter = combined_movies_data['item_id'] == 50
combined_movies_data[Filter]['movie_title'].unique()


array(['Star Wars (1977)'], dtype=object)

In [60]:
# Building Utility Matrix
rating_crosstab = combined_movies_data.pivot_table(
    values='rating', index='user_id', columns='movie_title', fill_value=0)
# fill_value - how to deal with null values
rating_crosstab.head()
# we got a utility matrix


movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [61]:
rating_crosstab.shape


(943, 1664)

In [62]:
# Sklearn compresses the dataset along its COLUMNS so we need to transpose it.
# we have 1664 movie names and 943 users.
# we want to preserve the movie names as uncompressed, and want to compress the users. hence, we need to transpose the matrix
# After the compression, we will have a generalized view of the users' taste
x = rating_crosstab.values.T
# We chose to compress the users' data to 12 from 943. random_state means we will get the same results each time
svd = TruncatedSVD(n_components=12, random_state=17)
result_matrix = svd.fit_transform(x)
result_matrix.shape


(1664, 12)

In [63]:
# We calculate the correlation matrix to find out movies that are correlated with users' ratings
corr_mat = np.corrcoef(result_matrix)
corr_mat.shape


(1664, 1664)

In [64]:
# Isolate startwars from the matrix
movie_names = rating_crosstab.columns
movies_list = list(movie_names)
star_wars = movies_list.index('Star Wars (1977)')
star_wars


1398

In [65]:
# find the pearson r correlation for each movie with star wars
corr_star_wars = corr_mat[star_wars]
corr_star_wars.shape


(1664,)

In [66]:
# find all the movies with r value that is not one (because this is star wars) but higher than 0.9
movie_names[(corr_star_wars < 1.0) & (corr_star_wars > 0.9)]


Index(['Die Hard (1988)', 'Empire Strikes Back, The (1980)',
       'Fugitive, The (1993)', 'Raiders of the Lost Ark (1981)',
       'Return of the Jedi (1983)', 'Star Wars (1977)',
       'Terminator 2: Judgment Day (1991)', 'Terminator, The (1984)',
       'Toy Story (1995)'],
      dtype='object', name='movie_title')

In [67]:
list(movie_names[(corr_star_wars < 1.0) & (corr_star_wars > 0.95)])
# for some reason, star wars was still selected, but I get the idea


['Return of the Jedi (1983)', 'Star Wars (1977)']