# Collaborative Recommender

With collaborative filtering we recommend movies, based on one movie choice, by finding similarities between the ratings from other users who have rated that movie.

In [22]:
import modules.recommmender.CollaborativeRecommender as cr
from sklearn.neighbors import NearestNeighbors

#### Getting data
We are going to use data about movies and ratings. We load our datasets from our "data" folder.

In [23]:
# Load the movie table through the project helper and preview the first rows
# (the preview shows the columns `id` and `title`).
movie_df = cr.load_movies()
movie_df.head()

Unnamed: 0,id,title
0,862.0,Toy Story
1,8844.0,Jumanji
2,15602.0,Grumpier Old Men
3,31357.0,Waiting to Exhale
4,11862.0,Father of the Bride Part II


In [24]:
# (rows, columns) of the movie table as loaded, before any cleaning.
movie_df.shape

(45466, 2)

In [25]:
# TODO: add exploratory plots here, e.g. a bar plot of the rating distribution.
# Load the user ratings through the project helper and preview the first rows
# (the preview shows the columns `userId`, `movieId` and `rating`).
rating_df = cr.load_rating()
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [26]:
# (rows, columns) of the ratings table as loaded.
rating_df.shape

(100004, 3)

#### Preparing
We will now clean the two DataFrames and combine them into one.

In [27]:
# Drop rows with NaN values and rename the `id` column to `movieId` so that it
# matches the movie key column in rating_df.
# NOTE(review): this rebinds `movie_df` to its own prepared version, so re-running
# the cell feeds already-prepared data back into cr.prepare_movie_data — confirm
# the helper tolerates that, or re-run from the loading cell.
movie_df = cr.prepare_movie_data(movie_df)
movie_df.head()

Unnamed: 0,movieId,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [28]:
# (rows, columns) after preparation; compare with the raw shape to see how many
# rows the cleaning step removed.
movie_df.shape

(45460, 2)

In [29]:
# Combine rating_df and movie_df into one DataFrame so that every rating row also
# carries the movie title (presumably joined on `movieId` — confirm in
# cr.merge_movie_and_rating).
df = cr.merge_movie_and_rating(rating_df, movie_df)
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1371,2.5,Rocky III
1,4,1371,4.0,Rocky III
2,7,1371,3.0,Rocky III
3,19,1371,4.0,Rocky III
4,21,1371,3.0,Rocky III


In [30]:
# (rows, columns) of the merged rating/movie frame.
df.shape

(44994, 4)

In [31]:
# Rating count per movie title; the preview shows one row per title with a
# `totalRatingCount` column.
movie_rating_count = cr.count_ratings(df)
movie_rating_count.head()

Unnamed: 0,title,totalRatingCount
0,!Women Art Revolution,2
1,'Gator Bait,1
2,'Twas the Night Before Christmas,2
3,...And God Created Woman,1
4,00 Schneider - Jagd auf Nihil Baxter,2


In [32]:
# (rows, columns) of the per-title rating counts.
movie_rating_count.shape

(2794, 2)

In [33]:
# Attach each title's total rating count to every individual rating row.
# A left join keeps all rows of `df`; both frames share the key column `title`.
rating_with_total_rating_count = df.merge(
    movie_rating_count,
    on='title',
    how='left',
)
rating_with_total_rating_count

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1371,2.5,Rocky III,47
1,4,1371,4.0,Rocky III,47
2,7,1371,3.0,Rocky III,47
3,19,1371,4.0,Rocky III,47
4,21,1371,3.0,Rocky III,47
...,...,...,...,...,...
44989,652,129009,4.0,Love Is a Ball,1
44990,653,2103,3.0,Solaris,305
44991,659,167,4.0,K-PAX,1
44992,659,563,3.0,Starship Troopers,1


In [34]:
# The left merge keeps every row of `df`, so the row count should equal df's
# (provided each title occurs only once in movie_rating_count).
rating_with_total_rating_count.shape

(44994, 5)

In [35]:
# Keep only the ratings of "popular" movies. The selection rule lives in
# cr.get_popular_movies (presumably a threshold on `totalRatingCount` — confirm
# in the module).
rating_popular_movie = cr.get_popular_movies(rating_with_total_rating_count)
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1371,2.5,Rocky III,47
1,4,1371,4.0,Rocky III,47
2,7,1371,3.0,Rocky III,47
3,19,1371,4.0,Rocky III,47
4,21,1371,3.0,Rocky III,47


In [36]:
# (rows, columns) remaining after the popularity filter.
rating_popular_movie.shape

(34932, 5)

In [37]:
# TODO: add exploratory plots here, e.g. a bar plot of the rating distribution.
# NOTE(review): this rebinds `df` (until now the merged rating/movie frame) to
# whatever cr.prepare_data() returns — presumably the same load/clean/merge/filter
# pipeline that the cells above run step by step; confirm in the module.
df = cr.prepare_data()
# Build the rating matrix used as model input: the preview shows one row per
# title, one column per userId, and 0.0 where no rating is present.
movie_features_df = cr.create_rating_metrix(df)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"20,000 Leagues Under the Sea",0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
24 Hour Party People,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
28 Days Later,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28 Weeks Later,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# (number of titles, number of users) in the rating matrix.
movie_features_df.shape

(599, 671)

#### Modeling

In [None]:
# Choose the movie to start from and how many recommendations to return.
# movie_index is a positional row index into movie_features_df (it is used with
# .iloc in the modeling cell); recommends_size is the number of neighbours wanted.
movie_index = 1
recommends_size = 5

In [39]:
# Brute-force nearest-neighbour search over the title rows, using cosine distance.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute').fit(movie_features_df)

# Query with the chosen movie's rating row as a single-sample 2-D array. One extra
# neighbour is requested because the chosen movie is itself part of the fitted data.
query_row = movie_features_df.iloc[movie_index].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_row, n_neighbors=recommends_size + 1)

In [40]:
# Cosine distances from the chosen movie to its nearest neighbours; the chosen
# movie itself is expected first with distance 0.
distances

array([[0.        , 0.5386512 , 0.54027957, 0.5432961 , 0.57435095,
        0.5752312 ]], dtype=float32)

In [41]:
# Row positions in movie_features_df of those neighbours, in the same order as
# `distances`.
indices

array([[  1, 264, 152,  50, 509, 457]], dtype=int64)

#### Result

In [42]:
# Ask the project helper for the recommended titles, using the same movie_index
# and recommends_size as the manual query (presumably it runs the equivalent
# k-NN search internally and drops the chosen movie — confirm in the module).
cr.recommend_movies(movie_index, recommends_size)

['Lolita',
 'Donnie Darko',
 'Arlington Road',
 'The Passion of Joan of Arc',
 'The Dark']