# Collaborative Filtering
- (1) ITEM-ITEM (with "USER-ITEM Matrix) 

    show in the detail information page, after user finish one movie...

- (2) USER-USER (with "ITEM-USER Matrix) 

    show in the home page...
    
    
- Reference:
    
    https://github.com/mc6666/AI_Applications
    
    https://ithelp.ithome.com.tw/articles/10219511
    
    
- pros and cons 
    - Memory Based Collaborative Filtering
    - too memory consuming 
    - => try Model Based CF. e.g. KNN or SVD

In [53]:
import pandas as pd 

links = pd.read_csv("ml-latest-small/links.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")
ratings = pd.read_csv("ml-latest-small/ratings.csv")
tags = pd.read_csv("ml-latest-small/tags.csv")

print(links.head(3))
print(movies.head(3))
print(ratings.head(3))
print(tags.head(3))

   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
   movieId                    title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992


In [54]:
# merge movie and rating data
data = pd.merge(movies,ratings)
data.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


# ITEM-ITEM Collaborative Filtering Similarity 
# with "USER-ITEM Matrix"

In [55]:
# USER-ITEM Matrix
pivot_table = data.pivot_table(index = ["userId"], columns = ["title"], values = "rating")
pivot_table.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,1.0,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [56]:
pivot_table.shape

(610, 9719)

In [83]:
movie_watched = pivot_table["Bad Boys (1995)"]
movie_watched

userId
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
      ... 
606    NaN
607    NaN
608    3.5
609    NaN
610    NaN
Name: Bad Boys (1995), Length: 610, dtype: float64

In [77]:
# ITEM-ITEM Collaborative Filtering Similarity
# pick one movie. e.g. movie = "Bad Boys (1995)"
movie_watched = pivot_table["Bad Boys (1995)"]
similarity_with_other_movies = pivot_table.corrwith(movie_watched, axis=0)  # find correlation between "Bad Boys (1995)" and other movies
similarity_with_other_movies = similarity_with_other_movies.sort_values(ascending=False)
similarity_with_other_movies.head()

title
Carnage (2011)                           1.0
Texas Chainsaw Massacre 2, The (1986)    1.0
Hills Have Eyes II, The (2007)           1.0
Mr. 3000 (2004)                          1.0
Blue Jasmine (2013)                      1.0
dtype: float64

# USER-USER Collaborative Filtering Similarity
# with "ITEM-USER Matrix"

In [58]:
# lets make a pivot table in order to make rows are users and columns are movies. And values are rating
uu_pivot_table = data.pivot_table(index =["title"], columns = ["userId"], values = "rating")
uu_pivot_table.head(10)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
'Tis the Season for Love (2015),,,,,,,,,,,...,,,,,,,,,,
"'burbs, The (1989)",,,,,,,,,,,...,,,,,,,,,,
'night Mother (1986),,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer (2009),,,,,,,,,,,...,,,,,,,,,,3.5
*batteries not included (1987),,,,,,,,,,,...,,,,,,,,,,


In [60]:
uu_pivot_table.shape

(9719, 610)

In [81]:
target_user = uu_pivot_table[10]
target_user

title
'71 (2014)                                  NaN
'Hellboy': The Seeds of Creation (2004)     NaN
'Round Midnight (1986)                      NaN
'Salem's Lot (2004)                         NaN
'Til There Was You (1997)                   NaN
                                             ..
eXistenZ (1999)                             NaN
xXx (2002)                                  NaN
xXx: State of the Union (2005)              NaN
¡Three Amigos! (1986)                       NaN
À nous la liberté (Freedom for Us) (1931)   NaN
Name: 10, Length: 9719, dtype: float64

In [82]:
# pick one user. e.g. user = 10 
target_user = uu_pivot_table[10]
uu_similarity_with_other_movies = uu_pivot_table.corrwith(target_user,axis=0)  # find correlation between "Bad Boys (1995)" and other movies
uu_similarity_with_other_movies = uu_similarity_with_other_movies.sort_values(ascending=False)
uu_similarity_with_other_movies.head()

userId
574    1.0
146    1.0
10     1.0
225    1.0
136    1.0
dtype: float64

In [73]:
# top 10 user
uu_similarity_with_other_movies.index[0:10]

Int64Index([574, 146, 10, 225, 136, 568, 315, 56, 162, 179], dtype='int64', name='userId')

# Small Demo for Testing Corr

In [27]:
# small demo test 
# https://blog.csdn.net/w1301100424/article/details/98473560

# df.corr(axis,method):
# axis::The axis to use. 0 or ‘index’ to compute column-wise, 1 or ‘columns’ for row-wise.
# method:: default = "pearson"

    
import pandas as pd
import numpy as np


data = np.array([[5, 5, 3, 3, 4], [3, 4, 5, 5, 4],
                 [3, 4, 3, 4, 5], [5, 5, 3, 4, 4]])

df = pd.DataFrame(data, 
                  columns=['The Shawshank Redemption', 'Forrest Gump', 'Avengers: Endgame','Iron Man', 'Titanic '],
                  index=['user1', 'user2', 'user3', 'user4'])
df

Unnamed: 0,The Shawshank Redemption,Forrest Gump,Avengers: Endgame,Iron Man,Titanic
user1,5,5,3,3,4
user2,3,4,5,5,4
user3,3,4,3,4,5
user4,5,5,3,4,4


In [33]:
# Compute correlation between user1 and other users
user_to_compare = df.iloc[0]
user_to_compare

The Shawshank Redemption    5
Forrest Gump                5
Avengers: Endgame           3
Iron Man                    3
Titanic                     4
Name: user1, dtype: int64

In [46]:
similarity_with_other_users = df.corrwith(user_to_compare, axis=1)
similarity_with_other_users

user1    1.000000
user2   -0.896421
user3    0.000000
user4    0.896421
dtype: float64

In [44]:
similarity_with_other_users = similarity_with_other_users.sort_values(ascending=False)
similarity_with_other_users

user1    1.000000
user4    0.896421
user3    0.000000
user2   -0.896421
dtype: float64

In [47]:
# Compute correlation between 'The Shawshank Redemption' and other movies
movie_to_compare = df['The Shawshank Redemption']
similarity_with_other_movies = df.corrwith(movie_to_compare, axis=0)
similarity_with_other_movies

The Shawshank Redemption    1.000000
Forrest Gump                1.000000
Avengers: Endgame          -0.577350
Iron Man                   -0.707107
Titanic                    -0.577350
dtype: float64

In [48]:
similarity_with_other_movies = similarity_with_other_movies.sort_values(ascending=False)
similarity_with_other_movies

Forrest Gump                1.000000
The Shawshank Redemption    1.000000
Titanic                    -0.577350
Avengers: Endgame          -0.577350
Iron Man                   -0.707107
dtype: float64