## Item-item collaborative filtering
Using the MovieLens dataset, we use pandas and numpy to build an item-item collaborative filter and discover which movies are most closely related to "Star Wars (1977)".

In [1]:
# needed
import pandas as pd
import numpy as np

In [2]:
# Load the two MovieLens files this notebook needs.
# u.data is tab-separated; only its first three columns are read.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, usecols=range(3))

# u.item is pipe-separated; only its first two columns are read.
# The file is not UTF-8: without an explicit encoding, titles containing
# non-ASCII characters come out garbled, so decode it as ISO-8859-1 (Latin-1).
m_cols = ['movie_id', 'title']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(2),
                     encoding='latin-1')

# Attach the title to every rating. 'movie_id' is the only column the two
# frames share; naming it explicitly keeps the join key from changing silently
# if either frame ever gains columns.
ratings = pd.merge(movies, ratings, on='movie_id')

In [3]:
# Preview the first five rows of the merged titles/ratings frame.
ratings.head(n=5)

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [4]:
# Reshape into a user x movie matrix: one row per user_id, one column per
# title, each cell holding that user's rating (NaN where the user did not rate
# the movie). pivot_table averages any ratings that share a (user, title) pair.
# Having every movie as a column is what lets us apply the item-item method.
movieRatings = ratings.pivot_table(values='rating', index='user_id', columns='title')
movieRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,� k�ldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [5]:
# Pull out the Star Wars column: one rating per user, NaN for users who did
# not rate it.
starWarsRatings = movieRatings.loc[:, 'Star Wars (1977)']
starWarsRatings.head()

user_id
1    5.0
2    5.0
3    NaN
4    5.0
5    4.0
Name: Star Wars (1977), dtype: float64

In [6]:
# Correlate every movie column with the Star Wars ratings (DataFrame.corrwith)
# and discard the movies whose correlation is undefined (NaN).
similarMovies = movieRatings.corrwith(starWarsRatings).dropna()

# Wrap the result in a one-column frame just to preview it as a table.
df = pd.DataFrame(similarMovies)
df.head()

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
'Til There Was You (1997),0.872872
1-900 (1994),-0.645497
101 Dalmatians (1996),0.211132
12 Angry Men (1957),0.184289
187 (1997),0.027398


In [7]:
# Rank every movie by its correlation with Star Wars, highest first.
# The result is not satisfying yet: the extremes are exactly +1.0 / -1.0 for
# "peculiar" movies (presumably ones with very few ratings in common with
# Star Wars), so a threshold on the number of ratings is needed.
(
    similarMovies
    .sort_values(ascending=False)
)

title
Hollow Reed (1996)                        1.0
Commandments (1997)                       1.0
Cosi (1996)                               1.0
No Escape (1994)                          1.0
Stripes (1981)                            1.0
                                         ... 
Roseanna's Grave (For Roseanna) (1997)   -1.0
For Ever Mozart (1996)                   -1.0
American Dream (1990)                    -1.0
Frankie Starlight (1995)                 -1.0
Fille seule, La (A Single Girl) (1995)   -1.0
Length: 1410, dtype: float64

In [8]:
# Per-title statistics: how many ratings each movie received (r_size) and the
# mean of those ratings (r_mean), rounded to three decimals.
# The aggregations are given by name ('size', 'mean') rather than as NumPy
# callables: recent pandas versions deprecate passing np.size / np.mean to
# agg, and the string names make the columns renamed below explicit.
movieStats = (
    ratings.groupby('title')['rating']
    .agg(['size', 'mean'])
    .rename(columns={'size': 'r_size', 'mean': 'r_mean'})
)
# Bracket assignment instead of attribute assignment: it always targets the
# column and cannot be confused with setting an attribute on the frame.
movieStats['r_mean'] = movieStats['r_mean'].round(3)
movieStats.head()

Unnamed: 0_level_0,r_size,r_mean
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),9,2.333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908
12 Angry Men (1957),125,4.344
187 (1997),41,3.024


In [9]:
# Keep only movies with at least MIN_RATINGS ratings (the bound is inclusive),
# then show the ten with the highest mean rating.
MIN_RATINGS = 100
popularMovies = movieStats.loc[movieStats['r_size'] >= MIN_RATINGS]
popularMovies.sort_values(by='r_mean', ascending=False).head(10)


Unnamed: 0_level_0,r_size,r_mean
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",112,4.491
Schindler's List (1993),298,4.466
"Wrong Trousers, The (1993)",118,4.466
Casablanca (1942),243,4.457
"Shawshank Redemption, The (1994)",283,4.445
Rear Window (1954),209,4.388
"Usual Suspects, The (1995)",267,4.386
Star Wars (1977),583,4.358
12 Angry Men (1957),125,4.344
Citizen Kane (1941),198,4.293


In [10]:
# Attach each popular movie's correlation with Star Wars as a 'similarity'
# column (join on the title index; popular movies are those with at least 100
# ratings), then turn the title index back into a regular column.
df = popularMovies.join(similarMovies.to_frame(name='similarity')).reset_index()
df.head()

Unnamed: 0,title,r_size,r_mean,similarity
0,101 Dalmatians (1996),109,2.908,0.211132
1,12 Angry Men (1957),125,4.344,0.184289
2,2001: A Space Odyssey (1968),259,3.969,0.230884
3,Absolute Power (1997),127,3.37,0.08544
4,"Abyss, The (1989)",151,3.589,0.203709


In [11]:
# Finally, a more believable ranking: the ten popular movies most correlated
# with Star Wars. The first row is Star Wars itself, which correlates 1.0 with
# its own ratings.
(
    df
    .sort_values(by='similarity', ascending=False)
    .head(10)
)

Unnamed: 0,title,r_size,r_mean,similarity
295,Star Wars (1977),583,4.358,1.0
99,"Empire Strikes Back, The (1980)",367,4.204,0.747981
255,Return of the Jedi (1983),507,4.008,0.672556
247,Raiders of the Lost Ark (1981),420,4.252,0.536117
24,Austin Powers: International Man of Mystery (1...,130,3.246,0.377433
298,"Sting, The (1973)",241,4.058,0.367538
162,Indiana Jones and the Last Crusade (1989),331,3.931,0.350107
235,Pinocchio (1940),101,3.673,0.347868
119,"Frighteners, The (1996)",115,3.235,0.332729
176,L.A. Confidential (1997),297,4.162,0.319065
