# Import Libraries

In [1]:
import numpy as np
import pandas as pd

In [15]:
#load our dataset
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('u.data', sep = '\t', names = r_cols, usecols = range(3))

m_cols = ['movie_id', 'title']
movies = pd.read_csv('u.item', sep = '|', names = m_cols, usecols = range(2), encoding='latin-1')

ratings = pd.merge(movies, ratings)

In [16]:
ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [23]:
#transforms our ratings table to a new table that contains movie titles as columns for every user_id with their values
movieRatings = ratings.pivot_table(index = ['user_id'], columns = ['title'], values = 'rating')

In [27]:
movieRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [33]:
#we see here the users that watched 12 angry men 1957
angryMen = movieRatings['12 Angry Men (1957)']
angryMen.head()

user_id
1    5.0
2    NaN
3    NaN
4    NaN
5    NaN
Name: 12 Angry Men (1957), dtype: float64

In [34]:
#find the correlations between all the movies in the entire dataset with angry men
similarMovies = movieRatings.corrwith(angryMen)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


In [35]:
similarMovies

title
'Til There Was You (1997)               -0.500000
1-900 (1994)                                  NaN
101 Dalmatians (1996)                   -0.049890
12 Angry Men (1957)                      1.000000
187 (1997)                               0.666667
                                           ...   
Young Guns II (1990)                    -0.361961
Young Poisoner's Handbook, The (1995)    0.144338
Zeus and Roxanne (1997)                  1.000000
unknown                                  1.000000
Á köldum klaka (Cold Fever) (1994)            NaN
Length: 1664, dtype: float64

In [37]:
#drop missing values
similarMovies = similarMovies.dropna()

In [41]:
#put our results in a dataframe
df = pd.DataFrame(similarMovies)

In [42]:
df

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
'Til There Was You (1997),-0.500000
101 Dalmatians (1996),-0.049890
12 Angry Men (1957),1.000000
187 (1997),0.666667
2 Days in the Valley (1996),0.256625
...,...
Young Guns (1988),0.068944
Young Guns II (1990),-0.361961
"Young Poisoner's Handbook, The (1995)",0.144338
Zeus and Roxanne (1997),1.000000


In [45]:
#movies similar to 12 angry men
similarMovies.sort_values(ascending = False)

title
Spirits of the Dead (Tre passi nel delirio) (1968)    1.0
Jefferson in Paris (1995)                             1.0
Double Happiness (1994)                               1.0
Nina Takes a Lover (1994)                             1.0
Dream With the Fishes (1997)                          1.0
                                                     ... 
Bio-Dome (1996)                                      -1.0
House Arrest (1996)                                  -1.0
Hellraiser: Bloodline (1996)                         -1.0
For Richer or Poorer (1997)                          -1.0
Heaven's Prisoners (1996)                            -1.0
Length: 1188, dtype: float64

In [48]:
#check the statistics of our our original ratings data frame
movieStats = ratings.groupby('title').agg({'rating': [np.size, np.mean]})
movieStats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


In [51]:
#lets get rid of movies that were rated by 100 less people and sort by rating
popularMovies = movieStats['rating']['size'] >= 100
movieStats[popularMovies].sort_values([('rating', 'mean')], ascending = False)[:15]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.45679
"Shawshank Redemption, The (1994)",283,4.44523
Rear Window (1954),209,4.38756
"Usual Suspects, The (1995)",267,4.385768
Star Wars (1977),583,4.358491
12 Angry Men (1957),125,4.344
Citizen Kane (1941),198,4.292929


In [52]:
#lets go ahead and make our new dataframe of movies similar to angry men based on this new dataframe
df = movieStats[popularMovies].join(pd.DataFrame(similarMovies, columns = ['similarity']))



In [53]:
df.head()

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101 Dalmatians (1996),109,2.908257,-0.04989
12 Angry Men (1957),125,4.344,1.0
2001: A Space Odyssey (1968),259,3.969112,0.178848
Absolute Power (1997),127,3.370079,-0.163455
"Abyss, The (1989)",151,3.589404,0.045374


In [55]:
df.sort_values(['similarity'], ascending = False)[:15]

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12 Angry Men (1957),125,4.344,1.0
Ulee's Gold (1997),184,3.76087,0.619544
Rear Window (1954),209,4.38756,0.570513
Seven Years in Tibet (1997),155,3.458065,0.549939
Clerks (1994),148,3.804054,0.528173
Singin' in the Rain (1952),137,3.992701,0.520844
Vertigo (1958),179,4.251397,0.50946
Army of Darkness (1993),116,3.431034,0.477251
Mr. Smith Goes to Washington (1939),105,4.12381,0.470078
"First Wives Club, The (1996)",160,3.01875,0.466773


# Item Based Collaborative Filtering

In [59]:
#now lets build a full blown movies recommender system using item based collaborative filtering
ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [60]:
movieRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [66]:
userRatings = movieRatings

In [67]:
#find the correlations between all the movies in the entire dataset with any movie pair
corrMatrix = userRatings.corr()

In [65]:
corrMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,,-1.0,-0.5,-0.5,0.522233,,-0.426401,,,...,,,,,,,,,,
1-900 (1994),,1.0,,,,,,-0.981981,,,...,,,,-0.944911,,,,,,
101 Dalmatians (1996),-1.0,,1.0,-0.04989,0.269191,0.048973,0.266928,-0.043407,,0.111111,...,,-1.0,,0.15884,0.119234,0.680414,0.0,0.707107,,
12 Angry Men (1957),-0.5,,-0.04989,1.0,0.666667,0.256625,0.274772,0.178848,,0.457176,...,,,,0.096546,0.068944,-0.361961,0.144338,1.0,1.0,
187 (1997),-0.5,,0.269191,0.666667,1.0,0.596644,,-0.5547,,1.0,...,,0.866025,,0.455233,-0.5,0.5,0.475327,,,


In [70]:
#lets remove movie similarities that less than 100 people rated both of those movies
corrMatrix = userRatings.corr(method = 'pearson', min_periods = 100)

In [71]:
corrMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,,1.0,,,,,,,,...,,,,,,,,,,
12 Angry Men (1957),,,,1.0,,,,,,,...,,,,,,,,,,
187 (1997),,,,,,,,,,,...,,,,,,,,,,


In [75]:
#Now lets create a fake person to create reommendations for
myRatings = userRatings.iloc[0].dropna()

In [76]:
myRatings

title
101 Dalmatians (1996)                  2.0
12 Angry Men (1957)                    5.0
20,000 Leagues Under the Sea (1954)    3.0
2001: A Space Odyssey (1968)           4.0
Abyss, The (1989)                      3.0
                                      ... 
Wizard of Oz, The (1939)               4.0
Wrong Trousers, The (1993)             5.0
Young Frankenstein (1974)              5.0
Young Guns (1988)                      3.0
unknown                                4.0
Name: 1, Length: 271, dtype: float64

In [86]:
#lets go through every movie this person rated
simCandidates = pd.Series()
for i in range(0, len(myRatings.index)):
    print('Adding sims for ' + myRatings.index[i] + '...')
    #retrieve similar movies to this one that i rated
    sims = corrMatrix[myRatings.index[i]].dropna()
    #now scale its similarity by how well the user rated this movie and gives strenght to movies similar to the ones the user liked and less strenght to movies similar to the ones the user did not like
    sims = sims.map(lambda x: x * myRatings[i])
    #add the score to the list of similarity candidates
    simCandidates = simCandidates.append(sims)

#glance at our results so far:
print('sorting...')
simCandidates.sort_values(inplace = True, ascending = False)
print(simCandidates)

  


Adding sims for 101 Dalmatians (1996)...
Adding sims for 12 Angry Men (1957)...
Adding sims for 20,000 Leagues Under the Sea (1954)...
Adding sims for 2001: A Space Odyssey (1968)...
Adding sims for Abyss, The (1989)...
Adding sims for Ace Ventura: Pet Detective (1994)...
Adding sims for Air Bud (1997)...
Adding sims for Akira (1988)...
Adding sims for Aladdin (1992)...
Adding sims for Alien (1979)...
Adding sims for Aliens (1986)...
Adding sims for All Dogs Go to Heaven 2 (1996)...
Adding sims for Amadeus (1984)...
Adding sims for Angels and Insects (1995)...
Adding sims for Antonia's Line (1995)...
Adding sims for Apocalypse Now (1979)...
Adding sims for Apollo 13 (1995)...
Adding sims for Aristocats, The (1970)...
Adding sims for Army of Darkness (1993)...
Adding sims for Austin Powers: International Man of Mystery (1997)...
Adding sims for Babe (1995)...
Adding sims for Back to the Future (1985)...
Adding sims for Bad Boys (1995)...
Adding sims for Basic Instinct (1992)...
Adding s

Adding sims for What's Eating Gilbert Grape (1993)...
Adding sims for When Harry Met Sally... (1989)...
Adding sims for When the Cats Away (Chacun cherche son chat) (1996)...
Adding sims for While You Were Sleeping (1995)...
Adding sims for White Balloon, The (1995)...
Adding sims for Willy Wonka and the Chocolate Factory (1971)...
Adding sims for Wizard of Oz, The (1939)...
Adding sims for Wrong Trousers, The (1993)...
Adding sims for Young Frankenstein (1974)...
Adding sims for Young Guns (1988)...
Adding sims for unknown...
sorting...
Young Frankenstein (1974)                 5.000000
Monty Python and the Holy Grail (1974)    5.000000
Mars Attacks! (1996)                      5.000000
Lone Star (1996)                          5.000000
Kolya (1996)                              5.000000
                                            ...   
First Wives Club, The (1996)             -0.972480
Clockwork Orange, A (1971)               -1.060942
Annie Hall (1977)                        -1.0901

In [82]:
#we are getting some duplicate values because if we have movies that are similar to more than
#one movie the user rated it will come back more than one
#so we add the duplicates together these duplicates to have a higher correlation score
#simCandidates = simCandidates.groupby(simCandidates.index).sum()

In [91]:
simCandidates.sort_values(inplace = True, ascending = False)
simCandidates.head(100)

Young Frankenstein (1974)         5.0
Professional, The (1994)          5.0
Terminator, The (1984)            5.0
Princess Bride, The (1987)        5.0
When Harry Met Sally... (1989)    5.0
                                 ... 
Raging Bull (1980)                4.0
Quiz Show (1994)                  4.0
Pulp Fiction (1994)               4.0
Reservoir Dogs (1992)             4.0
Firm, The (1993)                  4.0
Length: 100, dtype: float64

In [93]:
#there you have it!