In [1]:
import pandas as pd
import numpy as np

In [2]:
# Data set source: http://guidetodatamining.com/chapter2/
# The CSV is decoded with the cp1252 (Windows-1252) codec.
df = pd.read_csv("Movie_Ratings.csv", encoding="cp1252")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Patrick C,Heather,Bryan,Patrick T,Thomas,aaron,vanessa,greg,brian,...,Zak,Matt,Chris.1,Josh,Amy,Valerie,Gary,Stephen,Jessica,Jeff
0,Alien,,,2.0,,5.0,4.0,,,4.0,...,,,4.0,3.0,,,2.0,5.0,,4.0
1,Avatar,4.0,5.0,5.0,4.0,2.0,,4.0,3.0,,...,5.0,,,4.0,3.0,2.0,1.0,4.0,,4.0
2,Blade Runner,5.0,,,,5.0,4.0,,1.0,5.0,...,,,3.0,,3.0,3.0,1.0,,,5.0
3,Braveheart,4.0,,5.0,,4.0,4.0,3.0,4.0,4.0,...,5.0,,4.0,,3.0,4.0,5.0,5.0,,4.0
4,Dodgeball,5.0,4.0,3.0,2.0,4.0,,4.0,5.0,3.0,...,3.0,,3.0,,4.0,3.0,4.0,3.0,,3.0


In [3]:
# The first column (the movie titles) is loaded as 'Unnamed: 0';
# give it a meaningful name.
df = df.rename(columns={'Unnamed: 0': 'Movies'})
df.head()

Unnamed: 0,Movies,Patrick C,Heather,Bryan,Patrick T,Thomas,aaron,vanessa,greg,brian,...,Zak,Matt,Chris.1,Josh,Amy,Valerie,Gary,Stephen,Jessica,Jeff
0,Alien,,,2.0,,5.0,4.0,,,4.0,...,,,4.0,3.0,,,2.0,5.0,,4.0
1,Avatar,4.0,5.0,5.0,4.0,2.0,,4.0,3.0,,...,5.0,,,4.0,3.0,2.0,1.0,4.0,,4.0
2,Blade Runner,5.0,,,,5.0,4.0,,1.0,5.0,...,,,3.0,,3.0,3.0,1.0,,,5.0
3,Braveheart,4.0,,5.0,,4.0,4.0,3.0,4.0,4.0,...,5.0,,4.0,,3.0,4.0,5.0,5.0,,4.0
4,Dodgeball,5.0,4.0,3.0,2.0,4.0,,4.0,5.0,3.0,...,3.0,,3.0,,4.0,3.0,4.0,3.0,,3.0


In [4]:
# number of movies (rows) and number of columns
# 25 movies, 26 columns: the 'Movies' column plus 25 users
len(df['Movies']), len(df.columns)

(25, 26)

In [5]:
# Rank the users by how many movies each of them has rated (most active first).
user_rated_count = {}
for col in df.columns[1:]:
    # count() ignores NaN, i.e. the movies this user has not rated
    user_rated_count[col] = int(df[col].count())
freq_user = sorted(user_rated_count, key=user_rated_count.get, reverse=True)
for usr in freq_user:
    # a single pre-formatted argument prints identically on Python 2 and Python 3
    print("%s %d" % (usr, user_rated_count[usr]))

Chris 24
brian 23
Valerie 23
Bryan 22
Zwe 22
Thomas 22
Jeff 22
Chris.1 22
Gary 21
ben 20
Zak 19
Patrick C 18
Stephen 18
Jonathan 18
greg 18
Amy 17
Patrick T 15
Heather 15
vanessa 15
aaron 14
Jessica 12
Katherine 12
Erin 12
Josh 11
Matt 6


In [6]:
# Which movies have been rated by every user (rows without any NaN)?
# Toy Story is the only one.
df.shape[0], df.dropna(how='any').shape[0], df.dropna(how='any')

(25,
 1,
        Movies  Patrick C  Heather  Bryan  Patrick T  Thomas  aaron  vanessa  \
 22  Toy Story        4.0      3.0    3.0        4.0     4.0    5.0      4.0   
 
     greg  brian  ...   Zak  Matt  Chris.1  Josh  Amy  Valerie  Gary  Stephen  \
 22   5.0    5.0  ...   4.0   4.0      4.0   4.0  4.0      5.0   4.0      4.0   
 
     Jessica  Jeff  
 22      5.0   5.0  
 
 [1 rows x 26 columns])

In [7]:
def correlation_methods(data, m_name, weightage, min_periods=1):
    """Return the pairs of columns whose correlation is at least `weightage`.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per user. Non-numeric columns (e.g. the movie titles)
        are ignored.
    m_name : str or callable
        Correlation method passed to ``DataFrame.corr``
        ('pearson', 'kendall' or 'spearman').
    weightage : float
        Minimum similarity score a pair must reach to be returned.
    min_periods : int, optional
        Minimum number of co-rated rows a pair needs to get a score at all.
        The default of 1 is pandas' own default; raise it to filter out
        scores that are based on only a handful of common ratings.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_1``, ``user_2`` and ``similarity_score``, one row per
        unordered pair, sorted by ``similarity_score`` in descending order.
    """
    # correlate the numeric columns only; recent pandas versions raise when
    # corr() is handed a string column instead of silently skipping it
    numeric = data.select_dtypes(include=[np.number])
    data_corr = numeric.corr(method=m_name, min_periods=min_periods)

    # mask the lower triangle and the diagonal with NaN: the matrix is
    # symmetric, so these entries are repeats (or self-correlations).
    # The builtin bool is used because the np.bool alias no longer exists.
    lower_and_diagonal = np.tril(np.ones(data_corr.shape, dtype=bool))
    data_corr = data_corr.mask(lower_and_diagonal)

    # reshape to one row per user pair and keep the scores above the threshold;
    # the explicit dropna() covers pandas versions whose stack() keeps NaN
    pairs = data_corr.where(data_corr >= weightage).stack().dropna().reset_index()
    # assign names by position so the result does not depend on the
    # auto-generated 'level_0' / 'level_1' labels
    pairs.columns = ['user_1', 'user_2', 'similarity_score']
    return pairs.sort_values('similarity_score', ascending=False).reset_index(drop=True)

In [8]:
# user pairs with a Pearson correlation of at least 0.5 (top 20)
correlation_methods(data=df, m_name='pearson', weightage=0.5).head(n=20)

Unnamed: 0,user_1,user_2,similarity_score
0,ben,Matt,1.0
1,greg,Matt,0.9759
2,aaron,Erin,0.942881
3,vanessa,Matt,0.942809
4,Amy,Jessica,0.923077
5,Jonathan,Amy,0.870879
6,Katherine,Jessica,0.866537
7,Matt,Jeff,0.821429
8,Zwe,Matt,0.821429
9,Matt,Amy,0.816497


In [9]:
# user pairs with a Kendall correlation of at least 0.5 (top 20)
correlation_methods(data=df, m_name='kendall', weightage=0.5).head(n=20)

Unnamed: 0,user_1,user_2,similarity_score
0,ben,Matt,1.0
1,greg,Matt,1.0
2,aaron,Erin,0.914659
3,Amy,Jessica,0.893198
4,Matt,Amy,0.875
5,Jonathan,Amy,0.830199
6,Matt,Jeff,0.800641
7,Zwe,Matt,0.800641
8,Katherine,Jessica,0.797081
9,vanessa,Matt,0.774597


In [10]:
# user pairs with a Spearman correlation of at least 0.5 (top 20)
correlation_methods(data=df, m_name='spearman', weightage=0.5).head(n=20)

Unnamed: 0,user_1,user_2,similarity_score
0,greg,Matt,1.0
1,ben,Matt,1.0
2,aaron,Erin,0.942482
3,Amy,Jessica,0.941876
4,Matt,Amy,0.916667
5,Jonathan,Amy,0.886298
6,Katherine,Jessica,0.862156
7,Zwe,Matt,0.86164
8,Matt,Jeff,0.86164
9,vanessa,Matt,0.816497


In [11]:
# Bryan and Zak are treated as similar users (based on the correlation scores),
# so Bryan's ratings are used to recommend movies to Zak: keep only the movies
# that Zak has not rated yet.
df_recommend = (
    df.loc[:, ['Movies', 'Bryan', 'Zak']]
    .pivot_table(index='Movies')
    .loc[lambda ratings: ratings['Zak'].isnull()]
)
df_recommend

Unnamed: 0_level_0,Bryan,Zak
Movies,Unnamed: 1_level_1,Unnamed: 2_level_1
Alien,2.0,
Blade Runner,,
Jaws,4.0,
Pootie Tang,1.0,
Shawshank Redemption,5.0,
Snakes on a Plane,2.0,


In [12]:
# Jaws and Shawshank Redemption can be suggested to Zak, since Bryan gave these two movies good ratings.
# The other movies Bryan has rated (Alien, Pootie Tang, Snakes on a Plane) can also be suggested, although he rated them lower.