In [None]:
import pandas as pd
import numpy as np

In [None]:
# Source data: http://guidetodatamining.com/chapter2/
# The CSV is decoded as cp1252, as requested via the ``encoding`` argument.
df = pd.read_csv(
    "Movie_Ratings.csv",
    encoding="cp1252",
)
# Preview the first five rows.
df.head(5)

In [None]:
# The first column is labelled 'Unnamed: 0' after loading; give it the
# descriptive name 'Movies' so later cells can refer to it.
df = df.rename(
    columns={'Unnamed: 0': 'Movies'},
)
# Preview the first five rows with the new label.
df.head()

In [None]:
# Number of movies (rows) and number of users (rating columns).
# The frame has one column per user plus the 'Movies' label column, so the
# label column is subtracted from the column count.
# 25 movies, 25 users (26 columns including 'Movies')
df.shape[0], df.shape[1] - 1

In [None]:
# Rank users by how many movies they have rated, most active first.
# ``count()`` tallies the non-null entries of a column, i.e. the ratings the
# user actually gave; the first column ('Movies') is skipped.
user_rated_count = {col: int(df[col].count()) for col in df.columns[1:]}
freq_user = sorted(user_rated_count, key=user_rated_count.get, reverse=True)
for usr in freq_user:
    # Printing one pre-formatted string yields the same "name count" line on
    # both Python 2 and Python 3 (the bare print statement is Python 2 only).
    print('{} {}'.format(usr, user_rated_count[usr]))

In [None]:
# Movies rated by every user are the rows without any missing value.
# Shown: total number of rows, number of complete rows, and the complete rows.
# (Original note: Toy Story is the only movie rated by all users.)
df.shape[0], df.dropna().shape[0], df.dropna()

In [None]:
def correlation_methods(data, m_name, weightage):
    """Rank pairs of columns of ``data`` by their pairwise correlation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are correlated with each other.
        Non-numeric columns (such as a text label column) are ignored.
    m_name : str or callable
        Forwarded as ``method`` to ``DataFrame.corr`` (this notebook uses
        'pearson', 'kendall' and 'spearman').
    weightage : float
        Minimum correlation, inclusive, a pair must reach to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_1``, ``user_2`` and ``similarity_score``; one row per
        unordered pair of distinct columns, sorted by score in descending
        order with a fresh 0..n-1 index.
    """
    # Correlate numeric columns only. Recent pandas raises on string columns
    # instead of silently skipping them, so the selection is made explicit.
    numeric = data.select_dtypes(include=['number', 'bool'])
    data_corr = numeric.corr(method=m_name)

    # Mask the lower triangle and the diagonal with NaN: the matrix is
    # symmetric, so those cells only repeat pairs (or pair a column with
    # itself). The builtin ``bool`` replaces the removed ``np.bool`` alias.
    lower_triangle = np.tril(np.ones(data_corr.shape)).astype(bool)
    data_corr = data_corr.mask(lower_triangle)

    # Reshape to one row per pair and keep scores at or above the threshold.
    # ``dropna`` is explicit so the result does not depend on whether the
    # installed pandas ``stack`` implementation discards NaN on its own.
    pairs = data_corr[data_corr >= weightage].stack().dropna()

    # Name the two index levels and the values directly rather than relying
    # on the auto-generated 'level_0' / 'level_1' / 0 labels.
    pairs = (
        pairs.rename_axis(['user_1', 'user_2'])
        .rename('similarity_score')
        .reset_index()
    )
    return pairs.sort_values('similarity_score', ascending=False).reset_index(drop=True)

In [None]:
# 20 highest-scoring user pairs whose Pearson correlation is at least 0.5.
correlation_methods(data=df, m_name='pearson', weightage=0.5).head(n=20)

In [None]:
# 20 highest-scoring user pairs whose Kendall correlation is at least 0.5.
correlation_methods(data=df, m_name='kendall', weightage=0.5).head(n=20)

In [None]:
# 20 highest-scoring user pairs whose Spearman correlation is at least 0.5.
correlation_methods(data=df, m_name='spearman', weightage=0.5).head(n=20)

In [None]:
# The correlation scores above show that Bryan and Zak rate movies similarly,
# so Bryan's ratings are used to suggest movies for Zak to watch and rate.
# Steps: keep only the two users' columns, index the ratings by movie via
# pivot_table, then keep the movies for which Zak has no rating.
df_recommend = (
    df.loc[:, ['Movies', 'Bryan', 'Zak']]
    .pivot_table(index='Movies')
    .loc[lambda ratings: ratings['Zak'].isnull()]
)
df_recommend

In [None]:
# Zak can be recommended Jaws and Shawshank Redemption (these two were picked because they have good ratings).
# The other movies in the list (Alien, Pootie Tang, Snakes on a Plane) can also be suggested.