In [1]:
import numpy as np
import pandas as pd
from math import sqrt
import operator


In [2]:
# Load the user-movie ratings table and preview its first ten rows.
ratings = pd.read_csv('./ratings.csv')
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [3]:
# Load the movie catalogue (id, title, pipe-separated genres) and preview ten rows.
movies = pd.read_csv('./movies.csv')
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# Load the free-text tags users attached to movies and preview ten rows.
tags = pd.read_csv('./tags.csv')
tags.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [5]:
# (rows, columns) of the movie catalogue.
movies.shape

(9742, 3)

Let's create a matrix where the rows are user IDs and the columns are movie IDs.


In [6]:
# Empty frame (no rows yet) with one column per movie id.
users_ratings_df = pd.DataFrame(columns=movies['movieId'])
users_ratings_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609


In [7]:
# Re-display the first ten ratings rows for reference before merging with movies.
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [8]:
# Attach every rating to its movie's title and genres, joining on the shared movieId column.
movies_ratings = movies.merge(ratings, on='movieId')
movies_ratings.head(10)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18,3.5,1455209816
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,4.0,965705637
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21,3.5,1407618878
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27,3.0,962685262
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31,5.0,850466616


In [9]:
# Getting the rating given by a user to a movie.
def get_rating_(userid, movieid):
    """Return the rating `userid` gave to `movieid`, or NaN when there is none.

    Reads the module-level `ratings` frame (columns `userId`, `movieId`,
    `rating`). If several rows match, the first one is returned.
    """
    matches = ratings.loc[(ratings.userId == userid) & (ratings.movieId == movieid), 'rating']
    # "No such rating" is the only expected miss, so test for it explicitly rather
    # than with a bare `except`, which also hid unrelated errors. `np.nan` is used
    # because the `np.NaN` alias no longer exists in NumPy 2.x.
    if matches.empty:
        return np.nan
    return matches.iloc[0]
    
def get_movieids_(userid):
    """List the ids of every movie `userid` has rated, in `ratings` row order."""
    rated_by_user = ratings[ratings['userId'] == userid]
    return rated_by_user['movieId'].tolist()

# Getting the movie titles against the movie id.
def get_movie_title_(movieid):
    """Look up the title of `movieid` in `movies`; the first matching row wins."""
    matching_titles = movies.loc[movies['movieId'] == movieid, 'title']
    return matching_titles.iloc[0]

In [10]:
# Look up user 1's rating for movie 5; the recorded output is nan, i.e. no such rating was found.
get_rating_(1, 5)

nan

In [11]:
# Title stored for movie id 3.
get_movie_title_(3)

'Grumpier Old Men (1995)'

In [12]:
def distance_similarity_score(user1,user2):
    '''
    Euclidean-distance based similarity between two users.

    user1 & user2 : user ids of two users between which similarity score is to be calculated.

    Returns 0 when the users have no rated movie in common; otherwise
    1 / (1 + sqrt(sum of squared rating differences over the common movies)),
    so identical ratings give 1 and larger disagreement gives values closer to 0.
    '''
    # Filter `ratings` once per user; the previous version re-filtered the whole
    # table for every movie of user1 and again for every rating lookup.
    user1_rows = ratings.loc[ratings.userId == user1, ['movieId', 'rating']]
    user2_rows = ratings.loc[ratings.userId == user2, ['movieId', 'rating']]

    # movieId -> rating, keeping the first row per movie (same rule as get_rating_).
    user1_rating = user1_rows.drop_duplicates('movieId').set_index('movieId')['rating'].to_dict()
    user2_rating = user2_rows.drop_duplicates('movieId').set_index('movieId')['rating'].to_dict()

    # Movies watched by both the users, in user1's row order.
    common_movies = [movie for movie in user1_rows['movieId'].tolist() if movie in user2_rating]
    if not common_movies:
        return 0

    # Calculating distance based similarity between both the users.
    total_distance = sum(
        pow(user1_rating[movie] - user2_rating[movie], 2) for movie in common_movies
    )

    # Adding one to the denominator to avoid divide by zero error.
    return 1/(1+sqrt(total_distance))

In [13]:
# Example: distance-based similarity between users 1 and 310.
score_1_310 = distance_similarity_score(1, 310)
print('Distance based similarity between user ids 1 & 310: {}'.format(score_1_310))

Distance based similarity between user ids 1 & 310: 0.2025519956555797


In [14]:
def pearson_correlation_score(user1,user2):
    '''
    Pearson correlation between the ratings two users gave to the movies both rated.

    user1 & user2 : user ids of two users between which similarity score is to be calculated.

    Returns 0 when the users have no common movies, or when the correlation is
    undefined because one user's ratings on the common movies do not vary.
    '''
    # Filter `ratings` once per user; the previous version re-filtered the whole
    # table for every movie of user1 and again for each of its many rating lookups.
    user1_rows = ratings.loc[ratings.userId == user1, ['movieId', 'rating']]
    user2_rows = ratings.loc[ratings.userId == user2, ['movieId', 'rating']]

    # movieId -> rating, keeping the first row per movie (same rule as get_rating_).
    user1_rating = user1_rows.drop_duplicates('movieId').set_index('movieId')['rating'].to_dict()
    user2_rating = user2_rows.drop_duplicates('movieId').set_index('movieId')['rating'].to_dict()

    # Finding movies watched by both the users, in user1's row order.
    common_movies = [movie for movie in user1_rows['movieId'].tolist() if movie in user2_rating]

    # Returning '0' correlation for no common movies.
    if not common_movies:
        return 0

    common_count = len(common_movies)
    ratings_1 = [user1_rating[movie] for movie in common_movies]
    ratings_2 = [user2_rating[movie] for movie in common_movies]

    # Sums needed by the computational form of the Pearson formula.
    rating_sum_1 = sum(ratings_1)
    rating_sum_2 = sum(ratings_2)
    rating_squared_sum_1 = sum(pow(value, 2) for value in ratings_1)
    rating_squared_sum_2 = sum(pow(value, 2) for value in ratings_2)
    product_sum_rating = sum(a * b for a, b in zip(ratings_1, ratings_2))

    numerator = product_sum_rating - ((rating_sum_1 * rating_sum_2) / common_count)
    variance_product = (
        (rating_squared_sum_1 - pow(rating_sum_1, 2) / common_count)
        * (rating_squared_sum_2 - pow(rating_sum_2, 2) / common_count)
    )

    # Handling 'Divide by Zero' error. `<=` also covers a product pushed just
    # below zero by floating-point rounding, where sqrt would raise ValueError.
    if variance_product <= 0:
        return 0
    return numerator / sqrt(variance_product)

In [15]:
# Correlation between users 1 and 2. The recorded output is 0, which the function
# returns when the users share no movies or the denominator is zero.
pearson_correlation_score(1, 2)

0

In [16]:
def get_all_user_ids():
    """Return the distinct user ids found in `ratings`, in order of first appearance."""
    user_column = ratings['userId']
    return user_column.unique()

# 1. Find users most similar to given user

def find_most_similar_users(target_user_id, limit = 10, max_candidates = 10):
    '''Finds the users most similar to target user.

    target_user_id : user whose nearest neighbours are wanted.
    limit          : maximum number of user ids to return.
    max_candidates : how many ids from get_all_user_ids(), taken in its order,
                     are scored. The default of 10 keeps the behaviour this
                     function always had; pass None to score every user. Each
                     candidate costs one pearson_correlation_score call.

    Returns a numpy array of user ids ordered by descending Pearson correlation
    with the target. The target itself is never included, and the array is
    empty when there is no candidate to score.
    '''
    candidate_ids = get_all_user_ids()
    if max_candidates is not None:
        candidate_ids = candidate_ids[:max_candidates]

    coefficients = {
        uid: pearson_correlation_score(target_user_id, uid)
        for uid in candidate_ids
        if uid != target_user_id
    }

    # Sort coefficients, best match first; ties keep their candidate order.
    ranked = sorted(coefficients.items(), key=operator.itemgetter(1), reverse=True)

    # Keep only the ids. Building the array from the ids alone keeps them integers
    # (mixing them with float scores in one array turned them into floats) and
    # yields an empty array, not an IndexError, when nothing was scored.
    return np.array([uid for uid, _ in ranked[:limit]])

In [17]:
def recommend_movie(user_id):
    """Return ids of movies rated by the users most similar to `user_id`
    that `user_id` has not rated, as an unranked numpy array."""
    neighbour_ids = find_most_similar_users(user_id)

    # Pool every movie any neighbour rated; duplicates collapse in the set below.
    neighbour_movies = [
        movie_id
        for neighbour in neighbour_ids
        for movie_id in get_movieids_(neighbour)
    ]

    # Movies the target user has already rated are not recommended again.
    already_rated = get_movieids_(user_id)
    unseen = set(neighbour_movies) - set(already_rated)

    return np.array(list(unseen))

In [18]:
# Candidate recommendations for user 1: movies rated by similar users that user 1 has not rated.
recommended_movies = recommend_movie(1)

In [19]:
# Show the titles of the first ten recommended movie ids.
for movie_id in recommended_movies[:10]:
    title = get_movie_title_(movie_id)
    print(title)

Jumanji (1995)
Mad Max: Fury Road (2015)
Waiting to Exhale (1995)
Father of the Bride Part II (1995)
Sabrina (1995)
Tom and Huck (1995)
GoldenEye (1995)
American President, The (1995)
How to Lose a Guy in 10 Days (2003)
Balto (1995)


In [20]:
# Let's split the data into train and test sets, then try building a model per user and see how that works out.
# We will do this for the user with ID 1.
# The idea is to build a matrix describing what the user prefers: the columns are the tags of the movies, and the
# values are 1 or 0 (whether the user has a preference for that tag or not).
# This approach is called a "content-based recommender system".

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out part of user 1's ratings for evaluation. Fixing random_state makes the
# shuffle, and therefore both splits, identical on every run of the notebook.
train, test = train_test_split(ratings.loc[ratings.userId == 1], random_state=42)

In [23]:
# (rows, columns) of the training split.
train.shape

(174, 4)

In [24]:
# (rows, columns) of the held-out test split.
test.shape

(58, 4)

In [25]:
# Movie ids side by side with the genres split into one column per position.
# Both pieces must sit inside the list given to concat; with the bracket closed
# early the split frame was taken as the `axis` argument and raised a TypeError.
pd.concat([movies['movieId'], movies.genres.str.split("|", expand=True)], axis=1)

In [None]:
# Preview: movie id next to its genres, one column per genre position.
split_genres = movies['genres'].str.split('|', expand=True)
pd.concat([movies[['movieId']], split_genres], axis=1).head(10)

In [None]:
# Each movie's genres as a Python list, e.g. ['Adventure', 'Children', 'Fantasy'].
# expand=False keeps one list per row (a Series, which has .tolist()); the bare
# `expand=` was a syntax error. Only the first ten are shown to keep output short.
movies.genres.str.split('|', expand=False).tolist()[:10]

In [None]:
# Collect every distinct genre label across all movies, skipping the
# '(no genres listed)' placeholder.
unique_genres = {
    genre
    for genre_list in movies.genres.str.split('|')
    for genre in genre_list
    if genre != '(no genres listed)'
}

# Sort so the list, and the column order of any frame built from it, is the same
# on every run; iteration order of a set of strings varies between sessions.
genres = sorted(unique_genres)

In [None]:
# Column layout for the genre matrix: the movie id first, then one column per genre.
['movieId', *genres]

In [None]:
# Empty frame with one column for the movie id plus one per genre.
genre_frame_columns = ['movieId', *genres]
movie_genres = pd.DataFrame(columns=genre_frame_columns)
movie_genres.head(10)

In [None]:
# One record per movie: its id plus a flag for each genre.
data = []

# Only two columns are needed, so walk them directly instead of building a
# Series per row with iterrows.
for movie_id, genre_field in zip(movies['movieId'], movies['genres']):
    # Start every known genre at 0 so genres a movie lacks are an explicit 0
    # rather than a missing value once the records become a DataFrame.
    row = {'movieId': movie_id, **dict.fromkeys(genres, 0)}

    for genre in genre_field.split('|'):
        row[genre] = 1

    data.append(row)
    

In [None]:
# Assemble the per-movie records into a frame with a fixed column order:
# movieId first, then one column per genre.
movie_genres = pd.DataFrame(data, columns=['movieId', *genres])
movie_genres.head()