In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read both CSV files from the working directory.
# ratings.csv holds the per-user ratings; movies.csv holds the movie catalogue.
ratings_df = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies.csv')
# Preview the first rows of the ratings table
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


## Processing

In [4]:
# Preview the first rows of the movie catalogue before any processing
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Remove the year from the title

In [5]:
# A four-digit year wrapped in parentheses, e.g. "(1995)". Requiring the
# parentheses avoids matching titles that merely contain a number such as "2001".
# The single capture group keeps only the digits, so one extract call is enough.
year_in_parens = r'\((\d{4})\)'

extracted_year = movies_df['title'].str.extract(year_in_parens, expand=False)
# If this cell is re-run, the titles no longer contain the year; keep the
# previously extracted value instead of overwriting it with NaN.
if 'year' in movies_df.columns:
    extracted_year = extracted_year.combine_first(movies_df['year'])
movies_df['year'] = extracted_year

# Remove the "(yyyy)" suffix from the title. regex=True must be explicit:
# since pandas 2.0 str.replace treats the pattern as a literal string by default.
# .str.strip() drops the leftover whitespace and, unlike apply(lambda x: x.strip()),
# tolerates missing titles.
movies_df['title'] = (
    movies_df['title']
    .str.replace(year_in_parens, '', regex=True)
    .str.strip()
)
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


#### Turn the genre values into a list of genres

In [6]:
# Turn the pipe-delimited genre string ("Comedy|Romance") into a list of genres.
# Only string values are split: already-split lists (and missing values) are
# left untouched so the cell can be re-run safely without wiping the column.
movies_df['genres'] = movies_df['genres'].apply(
    lambda genres: genres.split('|') if isinstance(genres, str) else genres
)
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [7]:
# Build one indicator column per genre (1.0 if the movie has the genre, else 0.0).
# Vectorized replacement for a row-by-row iterrows()/.at loop.

# One row per (movie, genre) pair; the movie's index label is repeated.
genre_per_row = movies_df['genres'].explode()
# Genres in order of first appearance, which fixes the indicator column order.
genre_names = pd.unique(genre_per_row.dropna())

genre_flags = (
    pd.get_dummies(genre_per_row, dtype=float)
    .groupby(level=0).max()          # collapse back to one row per movie
    .reindex(columns=genre_names)    # first-appearance column order
)

moviesWithGenres_df = movies_df.join(genre_flags)
# Fill any remaining NaN values with 0. This applies to the whole frame, so
# missing values in the pre-existing columns (e.g. 'year') are zero-filled too.
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Remove the timestamp from the ratings dataframe.
# The column is named via the `columns` keyword: the positional axis argument
# (drop('timestamp', 1)) was removed in pandas 2.0.
# errors='ignore' lets the cell be re-run after the column is already gone.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


## The System

#### Input User

In [11]:
# (title, rating) pairs supplied by the user we want recommendations for.
rated_titles = [
    ('Lion King, The', 4.2),
    ('Demolition Man', 4),
    ('Terminator 2: Judgment Day', 4.7),
    ('Terminator, The', 2.7),
    ('Matrix, The', 5),
    ('Déjà Vu', 5),
    ('Robocop', 4),
    ('Thor', 3),
    ('Dude, Where\'s My Car', 4),
    ('Law Abiding Citizen', 5),
]
# Same list-of-dicts structure as before, one record per rated movie
user_input = [
    {'title': movie_title, 'rating': movie_rating}
    for movie_title, movie_rating in rated_titles
]
input_movies = pd.DataFrame(user_input)
input_movies

Unnamed: 0,title,rating
0,"Lion King, The",4.2
1,Demolition Man,4.0
2,Terminator 2: Judgment Day,4.7
3,"Terminator, The",2.7
4,"Matrix, The",5.0
5,Déjà Vu,5.0
6,Robocop,4.0
7,Thor,3.0
8,"Dude, Where's My Car",4.0
9,Law Abiding Citizen,5.0


In [12]:
# Catalogue rows whose title matches one of the titles the user rated
input_id = movies_df[movies_df['title'].isin(input_movies['title'])]

# Attach the user's rating to each matched catalogue row.
# The join key is stated explicitly and only title/rating are taken from the
# input frame, so re-running this cell (when input_movies already carries
# movieId) yields the same result.
# NOTE: this is an inner join - input titles with no exact match in
# movies_df are silently dropped from input_movies.
input_movies = pd.merge(input_id, input_movies[['title', 'rating']], on='title')

# Drop information we won't use from the input dataframe. The `columns`
# keyword replaces the positional axis argument removed in pandas 2.0.
input_movies = input_movies.drop(columns=['genres', 'year'])

input_movies

Unnamed: 0,movieId,title,rating
0,364,"Lion King, The",4.2
1,442,Demolition Man,4.0
2,589,Terminator 2: Judgment Day,4.7
3,1240,"Terminator, The",2.7
4,2175,Déjà Vu,5.0
5,2571,"Matrix, The",5.0
6,71838,Law Abiding Citizen,5.0
7,86332,Thor,3.0


In [13]:
# Keep only the genre-encoded rows for movies the user has rated
rated_movie_ids = input_movies['movieId'].tolist()
is_rated_by_user = moviesWithGenres_df['movieId'].isin(rated_movie_ids)
user_movies = moviesWithGenres_df[is_rated_by_user]
user_movies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
360,364,"Lion King, The","[Adventure, Animation, Children, Drama, Musica...",1994,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
438,442,Demolition Man,"[Action, Adventure, Sci-Fi]",1993,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
583,589,Terminator 2: Judgment Day,"[Action, Sci-Fi]",1991,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1212,1240,"Terminator, The","[Action, Sci-Fi, Thriller]",1984,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2092,2175,Déjà Vu,"[Drama, Romance]",1997,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2487,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14366,71838,Law Abiding Citizen,"[Drama, Thriller]",2009,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17074,86332,Thor,"[Action, Adventure, Drama, Fantasy, IMAX]",2011,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
