In [None]:
# import libraries
import pandas as pd 
import numpy as np
from scipy import sparse 
from sklearn.metrics import mean_squared_error, cosine_similarity
from sklearn import cross_validation as cv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from math import sqrt

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the two source tables from the local data/ directory.
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')

#### ratings df

In [None]:
ratings.head(2)

In [None]:
ratings.shape

In [None]:
ratings.info()

In [None]:
# Distinct user ids that appear in the ratings table (610 users).
viewers = pd.unique(ratings['userId'])
len(viewers)

In [None]:
# average rating is 3.5, with a maximum movie rating of 5.0
ratings.describe()

In [None]:
# the majority of the ratings lie between 2.5 and 4
ratings['rating'].hist(bins=50)

In [None]:
# ratings distribution

#### movies df

In [None]:
# no meta data! 
movies.head()

In [None]:
movies.shape

In [None]:
movies.info()

In [None]:
# Count the distinct movie titles (9737).
# Movies outnumber users, so many movies may be unrated by most users;
# that sparsity may introduce noise into the data set.
films = movies['title'].drop_duplicates().to_numpy()
len(films)

#### merge

In [None]:
# Attach the movie columns to every rating and discard the unused timestamp.
# The merge joins on whichever columns the two frames have in common.
# `errors='ignore'` keeps this cell re-runnable: on a second run `ratings` is
# already the merged frame without `timestamp`, and a plain drop would raise
# KeyError.
ratings = pd.merge(movies, ratings).drop(columns=['timestamp'], errors='ignore')
ratings.head()

#### pivot_table

In [None]:
# Reshape into a user x title table of ratings.
# Expect numerous NaN's: many films are unrated or rated by only a few users.
user_ratings = pd.pivot_table(ratings, values='rating', index=['userId'], columns=['title'])
user_ratings.head()

#### dropna() with  threshold

In [None]:
# Drop every film (column) rated by fewer than 10 users, then set the
# remaining NaN's to 0.
# Lower `thresh` to keep more films; both steps operate along the columns.
user_ratings = (
    user_ratings
    .dropna(axis="columns", thresh=10)
    .fillna(0, axis="columns")
)
user_ratings.head()

#### similarity matrix

In [None]:
# Three options are available for a similarity score: euclidean, correlation
# (pearson), and cosine. The genre-based score below uses cosine.
#
# TF-IDF over the words (unigrams and bigrams) of each movie's `genres` string.
# `min_df=1` (the library default) replaces the original `min_df=0`: it selects
# the same vocabulary, because every term occurs in at least one document, but
# recent scikit-learn releases reject an integer `min_df` below 1.
tfV = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfV_matrix = tfV.fit_transform(movies['genres'])

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
# NOTE: this array deliberately keeps the name `cosine_similarity` because
# `genre_based_recommendations` indexes it; it shadows the function of the same
# name imported from sklearn at the top of the notebook.
cosine_similarity = linear_kernel(tfV_matrix, tfV_matrix)

In [None]:
# Film-to-film similarity: Pearson correlation between every pair of rating columns.
# Pearson correlation already centres each column on its mean, so no separate
# standardisation step is needed.
similarity_matrix = user_ratings.corr(method="pearson")
similarity_matrix.iloc[:50]

#### making predictions

Cold start genre-based recommendations

In [None]:
# Movie titles, looked up by row position below.
# NOTE(review): the code treats the labels of `movies.index` as row positions in
# `cosine_similarity`; this assumes `movies` still has its default 0..n-1 index.
titles = movies['title']

# Title -> row lookup. Titles that occur more than once are collapsed to their
# first occurrence so `indices[title]` always yields one row number, not a Series.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def genre_based_recommendations(title, top_n=10):
    """Recommend the films whose genres are most similar to those of `title`.

    Parameters
    ----------
    title : str
        Exact movie title as it appears in `movies['title']`.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the `top_n` most similar films, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    idx = indices[title]
    # Exclude the queried film explicitly. Many films can tie on the top score,
    # and the sort is stable, so the queried film is not guaranteed to be the
    # first entry; dropping "the first entry" could remove another film and
    # leave the query in its own recommendations.
    similarity_scores = [
        (position, score)
        for position, score in enumerate(cosine_similarity[idx])
        if position != idx
    ]
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in similarity_scores[:top_n]]

    return titles.iloc[movie_indices]

User ratings-based recommendations 

In [None]:
def get_similar_movies(movie_name, user_rating, neutral_rating=2.5, similarity=None):
    """Score every film by its similarity to one film the user has rated.

    The similarity column for `movie_name` is multiplied by the user's rating
    measured from `neutral_rating`. A rating above the neutral point gives
    similar films a positive score; a rating below it gives them a negative one.

    Parameters
    ----------
    movie_name : str
        Column label of the rated film in the similarity matrix.
    user_rating : float
        Rating the user gave to `movie_name`.
    neutral_rating : float, default 2.5
        Rating treated as neither liked nor disliked.
    similarity : pandas.DataFrame, optional
        Film-by-film similarity matrix. Defaults to the notebook-level
        `similarity_matrix`.

    Returns
    -------
    pandas.Series
        Weighted scores indexed by film, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If `movie_name` is not a column of the similarity matrix.
    """
    matrix = similarity_matrix if similarity is None else similarity
    if movie_name not in matrix.columns:
        raise KeyError(
            f"{movie_name!r} is not in the similarity matrix "
            "(it may have been filtered out for having too few ratings)"
        )

    # Weight the similarities by how far the rating sits from the neutral point.
    similar_score = matrix[movie_name] * (user_rating - neutral_rating)
    # Best candidates first.
    return similar_score.sort_values(ascending=False)

In [None]:
# Films the example user has already rated, as (title, rating) pairs.
user = [('101 Dalmatians (1996)', 1), ('2001: A Space Odyssey (1968)', 4)]

# One row of weighted similarity scores per rated film.
# The frame is built in a single step: `DataFrame.append` was removed in
# pandas 2.0, and growing a frame row by row is quadratic.
similar_movies = pd.DataFrame(
    [get_similar_movies(movie, rating) for movie, rating in user]
).reset_index(drop=True)

# Sum the scores over all rated films and show the ten best candidates.
similar_movies.sum().sort_values(ascending=False).head(10)

In [None]:
# Bar chart of the ten highest aggregated similarity scores.
# The original `sns.pairplot(rating_based_movie)` referenced a name that is
# never assigned in the visible cells, so the cell raised NameError.
top_rating_based = similar_movies.sum().sort_values(ascending=False).head(10)
fig, ax = plt.subplots()
sns.barplot(x=top_rating_based.to_numpy(), y=top_rating_based.index.to_numpy(), ax=ax)
ax.set(
    title='Top 10 rating-based recommendations',
    xlabel='Summed weighted similarity',
    ylabel='Film',
);

In [None]:
# genre based recommendations
genre_based_recommendations('40 Days and 40 Nights (2002)')

In [None]:
# Bar chart of the genre-similarity scores behind the recommendations.
# `sns.pairplot` requires a DataFrame; the original call passed the
# recommender function itself, which raises TypeError.
query_title = '40 Days and 40 Nights (2002)'
recommended = genre_based_recommendations(query_title)
# The recommender returns titles labelled with their `movies` index, which this
# notebook also uses as row positions in `cosine_similarity`.
# NOTE(review): assumes `indices[query_title]` is a single row number, i.e. the
# title is unique in `movies` -- confirm.
genre_scores = cosine_similarity[indices[query_title], recommended.index.to_numpy()]
fig, ax = plt.subplots()
sns.barplot(x=genre_scores, y=recommended.to_numpy(), ax=ax)
ax.set(
    title=f'Genre-based recommendations for {query_title}',
    xlabel='Genre cosine similarity',
    ylabel='Film',
);

#### evaluation