In [2]:
# Import dependencies

import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the movie catalogue and the user ratings (paths are relative to the working directory)
import pandas  # same module as the `pd` alias; kept in case other cells use the bare name
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
# Preview the first five rows of the movies table
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first five rows of the ratings table
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Join each rating with its movie's columns on the shared movieId key
# (default inner join: only movieIds present in both tables are kept)
movies_ratings = ratings.merge(movies, on='movieId')
movies_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
# Mean rating per movie title (first five groups shown)
movies_ratings.groupby('title')['rating'].agg('mean').head(n=5)

title
'71 (2014)                                 4.0
'Hellboy': The Seeds of Creation (2004)    4.0
'Round Midnight (1986)                     3.5
'Salem's Lot (2004)                        5.0
'Til There Was You (1997)                  4.0
Name: rating, dtype: float64

In [9]:
# Average rating per movie, highest first (descending order)
(
    movies_ratings
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)

title
Gena the Crocodile (1969)              5.0
True Stories (1986)                    5.0
Cosmic Scrat-tastrophe (2015)          5.0
Love and Pigeons (1985)                5.0
Red Sorghum (Hong gao liang) (1987)    5.0
Name: rating, dtype: float64

In [10]:
# Number of ratings each movie received, most-rated first
(
    movies_ratings
    .groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: rating, dtype: int64

In [13]:
# Per-title summary table: mean rating ('rating') and number of ratings ('rating_counts'),
# computed in a single groupby pass
ratings_avg_count = (
    movies_ratings
    .groupby('title')['rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'rating', 'count': 'rating_counts'})
)
ratings_avg_count.head()

Unnamed: 0_level_0,rating,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [26]:
# User-by-movie rating matrix: one row per userId, one column per title.
# pivot_table averages duplicate (userId, title) pairs by default.
user_movies_rating = pd.pivot_table(
    movies_ratings,
    values='rating',
    index='userId',
    columns='title',
)

In [34]:
# Ratings each user gave to "Pulp Fiction (1994)", with missing ratings replaced by 0.
# `.fillna(0)` returns a new Series, so the pivot table itself is never modified.
# An in-place fill on a column selection can either write through to the parent
# frame or act on a copy, depending on the pandas version / copy-on-write mode.
# NOTE(review): counting "not rated" as a rating of 0 makes the correlations computed
# from this Series reflect who rated the movie as well as how they rated it;
# confirm this is intended.
pulp_fiction_ratings = user_movies_rating['Pulp Fiction (1994)'].fillna(0)

pulp_fiction_ratings.head()

userId
1    3.0
2    0.0
3    0.0
4    1.0
5    5.0
Name: Pulp Fiction (1994), dtype: float64

In [28]:
# Correlate every movie column of the user-by-movie matrix with the Pulp Fiction
# ratings Series (column-wise, rows aligned on the shared index)
similar_pulp_fiction = user_movies_rating.corrwith(other=pulp_fiction_ratings, axis=0)

In [30]:
# Collect the correlations into a one-column frame, dropping movies whose
# correlation is NaN, then show the ten highest values.
# No rating-count filter has been applied at this point, so the top of this
# list is not yet restricted to frequently rated titles.
pulp_fiction_corr = similar_pulp_fiction.dropna().to_frame(name='Correlation')
pulp_fiction_corr.sort_values('Correlation', ascending=False).head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
Rare Exports: A Christmas Tale (Rare Exports) (2010),1.0
Azumi (2003),1.0
"Maxed Out: Hard Times, Easy Credit and the Era of Predatory Lenders (2006)",1.0
"War Zone, The (1999)",1.0
"Wolfman, The (2010)",1.0
Outlander (2008),1.0
Luxo Jr. (1986),1.0
"Claymation Christmas Celebration, A (1987)",1.0
"Return, The (Vozvrashcheniye) (2003)",1.0
Max Dugan Returns (1983),1.0


In [31]:
# Put each title's number of ratings next to its correlation (joined on the index).
# Selecting only 'Correlation' first keeps this cell re-runnable: joining a frame
# that already carries a 'rating_counts' column raises an overlapping-columns error.
pulp_fiction_corr = pulp_fiction_corr[['Correlation']].join(ratings_avg_count['rating_counts'])
pulp_fiction_corr.head()

Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",-0.187323,17
(500) Days of Summer (2009),-0.044034,42
*batteries not included (1987),-0.738549,7
...And Justice for All (1979),-0.188982,3
10 Cent Pistol (2015),1.0,2


In [32]:
# Keep only titles with more than 50 ratings, then list the five strongest correlations
(
    pulp_fiction_corr
    .loc[lambda frame: frame['rating_counts'] > 50]
    .sort_values('Correlation', ascending=False)
    .head()
)

Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Pulp Fiction (1994),1.0,307
"Wolf of Wall Street, The (2013)",0.579915,54
Fight Club (1999),0.543465,218
Kill Bill: Vol. 1 (2003),0.504147,131
Interstellar (2014),0.503411,73
