In [1]:
# Import dependencies

import os
import numpy as np
import pandas as pd
import hvplot.pandas
import seaborn as sns

from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import and read csv files
import pandas
movies = pandas.read_csv('movies.csv')
ratings = pandas.read_csv('ratings.csv')

In [3]:
# Display movies
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Display ratings
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Combine movies data and ratings data
movies_ratings = pd.merge(ratings, movies, on='movieId')
movies_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [6]:
# Average rating of each movie
movies_ratings.groupby('title')['rating'].mean().head()

title
'71 (2014)                                 4.0
'Hellboy': The Seeds of Creation (2004)    4.0
'Round Midnight (1986)                     3.5
'Salem's Lot (2004)                        5.0
'Til There Was You (1997)                  4.0
Name: rating, dtype: float64

In [7]:
# Sort avg. ratings in ascending order by avg. ratings
movies_ratings.groupby('title')['rating'].mean().sort_values(ascending=False).head()

title
Gena the Crocodile (1969)              5.0
True Stories (1986)                    5.0
Cosmic Scrat-tastrophe (2015)          5.0
Love and Pigeons (1985)                5.0
Red Sorghum (Hong gao liang) (1987)    5.0
Name: rating, dtype: float64

In [8]:
# Total number of ratings for a movie
movies_ratings.groupby('title')['rating'].count().sort_values(ascending=False).head()

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: rating, dtype: int64

In [9]:
# Create new dataframe with average ratings and number of ratings for a movie
ratings_avg_count = pd.DataFrame(movies_ratings.groupby('title')['rating'].mean())
ratings_avg_count['rating_counts'] = pd.DataFrame(movies_ratings.groupby('title')['rating'].count())
ratings_avg_count.head()

Unnamed: 0_level_0,rating,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [10]:
# Create a matrix to find the correlation between the ratings of the movie, add user ID
user_movies_rating = movies_ratings.pivot_table(index='userId', columns='title', values='rating')

In [11]:
# Find all the user ratings for Pulp Fiction
pulp_fiction_ratings = user_movies_rating['Pulp Fiction (1994)']
pulp_fiction_ratings.fillna(0,inplace=True)

pulp_fiction_ratings.head()

userId
1    3.0
2    0.0
3    0.0
4    1.0
5    5.0
Name: Pulp Fiction (1994), dtype: float64

In [12]:
# Retrieve all the movies that are similar to Pulp Fiction
similar_pulp_fiction = user_movies_rating.corrwith(pulp_fiction_ratings)

In [13]:
# Find the correlation between the user ratings for Pulp Fiction
pulp_fiction_corr = pd.DataFrame(similar_pulp_fiction, columns=['Correlation'])
pulp_fiction_corr.dropna(inplace=True)
pulp_fiction_corr.head()
#
pulp_fiction_corr.sort_values('Correlation', ascending=False).head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
Children of Dune (2003),1.0
Death Wish 3 (1985),1.0
Fanny and Alexander (Fanny och Alexander) (1982),1.0
Spirit: Stallion of the Cimarron (2002),1.0
Gross Anatomy (a.k.a. A Cut Above) (1989),1.0
Wrong Turn 2: Dead End (2007),1.0
Selma (2014),1.0
Mad Love (1995),1.0
The Little Prince (2015),1.0
Written on the Wind (1956),1.0


These movies have high correlations, but are not popular movies and possibly have low ratings

In [14]:
# Retrieve correlated movies and rating counts
pulp_fiction_corr = pulp_fiction_corr.join(ratings_avg_count['rating_counts'])
pulp_fiction_corr.head()

Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",-0.287017,17
(500) Days of Summer (2009),-0.088028,42
*batteries not included (1987),-0.287914,7
...And Justice for All (1979),-0.188982,3
10 (1979),0.140028,4


In [15]:
# Sort in descending order movies correlated to Pulp Fiction, that have more than 50 ratings
pulp_fiction_corr[pulp_fiction_corr ['rating_counts']>50].sort_values('Correlation', ascending=False).head()

Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Pulp Fiction (1994),1.0,307
Platoon (1986),0.560549,63
Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan (2006),0.41957,65
Casino (1995),0.410596,82
Zombieland (2009),0.378517,53


Data Loading and Visualization

In [32]:
# Create a scatter plot of rating_counts versus the Correlation
rating_plot = pulp_fiction_corr.hvplot.scatter(
    x="rating_counts",
    y="Correlation",
    title="correlation based on high rating counts"
)
rating_plot

Data Preparation

In [33]:
# Reformat data of the independent variable X as a single-column array
X = pulp_fiction_corr["rating_counts"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[17],
       [42],
       [ 7],
       [ 3],
       [ 4]])

In [34]:
# The shape of X is 5581 samples, with a single feature (column)
X.shape

(5581, 1)

In [35]:
# Create an array for the dependent variable y
y = pulp_fiction_corr["Correlation"]

Building the Linear Regression Model

In [36]:
# Create a model with scikit-learn
model = LinearRegression()

In [37]:
# Fit the data into the model
model.fit(X, y)

In [38]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [0.0008384]


In [39]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: -0.08396420317698083


In [40]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = -0.08396420317698083 + 0.0008384048479268884X


In [41]:
# Display the formula to predict a movie with a Correlation of 1 with a high rating
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 1")

# Predict a movie with a Correlation of 1 with a high rating
y_1 = model.intercept_ + model.coef_[0] * 1

# Display the prediction
print(f"Predicted a movie with a Correlation of 1 with a high rating: {y_1:.2f}")

Model's formula: y = -0.08396420317698083 + 0.0008384048479268884 * 1
Predicted a movie with a Correlation of 1 with a high rating: -0.08


In [42]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [43]:
# Create a copy of the original data
movie_predicted = pulp_fiction_corr.copy()

# Add a column with the predicted movie values
movie_predicted["predicted"] = predicted_y_values

# Display sample data
movie_predicted.head()

Unnamed: 0_level_0,Correlation,rating_counts,predicted
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"'burbs, The (1989)",-0.287017,17,-0.069711
(500) Days of Summer (2009),-0.088028,42,-0.048751
*batteries not included (1987),-0.287914,7,-0.078095
...And Justice for All (1979),-0.188982,3,-0.081449
10 (1979),0.140028,4,-0.080611


In [44]:
# Sort in descending order
movie_predicted[movie_predicted ['rating_counts']>50].sort_values('Correlation', ascending=False).head()

Unnamed: 0_level_0,Correlation,rating_counts,predicted
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pulp Fiction (1994),1.0,307,0.173426
Platoon (1986),0.560549,63,-0.031145
Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan (2006),0.41957,65,-0.029468
Casino (1995),0.410596,82,-0.015215
Zombieland (2009),0.378517,53,-0.039529


In [45]:
best_fit_line = movie_predicted.hvplot.line(
    x = "rating_counts",
    y = "predicted",
    color = "purple"
)
best_fit_line

In [46]:
# Superpose the original data and the best fit line
rating_plot * best_fit_line

Linear Regression Model Assessment

In [47]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [48]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.0017258833875822832.
The r2 is 0.0017258833875822832.
The mean squared error is 0.3113480535099784.
The root mean squared error is 0.5579857108474897.
The standard deviation is 0.5584678441511506.
