In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Let wide DataFrames render up to this many columns before pandas truncates them
MAX_DISPLAY_COLUMNS = 50
pd.set_option("display.max_columns", MAX_DISPLAY_COLUMNS)

In [3]:
# Load the film titles CSV (path is relative to the notebook's working directory)
csv_path = Path('Data/filmtitles.csv')
movies_df = pd.read_csv(csv_path)

# Display the loaded frame as this cell's output
movies_df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,production_countries,imdb_id,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,US,tt0075314,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,US,tt0068473,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,GB,tt0071853,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"GB,US",tt0061578,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,GB,tt0079470,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3424,tm1066324,Super Monsters: Once Upon a Rhyme,MOVIE,The Super Monsters rethink exemplary fantasies...,2021,,25,,tt14586752,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3425,tm1097142,My Bride,MOVIE,The story follows a young man and woman who go...,2021,,93,EG,tt14216488,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3426,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,NG,tt13857480,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3427,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,,tt11803618,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Identifier and free-text columns are excluded from the modelling frame;
# the raw movies_df is left untouched so these columns can be re-joined later.
columns_to_drop = ['id', 'title', 'type', 'description', 'production_countries', 'imdb_id']
movies_df_dropped = movies_df.drop(columns=columns_to_drop)

In [5]:
# One-hot encode the categorical columns with get_dummies.
# In the output below this expands age_certification into age_certification_* flags;
# rows with no certification end up with every flag False.
movies_dummies = pd.get_dummies(data=movies_df_dropped)
movies_dummies

Unnamed: 0,release_year,runtime,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western,age_certification_G,age_certification_NC-17,age_certification_PG,age_certification_PG-13,age_certification_R
0,1976,114,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True
1,1972,109,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,False,False,False,False,True
2,1975,91,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,False,False,True,False,False
3,1967,150,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,False,False,False,False,False
4,1979,94,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3424,2021,25,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False
3425,2021,93,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,False,False,False,False,False
3426,2021,100,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,False,False,False,False,False
3427,2021,134,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False


In [6]:
# Discard every row that still contains at least one missing value.
# The original row index is kept, which the later index-based merge relies on.
movies_dummies = movies_dummies.dropna(axis='index', how='any')
movies_dummies

Unnamed: 0,release_year,runtime,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western,age_certification_G,age_certification_NC-17,age_certification_PG,age_certification_PG-13,age_certification_R
0,1976,114,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True
1,1972,109,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,False,False,False,False,True
2,1975,91,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,False,False,True,False,False
3,1967,150,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,False,False,False,False,False
4,1979,94,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3424,2021,25,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False
3425,2021,93,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,False,False,False,False,False
3426,2021,100,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,False,False,False,False,False
3427,2021,134,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False


In [7]:
# Separate the target variable, y, from the features, X
target_column = 'imdb_score'
X = movies_dummies.drop(columns=[target_column])
y = movies_dummies[target_column]

In [8]:
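# Scale — TODO: no feature scaling is applied yet; StandardScaler and MinMaxScaler are imported above but never used.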


In [9]:
# Partition features and target into training and testing subsets.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [10]:
# Estimator and the metric helpers used by the evaluation cells further down.
# NOTE(review): these imports would normally sit in the first import cell with the others.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
)

# Create the (not yet fitted) linear regression model with its default settings
model = LinearRegression()

In [11]:
# Train the regressor on the training split only
model.fit(X=X_train, y=y_train)

In [12]:
# Predict scores for the held-out testing rows
predictions = model.predict(X=X_test)

In [14]:
# Mean squared error of the test-set predictions against the true test scores
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.956159143773177


In [15]:
# Predict a score for every row of X (training and testing rows alike)
predicted_y_values = model.predict(X=X)

In [16]:
# Build the per-film results table in one chain:
#   1. join the encoded frame back to the raw frame on the shared row index, so each
#      modelled row regains its 'id' and 'title' (the overlapping 'imdb_score' column
#      coming from the encoded, left-hand frame is suffixed '_x');
#   2. keep only 'id', 'title' and that actual score;
#   3. restore the plain 'imdb_score' column name;
#   4. attach the model's predictions (row order matches X, which was derived from movies_dummies);
#   5. index the result by film 'id'.
movie_predictions = (
    movies_dummies
    .merge(movies_df, left_index=True, right_index=True)
    .loc[:, ['id', 'title', 'imdb_score_x']]
    .rename(columns={'imdb_score_x': 'imdb_score'})
    .assign(predicted_score=predicted_y_values)
    .set_index('id')
)

movie_predictions

Unnamed: 0_level_0,title,imdb_score,predicted_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tm84618,Taxi Driver,8.2,8.999891
tm154986,Deliverance,7.7,6.934828
tm127384,Monty Python and the Holy Grail,8.2,7.629700
tm120801,The Dirty Dozen,7.7,7.321004
tm70993,Life of Brian,8.0,7.626344
...,...,...,...
tm1066324,Super Monsters: Once Upon a Rhyme,5.6,5.811263
tm1097142,My Bride,5.0,5.822815
tm1014599,Fine Wine,6.8,5.854442
tm898842,C/O Kaadhal,7.7,6.426417


In [None]:
# Save the dataframe to a CSV file.
# movie_predictions is indexed by film 'id' (set when the table was built), so the
# index has to be written out: passing index=False would silently drop the ids and
# leave only title / imdb_score / predicted_score in the file.
output_path = Path('Outputs/LinearRegression_model_1.csv')

# Create the output folder on a fresh checkout so to_csv does not fail on a missing directory
output_path.parent.mkdir(parents=True, exist_ok=True)

movie_predictions.to_csv(output_path)

In [17]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std.
# Everything here is measured on the full X / y (training and testing rows together),
# so `mse` is re-bound and no longer holds the test-only value printed earlier.
std = np.std(y)
mse = mean_squared_error(y_true=y, y_pred=predicted_y_values)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y, y_pred=predicted_y_values)
# model.score on the same data reports the same figure as r2 (see the printed output)
score = model.score(X, y)

# Print relevant metrics, one per line.
print(
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is 0.23642766690967099.
The r2 is 0.23642766690967099.
The mean squared error is 0.9708333991028018.
The root mean squared error is 0.9853087836322184.
The standard deviation is 1.127579735768276.


In [18]:
# Largest predicted score over all rows.
# NOTE(review): the recorded output (~11.2) is presumably above the maximum possible
# IMDb score; the linear model's predictions are unbounded — confirm whether clipping is wanted.
movie_predictions.loc[:, 'predicted_score'].max()

11.206974886141658