In [30]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [31]:
# Show up to 50 columns when a DataFrame is rendered
pd.options.display.max_columns = 50

In [32]:
# Load the film titles dataset from disk into a DataFrame
csv_path = Path('Data/filmtitles.csv')
movies_df = pd.read_csv(csv_path)

# Display the loaded frame
movies_df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,production_countries,imdb_id,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,US,tt0075314,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,US,tt0068473,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,GB,tt0071853,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"GB,US",tt0061578,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,GB,tt0079470,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3424,tm1066324,Super Monsters: Once Upon a Rhyme,MOVIE,The Super Monsters rethink exemplary fantasies...,2021,,25,,tt14586752,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3425,tm1097142,My Bride,MOVIE,The story follows a young man and woman who go...,2021,,93,EG,tt14216488,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3426,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,NG,tt13857480,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3427,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,,tt11803618,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
# Number of words in each description. Splitting on runs of whitespace counts
# the words directly, so double spaces, newlines and leading/trailing blanks do
# not inflate the value the way "count single spaces + 1" did.
# Missing descriptions remain NaN.
movies_df['description_length'] = movies_df['description'].str.split().str.len()

In [34]:
# List the distinct release years present in the data
pd.unique(movies_df['release_year'])

array([1976, 1972, 1975, 1967, 1979, 1971, 1980, 1961, 1966, 1954, 1958,
       1977, 1969, 1963, 1956, 1960, 1973, 1974, 1959, 1978, 1990, 1987,
       1984, 1989, 1983, 1986, 1981, 1988, 1985, 1982, 1996, 1995, 1997,
       1994, 2000, 1999, 1998, 1992, 1993, 1991, 2007, 2004, 2002, 2003,
       2010, 2006, 2005, 2008, 2009, 2001, 2012, 2011, 2014, 2013, 2016,
       2015, 2018, 2017, 2019, 2020, 2022, 2021], dtype=int64)

In [35]:
# Bin edges for the release year: a lower bound of 0, then one edge at the
# final year of each decade (1959, 1969, ..., 2029)
bins = [0] + list(range(1959, 2030, 10))

# One label per bin, named after the decade it represents
group_labels = [f"{decade}'s" for decade in range(1950, 2030, 10)]

In [36]:
# Assign every film to a decade bucket based on its release year
movies_df["release_decade"] = pd.cut(
    x=movies_df["release_year"],
    bins=bins,
    labels=group_labels,
)

# Preview the first rows, now including the new column
movies_df.head(n=5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,production_countries,imdb_id,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western,description_length,release_decade
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,US,tt0075314,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,28.0,1970's
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,US,tt0068473,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,32.0,1970's
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,GB,tt0071853,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,69.0,1970's
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"GB,US",tt0061578,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,49.0,1960's
4,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,GB,tt0079470,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72.0,1970's


In [37]:
# Remove the columns that are not used as model inputs
unused_columns = [
    'id',
    'title',
    'type',
    'description',
    'production_countries',
    'imdb_id',
    'release_year',
]
movies_df_dropped = movies_df.drop(labels=unused_columns, axis='columns')

In [38]:
# One-hot encode the categorical columns with get_dummies
movies_dummies = pd.get_dummies(data=movies_df_dropped)

# Display the encoded frame
movies_dummies

Unnamed: 0,runtime,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western,description_length,age_certification_G,age_certification_NC-17,age_certification_PG,age_certification_PG-13,age_certification_R,release_decade_1950's,release_decade_1960's,release_decade_1970's,release_decade_1980's,release_decade_1990's,release_decade_2000's,release_decade_2010's,release_decade_2020's
0,114,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,28.0,False,False,False,False,True,False,False,True,False,False,False,False,False
1,109,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,32.0,False,False,False,False,True,False,False,True,False,False,False,False,False
2,91,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,69.0,False,False,True,False,False,False,False,True,False,False,False,False,False
3,150,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,49.0,False,False,False,False,False,False,True,False,False,False,False,False,False
4,94,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72.0,False,False,False,False,True,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3424,25,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,16.0,False,False,False,False,False,False,False,False,False,False,False,False,True
3425,93,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,32.0,False,False,False,False,False,False,False,False,False,False,False,False,True
3426,100,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,15.0,False,False,False,False,False,False,False,False,False,False,False,False,True
3427,134,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,21.0,False,False,False,False,False,False,False,False,False,False,False,False,True


In [39]:
# Discard every row that contains at least one missing value
movies_dummies = movies_dummies.dropna(axis='index', how='any')
movies_dummies

Unnamed: 0,runtime,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western,description_length,age_certification_G,age_certification_NC-17,age_certification_PG,age_certification_PG-13,age_certification_R,release_decade_1950's,release_decade_1960's,release_decade_1970's,release_decade_1980's,release_decade_1990's,release_decade_2000's,release_decade_2010's,release_decade_2020's
0,114,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,28.0,False,False,False,False,True,False,False,True,False,False,False,False,False
1,109,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,32.0,False,False,False,False,True,False,False,True,False,False,False,False,False
2,91,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,69.0,False,False,True,False,False,False,False,True,False,False,False,False,False
3,150,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,49.0,False,False,False,False,False,False,True,False,False,False,False,False,False
4,94,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72.0,False,False,False,False,True,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3424,25,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,16.0,False,False,False,False,False,False,False,False,False,False,False,False,True
3425,93,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,32.0,False,False,False,False,False,False,False,False,False,False,False,False,True
3426,100,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,15.0,False,False,False,False,False,False,False,False,False,False,False,False,True
3427,134,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,21.0,False,False,False,False,False,False,False,False,False,False,False,False,True


In [40]:
# Keep only rows whose runtime is above 40 and whose vote count is above 30
runtime_ok = movies_dummies['runtime'] > 40
votes_ok = movies_dummies['imdb_votes'] > 30
movies_dummies = movies_dummies[runtime_ok & votes_ok]

In [41]:
# Separate the features, X, from the target variable, y:
# every column except the IMDb score is a feature, the score is the target
X = movies_dummies.drop(columns=['imdb_score'])
y = movies_dummies['imdb_score']

In [42]:
# Partition the data into training and testing sets; the fixed random_state
# makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [43]:
# Scale the features
# Instantiate a MinMaxScaler instance
scaler = MinMaxScaler()

# Fit the scaler on the training features only
X_scaler = scaler.fit(X_train)

# Transform the training and the testing features with the fitted scaler
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [44]:
# Linear regression estimator and the regression metrics used in later cells
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Create the linear regression model with its default settings
model = LinearRegression()

In [45]:
# Train the model on the scaled training features and their scores
model.fit(X=X_train_scaled, y=y_train)

In [46]:
# Predict scores for the scaled testing features
predictions = model.predict(X=X_test_scaled)

In [47]:
# Mean squared error between the actual and predicted test scores
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.8778039923693705


In [48]:
# Mean absolute error between the actual and predicted test scores
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.7234853590624398


In [49]:
# Scale the complete feature set with the already-fitted scaler ...
X_scaled = X_scaler.transform(X)

# ... and predict a score for every row of X (training and testing rows alike)
predicted_y_values = model.predict(X=X_scaled)

In [50]:
# Look up 'id', 'title' and the actual 'imdb_score' in the original frame for
# exactly the rows that survived cleaning (same index labels as movies_dummies).
# This replaces a full-frame merge that only served to recover these columns.
movie_predictions = movies_df.loc[movies_dummies.index, ['id', 'title', 'imdb_score']]

# Attach the rounded predictions by index label rather than by position, so a
# prediction cannot be paired with the wrong film if the row order ever differs.
# predicted_y_values holds one prediction per row of X, in X's row order.
predicted_scores = pd.Series(np.round(predicted_y_values, 1), index=X.index)
movie_predictions = movie_predictions.assign(predicted_score=predicted_scores)

# Set 'id' as index (returns a new frame instead of mutating in place)
movie_predictions = movie_predictions.set_index('id')

movie_predictions.head(10)

Unnamed: 0_level_0,title,imdb_score,predicted_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tm84618,Taxi Driver,8.2,8.4
tm154986,Deliverance,7.7,6.2
tm127384,Monty Python and the Holy Grail,8.2,6.9
tm120801,The Dirty Dozen,7.7,7.5
tm70993,Life of Brian,8.0,7.0
tm14873,Dirty Harry,7.7,6.2
tm119281,Bonnie and Clyde,7.7,7.3
tm98978,The Blue Lagoon,5.8,6.1
tm44204,The Guns of Navarone,7.5,7.8
tm67378,The Professionals,7.3,6.5


In [51]:
# Save the dataframe to a CSV file.
# 'id' is the frame's index at this point, so the index has to be written;
# with index=False the film ids would be silently missing from the file.
movie_predictions.to_csv('Outputs/LinearRegression_model_4_LinearRegression.csv', index=True)

In [52]:
# Metrics for the linear regression model on the test set:
# model score, r2, mse, rmse and the standard deviation of the actual scores
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=predictions)
score = model.score(X_test_scaled, y_test, sample_weight=None)
std = np.std(y_test)

# Report: a heading, a blank line, then one line per metric
print("Model 4 - LinearRegression - MinMax Scaling - Feature Changes")
print()
for report_line in (
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
):
    print(report_line)

Model 4 - LinearRegression - MinMax Scaling - Feature Changes

The score is 0.24697255454982814.
The r2 is 0.24697255454982814.
The mean squared error is 0.8778039923693705.
The root mean squared error is 0.9369119448322614.
The standard deviation is 1.0796758153034185.


In [53]:
# Largest predicted score across all rows of movie_predictions.
# NOTE(review): the recorded run shows 10.7 here; presumably a score cannot
# exceed 10, so the predictions may need clipping - confirm the valid range.
movie_predictions.predicted_score.max()

10.7

In [54]:
# R-squared of the test-set predictions
r2_test = r2_score(y_true=y_test, y_pred=predictions)
r2_test

0.24697255454982814