In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

In [82]:
# loading the data set 
data = pd.read_csv("NetflixOriginals.csv",encoding='latin-1')
print(data.columns)
data.head()

Index(['Title', 'Genre', 'Premiere', 'Runtime', 'IMDB Score', 'Language'], dtype='object')


Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [88]:
# getting the information about the data for bettter understanding
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
Title         584 non-null object
Genre         584 non-null int32
Premiere      584 non-null float64
Runtime       584 non-null int64
IMDB Score    584 non-null float64
Language      584 non-null int32
dtypes: float64(2), int32(2), int64(1), object(1)
memory usage: 22.9+ KB
None


In [76]:
# checking for any missing data in the data set
print(data.isnull().values.any())

False


In [77]:
# checking for any duplicate data in the data set
data.duplicated().any()

False

In [83]:
# encoding the categorical features
label_encoder = LabelEncoder()
data['Genre'] = label_encoder.fit_transform(data['Genre'])
data['Language'] = label_encoder.fit_transform(data['Language'])
# encoding the premiere data since they contain month date, year and month date. year
data['Premiere_comma'] = pd.to_datetime(data['Premiere'], format='%B %d, %Y', errors='coerce')
data['Premiere_period'] = pd.to_datetime(data['Premiere'], format='%B %d. %Y', errors='coerce')

# Merge the two columns into a single datetime column
data['Premiere'] = data['Premiere_comma'].combine_first(data['Premiere_period'])

# Drop the temporary columns
data.drop(['Premiere_comma', 'Premiere_period'], axis=1, inplace=True)

# Convert valid datetime values to timestamps and handle NaT values
data['Premiere'] = data['Premiere'].apply(lambda x: x.timestamp() if not pd.isna(x) else None)

data.head()


Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,45,1564963000.0,58,2.5,6
1,Dark Forces,106,1597968000.0,81,2.6,29
2,The App,93,1577318000.0,79,2.6,20
3,The Open House,63,1516320000.0,94,3.2,2
4,Kaali Khuhi,73,1604016000.0,90,3.4,18


In [84]:
# checking for any missing data in the premiere
print(data.isnull().values.any())

False


In [65]:
# Split data into features (X) and target (y)
X = data[['Genre','Premiere','Runtime', 'Language']]
y = data['IMDB Score']

In [66]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
# Choose a regression algorithm
model = LinearRegression()
# Initialize the Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor()

In [68]:
# Model Training
model.fit(X_train, y_train)
# Fit the model to your training data
gb_regressor.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [69]:
# Model Evaluation
y_pred = model.predict(X_test)
# Make predictions
predictions = gb_regressor.predict(X_test)

In [87]:
# by using LinearRegression method
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("by using LinearRegression method")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

by using LinearRegression method
Mean Absolute Error: 0.8155336353328352
Mean Squared Error: 0.9944408302850986
R-squared: 0.0419104411781841


In [86]:
# by using Gradient Boosting Regressor
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("by using Gradient Boosting Regressor")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R-squared: {r2}')    

MAE: 0.7163085940172045
MSE: 0.8510487088176035
R-squared: 0.18006093762947117
