In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree


In [2]:
#LOADING THE DATASET
raw_df = pd.read_csv("DATA_SOURCE//Marvel_DC_imdb.csv")
raw_df.head(2)


Unnamed: 0.1,Unnamed: 0,Movie,Year,Genre,RunTime,Rating,Director,Actor,Description,IMDB_Score,Metascore,Votes,USA_Gross,Category
0,0,Eternals,(2021),"Action,Adventure,Drama",,,ChloéZhao,"AngelinaJolie,GemmaChan,RichardMadden,BarryKeo...","The saga of the Eternals, a race of immortal b...",,,,,Marvel
1,1,Loki,(2021– ),"Action,Adventure,Fantasy",,,,"TomHiddleston,OwenWilson,SophiaDiMartino,Richa...",A new Marvel chapter with Loki at its center.,,,,,Marvel


In [3]:
raw_df.dtypes

Unnamed: 0       int64
Movie           object
Year            object
Genre           object
RunTime         object
Rating          object
Director        object
Actor           object
Description     object
IMDB_Score     float64
Metascore      float64
Votes           object
USA_Gross       object
Category        object
dtype: object

In [4]:
raw_df.Category.value_counts()

DC        1301
Marvel     389
Name: Category, dtype: int64

In [5]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1690 entries, 0 to 1689
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   1690 non-null   int64  
 1   Movie        1690 non-null   object 
 2   Year         1657 non-null   object 
 3   Genre        1683 non-null   object 
 4   RunTime      1407 non-null   object 
 5   Rating       1463 non-null   object 
 6   Director     1474 non-null   object 
 7   Actor        1660 non-null   object 
 8   Description  1690 non-null   object 
 9   IMDB_Score   1486 non-null   float64
 10  Metascore    65 non-null     float64
 11  Votes        1486 non-null   object 
 12  USA_Gross    64 non-null     object 
 13  Category     1690 non-null   object 
dtypes: float64(2), int64(1), object(11)
memory usage: 185.0+ KB


In [6]:
##Data Cleaning

In [7]:
# missing values
round(100*(raw_df.isnull().sum())/len(raw_df), 2)

Unnamed: 0      0.00
Movie           0.00
Year            1.95
Genre           0.41
RunTime        16.75
Rating         13.43
Director       12.78
Actor           1.78
Description     0.00
IMDB_Score     12.07
Metascore      96.15
Votes          12.07
USA_Gross      96.21
Category        0.00
dtype: float64

Since the NaN value of the mentioned two columns are high we are dropping it.
USA_Gross 96.21
Metascore 96.15

In [8]:
raw_df1 = raw_df.drop(['USA_Gross', 'Metascore'], axis=1)
raw_df1.head(2)

Unnamed: 0.1,Unnamed: 0,Movie,Year,Genre,RunTime,Rating,Director,Actor,Description,IMDB_Score,Votes,Category
0,0,Eternals,(2021),"Action,Adventure,Drama",,,ChloéZhao,"AngelinaJolie,GemmaChan,RichardMadden,BarryKeo...","The saga of the Eternals, a race of immortal b...",,,Marvel
1,1,Loki,(2021– ),"Action,Adventure,Fantasy",,,,"TomHiddleston,OwenWilson,SophiaDiMartino,Richa...",A new Marvel chapter with Loki at its center.,,,Marvel


In [9]:
# missing values
round(100*(raw_df1.isnull().sum())/len(raw_df1), 2)


Unnamed: 0      0.00
Movie           0.00
Year            1.95
Genre           0.41
RunTime        16.75
Rating         13.43
Director       12.78
Actor           1.78
Description     0.00
IMDB_Score     12.07
Votes          12.07
Category        0.00
dtype: float64

In [10]:
raw_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1690 entries, 0 to 1689
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   1690 non-null   int64  
 1   Movie        1690 non-null   object 
 2   Year         1657 non-null   object 
 3   Genre        1683 non-null   object 
 4   RunTime      1407 non-null   object 
 5   Rating       1463 non-null   object 
 6   Director     1474 non-null   object 
 7   Actor        1660 non-null   object 
 8   Description  1690 non-null   object 
 9   IMDB_Score   1486 non-null   float64
 10  Votes        1486 non-null   object 
 11  Category     1690 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 158.6+ KB


In [11]:
#label Encoding (Handling String values)

In [12]:
# import preprocessing from sklearn
from sklearn import preprocessing
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()
# 2/3. FIT AND TRANSFORM


In [13]:
# use df.apply() to apply le.fit_transform to all columns
raw_df2 = raw_df1.apply(le.fit_transform)
marvelvsdc_df_2.head(6)


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']

In [None]:
categorical = list(raw_df1.select_dtypes(include=['object']).columns.values)
for cat in categorical:
    print(cat)
    raw_df1[cat].fillna('UNK', inplace=True)
    raw_df1[cat] = le.fit_transform(raw_df1[cat])
    #df[cat] = le.fit_transform(df[cat].astype(str))

In [None]:
raw_df1["IMDB_Score"] = le.fit_transform(raw_df1["IMDB_Score"])

In [None]:
raw_df1.head(10)

In [None]:
# missing values
round(100*(raw_df1.isnull().sum())/len(raw_df1), 2)

Imputation Approach with KNNImputer
k-Nearest Neighbors (kNN) can help to impute the values of missing data. Sociologists and
community researchers suggest that human beings live in a community because neighbors
generate a feeling of security and safety, attachment to community, and relationships that bring
out a community identity through participation in various activities.
A similar imputation methodology that works on data is k-Nearest Neighbours (kNN) that
identifies the neighboring points through a measure of distance and the missing values can be
estimated using completed values of neighboring observations.



In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5,weights='uniform',metric='nan_euclidean')
imputer.fit_transform(raw_df1)

In [None]:
raw_df1.head(2)

In [None]:
raw_df1.isnull().sum()

In [None]:
raw_df1.value_counts()


In [None]:
raw_df1.Category.value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("dark_background")
plt.figure(figsize= [5,10])

#Apply matplotlib functionalities
#Change the colour of bins to green
#Change the number of bins
#Create a distribution plot for rating
#import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
sns.distplot(raw_df1.Rating, bins = 40, color = "yellow")
plt.title("Distribution of Rating overs the Graph", fontsize = 20, fontweight = 10)
plt.show()

In [None]:
plt.figure(figsize= [11,7])
sns.heatmap(raw_df1.corr(),annot= True)

Rescaling the Values -- We will use MinMax scaling.


In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
raw_df1.columns

In [None]:
# Apply scaler() to all the columns except the 'yes-no' and 'dummy' variables
num_vars = ['Unnamed: 0', 'Movie', 'Year', 'Genre', 'RunTime', 'Rating', 'Director',
            'Actor', 'Description', 'IMDB_Score', 'Votes', 'Category']
raw_df1[num_vars] = scaler.fit_transform(raw_df1[num_vars])
raw_df1.head()


In [None]:
#Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split
# Putting feature variable to X
X = raw_df1.drop(['Rating'], axis=1)
X.head()


In [None]:
# Putting response variable to y
y = raw_df1['Rating']
y.head()

In [None]:
y.value_counts()

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    test_size=0.3, random_state=4)

Model Building

Let's start by splitting our data into a training set and a test set

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Running RFE with the output number of the variable equal to 10
lm = LinearRegression()
lm.fit(X_train, y_train)
rfe = RFE(lm, 11) # running RFE
rfe = rfe.fit(X_train, y_train)



In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))


In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:

X_train.columns[~rfe.support_]

In [None]:
# Build a third fitted model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)
lr_2 = sm.OLS(y_train, X_train_lm).fit()


# Print the summary of the model
print(lr_2.summary())

Checking VIF
Variance Inflation Factor or VIF, gives a basic quantitative idea about how much the
feature variables are correlated with each other. It is an extremely important parameter to
test our linear model. The formula for calculating VIF is:
Features VIF is

V IFi =1/1−Ri2

One of the common ways to check for multicollinearity is the Variance Inflation Factor (VIF):

VIF=1, Very Less Multicollinearity
VIF<5, Moderate Multicollinearity
VIF>5, Extreme Multicollinearity (This is what we have to avoid)
Compute the VIF scores:

In [None]:
# Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Create a dataframe that will contain the names of all the feature variables and th
vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif


In [None]:
X_train = X_train.drop(['Category'], axis=1)


In [None]:
# Create a dataframe that will contain the names of all the feature variables and th
vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif


In [None]:
# Build a third fitted model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)
lr_2 = sm.OLS(y_train, X_train_lm).fit()
# Print the summary of the model
print(lr_2.summary())

In [None]:
X_train = X_train.drop(['Actor'], axis=1)
# Build a third fitted model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)
lr_2 = sm.OLS(y_train, X_train_lm).fit()
# Print the summary of the model
print(lr_2.summary())

In [None]:
X_train = X_train.drop(['Movie'], axis=1)
# Build a third fitted model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)
lr_2 = sm.OLS(y_train, X_train_lm).fit()

# Print the summary of the model
print(lr_2.summary())

In [None]:
# Create a dataframe that will contain the names of all the feature variables and th
vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif


In [None]:
X_train = X_train.drop(['IMDB_Score'], axis=1)
X_train = X_train.drop(['RunTime'], axis=1)

In [None]:
# Create a dataframe that will contain the names of all the feature variables and th
vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif


In [None]:
# Build a third fitted model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)
lr_2 = sm.OLS(y_train, X_train_lm).fit()
# Print the summary of the model
print(lr_2.summary())

In [None]:
X_train = X_train.drop(['Unnamed: 0'], axis=1)
# Build a third fitted model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)
lr_2 = sm.OLS(y_train, X_train_lm).fit()
# Print the summary of the model
print(lr_2.summary())

Residual Analysis of the train data

So, now to check if the error terms are also normally distributed (which is infact, one of
the major assumptions of linear regression), let us plot the histogram of the error terms
and see what it looks like.

In [None]:
y_train_cnt = lr_2.predict(X_train_lm)

In [None]:
# Importing the required libraries for plots.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Plot the histogram of the error terms
fig = plt.figure()
sns.distplot((y_train - y_train_cnt), bins = 20)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18)

In [None]:
#Making Predictions
X_test.columns
X_test.head(2)

In [None]:
# Now let's use our model to make predictions.
# Creating X_test_new dataframe by dropping variables from X_test
X_test_new = X_test[X_train.columns]
# Adding a constant variable
X_test_new = sm.add_constant(X_test_new)
# Making predictions
y_pred = lr_2.predict(X_test_new)
y_pred


In [None]:
lr_2.params


In [None]:
#Model Evaluation

In [None]:
# Plotting y_test and y_pred to understand the spread.
fig = plt.figure()
plt.scatter(y_test,y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20) # Plot heading
plt.xlabel('y_test', fontsize=18) # X-label
plt.ylabel('y_pred', fontsize=16) # Y-label

In [None]:
# Converting y_test to dataframe
X_test_df = pd.DataFrame(X_test)
# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_pred)

# Let's see the head
y_pred_1.head()


In [None]:
# Putting CustID to index
X_test_df['ID'] = X_test_df.index

# Removing index for both dataframes to append them side by side
y_pred_1.reset_index(drop=True, inplace=True)
X_test_df.reset_index(drop=True, inplace=True)

# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([X_test_df, y_pred_1],axis=1)
y_pred_final.head()

In [None]:
# Renaming the column
y_pred_final= y_pred_final.rename(columns={ 0 : 'Rating'})
y_pred_final.head(3)
