In [249]:
#Library Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re


import warnings
warnings.filterwarnings('ignore')

In [250]:
# Reading the CSV as UTF-8 raised a UnicodeDecodeError, so walk through a short
# list of candidate encodings and keep the first one that decodes cleanly.
# NOTE(review): latin1 maps every byte value, so the cp1252 attempt is
# presumably never reached -- consider trying cp1252 before latin1.
fallback_encodings = ('utf-8', 'utf-8-sig', 'latin1', 'cp1252')
for attempt, codec in enumerate(fallback_encodings, start=1):
    try:
        df = pd.read_csv("Movie dataset.csv", encoding=codec)
        break
    except UnicodeDecodeError:
        # Out of fallbacks: let the final decoding error propagate.
        if attempt == len(fallback_encodings):
            raise

### Let's explore the data to understand its structure, column types, and missing values

In [251]:
# Preview the first rows to see the column layout and the raw value formats.
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [252]:
df.isnull().sum()  # number of missing values in each column

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [253]:
df.shape  # (number of rows, number of columns)

(15509, 10)

In [254]:
df.info()  # per-column dtype and non-null count

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [255]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

6

In [256]:
# Distinct Rating values (NaN included) as a quick sanity check of the range.
df['Rating'].unique()

array([ nan,  7. ,  4.4,  4.7,  7.4,  5.6,  4. ,  6.2,  5.9,  6.5,  5.7,
        6.3,  7.2,  6.6,  7.3,  7.1,  6.9,  3.5,  5. ,  4.5,  6.4,  4.1,
        4.8,  8.1,  5.5,  6.8,  6.1,  7.7,  5.1,  7.6,  3.1,  3.3,  7.8,
        8.4,  5.2,  4.3,  5.8,  4.6,  7.5,  6.7,  3.6,  3.9,  5.4,  4.2,
        5.3,  3.4,  3. ,  8. ,  6. ,  3.8,  7.9,  2.7,  4.9,  2.4,  3.7,
        3.2,  2.5,  2.8,  2.6,  2.9,  8.2,  8.7,  8.3,  9.3,  8.8,  2.1,
        2.3,  8.5,  8.6,  9. ,  9.6,  1.7,  9.1,  2. ,  1.4,  8.9,  1.9,
        9.4,  9.7,  1.8,  9.2,  1.6, 10. ,  2.2,  1.1])

In [257]:
# Count the rows where BOTH Rating and Votes are missing.
# The two boolean masks have to be combined row by row *before* summing;
# applying `&` to the two column totals is a bitwise AND of two integers
# and does not count rows at all.
(df['Rating'].isnull() & df['Votes'].isnull()).sum()

7588

In [258]:
# Drop every row that has a missing value in any column; after exploring the
# data, imputing was judged not worthwhile for this task.
df = df.dropna()

In [259]:
df.isnull().sum()  # confirm that no missing values remain

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

### Let's clean the data. Several text columns contain unwanted characters.

In [261]:
# First 20 titles: several contain punctuation such as '#', '...', '@' and ':'.
df['Name'].head(20) 

1     #Gadhvi (He thought he was Gandhi)
3                                #Yaaram
5                   ...Aur Pyaar Ho Gaya
6                              ...Yahaan
8                     ?: A Question Mark
9                               @Andheri
10             1:1.6 An Ode to Lost Love
11                  1:13:7 Ek Tera Saath
12                              100 Days
13                             100% Love
15                           102 Not Out
18                             10ml LOVE
21                            12 O'Clock
22                            12 O'Clock
25                                 127 B
28           13B: Fear Has a New Address
30                        15 Park Avenue
32                           15th August
33                           16 December
34                                 18.11
Name: Name, dtype: object

In [262]:
def clean(text):
    """Remove every character that is not an ASCII letter, digit, or whitespace.

    Parameters
    ----------
    text : str
        Raw cell value, e.g. ``"(2019)"`` or ``"#Yaaram"``.

    Returns
    -------
    str
        ``text`` with all other characters deleted, e.g. ``"2019"``.
    """
    # The upper-case range must be A-Z. The range A-z also covers the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [263]:
# Run the clean() helper over every title to strip the special characters.
cleaned_names = df['Name'].map(clean)
df['Name'] = cleaned_names

In [264]:
# Year is stored as text like "(2019)": strip the parentheses with clean(),
# then cast the remaining digits to an integer.
year_digits = df['Year'].map(clean)
df['Year'] = year_digits.astype(int)

In [265]:
# for col in df.columns: 
  # df[col] = df[col].apply(clean)----- if all columns contains special characters we can use this

In [266]:
def remove_min(text):
    """Return ``text`` with every ' min' occurrence removed.

    The Duration column carries a unit suffix (e.g. '109 min'); dropping it
    leaves only the digits ('109').
    """
    unit_suffix = ' min'
    stripped = text.replace(unit_suffix, '')
    return stripped

In [267]:
# Strip the ' min' suffix from each duration, then convert the column from
# object to int.
duration_digits = df['Duration'].map(remove_min)
df['Duration'] = duration_digits.astype(int)

In [268]:
def Comma(text):
    """Return ``text`` with all commas removed, e.g. '1,086' -> '1086'."""
    comma_regex = re.compile(r',')
    return comma_regex.sub('', text)

In [269]:
# Remove the thousands separators (',') from the vote counts, then convert the
# column to integers.
votes_digits = df['Votes'].map(Comma)
df['Votes'] = votes_digits.astype(int)

In [270]:
# Column labels of the cleaned frame.
df.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

In [271]:
df.info()  # Year, Duration and Votes are now integer columns instead of object

<class 'pandas.core.frame.DataFrame'>
Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   int32  
 2   Duration  5659 non-null   int32  
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   int32  
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), int32(3), object(6)
memory usage: 420.0+ KB


In [282]:
#some more imports for ML task.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [275]:
# One-hot encode the categorical columns so the linear models can consume them.
# NOTE(review): this overwrites `df`, so the cell cannot be re-run without
# re-running the cleaning cells first (the listed columns no longer exist).
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
df = pd.get_dummies(df, columns=categorical_cols)

y = df['Rating']  # Target variable
X = df.drop(['Rating', 'Name'], axis=1)  # Features: everything except the target and the title

In [280]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [281]:
# Fit an ordinary least-squares baseline on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [283]:
# Predict ratings for the held-out test rows with the fitted linear model
y_pred = model.predict(X_test)

### After all that effort, the model performs poorly. Let's apply regularization to improve it.

In [285]:
# Score the plain linear regression on the test split: compute the metrics
# first, then report them together.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R2_score:", r2)
print('RMSE:', np.sqrt(mse))  # square root of the MSE, on the same scale as Rating

Mean Squared Error: 8.516158912203164
R2_score: -3.463002525478829
RMSE: 2.918245862192417


### Let's apply L1 (Lasso) and L2 (Ridge) regularization and see what happens.

In [288]:
from sklearn.linear_model import Ridge, Lasso

# Ridge regression (L2 regularization); alpha can be tuned.
ridge_model = Ridge(alpha=0.5).fit(X_train, y_train)

# Ratings predicted for the test split
y_pred_ridge = ridge_model.predict(X_test)

# MSE, R2 and RMSE for the Ridge model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2 = r2_score(y_test, y_pred_ridge)

print("Mean Squared Error (Ridge):", mse_ridge)
print("R2_score:", r2)
print('RMSE:', np.sqrt(mse_ridge))

Mean Squared Error (Ridge): 1.6190892815090994
R2_score: 0.15149545389572594
RMSE: 1.2724343918289458


### Hurray! The Ridge model performs much better than before, although an R² of about 0.15 means it still explains only a small share of the variance in ratings.

In [289]:
# Lasso regression (L1 regularization); alpha can be tuned.
# Lasso is imported in the Ridge cell, so that cell must run first.
lasso_model = Lasso(alpha=0.5).fit(X_train, y_train)

# Ratings predicted for the test split
y_pred_lasso = lasso_model.predict(X_test)

# MSE, R2 and RMSE for the Lasso model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2 = r2_score(y_test, y_pred_lasso)

print("Mean Squared Error (Lasso):", mse_lasso)
print("R2_score:", r2)
print('RMSE:', np.sqrt(mse_lasso))

Mean Squared Error (Lasso): 1.7474439790853988
R2_score: 0.08422952504848025
RMSE: 1.3219092174144935
