### TASK - 2 : Movie Rating Prediction

In [1]:
#Import Libraries for data processing and modelling

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df = pd.read_csv('IMDb Movies India.csv', encoding='unicode_escape')

In [3]:
# Dataset First Look

df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
# get the shape of the dataset

df.shape

(15509, 10)

In [5]:
# check the null values

df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [7]:
# count the duplicated values

df.duplicated().sum()

6

In [8]:
# clean the null and duplicate values

df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [9]:
# check the final rows
df.shape

(5659, 10)

In [10]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [11]:
# Replacing the brackets from year column

df['Year'] = df['Year'].str.replace(r'[()]', '', regex=True).astype(int)

In [12]:
# Remove the min word from 'Duration' column and convert all values to numeric

df['Duration'] = pd.to_numeric(df['Duration'].str.replace(' min', ''))

In [13]:
df.Genre

1                            Drama
3                  Comedy, Romance
5           Comedy, Drama, Musical
6              Drama, Romance, War
8        Horror, Mystery, Thriller
                   ...            
15493                        Drama
15494    Biography, Drama, History
15503         Action, Crime, Drama
15505                Action, Drama
15508                Action, Drama
Name: Genre, Length: 5659, dtype: object

In [14]:
# Splitting the genre by (,) to keep only unique genres and replacing the null values with mode

df['Genre'] = df['Genre'].str.split(', ')
df = df.explode('Genre')
df.Genre = df['Genre'].fillna(df['Genre'].mode()[0])

In [15]:
df.Genre

1          Drama
3         Comedy
3        Romance
5         Comedy
5          Drama
          ...   
15503      Drama
15505     Action
15505      Drama
15508     Action
15508      Drama
Name: Genre, Length: 11979, dtype: object

In [16]:
# Convert 'Votes' to numeric and replace the , to keep only numerical part

df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',', ''))

In [17]:
# Checking the dataset is there any null values present and data types of the features present

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11979 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      11979 non-null  object 
 1   Year      11979 non-null  int32  
 2   Duration  11979 non-null  int64  
 3   Genre     11979 non-null  object 
 4   Rating    11979 non-null  float64
 5   Votes     11979 non-null  int64  
 6   Director  11979 non-null  object 
 7   Actor 1   11979 non-null  object 
 8   Actor 2   11979 non-null  object 
 9   Actor 3   11979 non-null  object 
dtypes: float64(1), int32(1), int64(2), object(6)
memory usage: 982.7+ KB


In [18]:
# Importing libraries for model building

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

In [19]:
# Dropping Name column because it doesn't impact the outcome
df.drop('Name', axis = 1, inplace = True)

In [20]:
# Grouping the columns with their average rating and then creating a new feature

df['Genre_mean_rating'] = df.groupby('Genre')['Rating'].transform('mean')
df['Director_encoded'] = df.groupby('Director')['Rating'].transform('mean')
df['Actor1_encoded'] = df.groupby('Actor 1')['Rating'].transform('mean')
df['Actor2_encoded'] = df.groupby('Actor 2')['Rating'].transform('mean')
df['Actor3_encoded'] = df.groupby('Actor 3')['Rating'].transform('mean')

In [21]:
# Splitting the dataset into training and testing parts

X = df[[ 'Year', 'Votes', 'Duration', 'Genre_mean_rating','Director_encoded','Actor1_encoded', 'Actor2_encoded', 'Actor3_encoded']]
Y = df.Rating.copy()
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [22]:
# Building Linear Regression model and training them

model = LinearRegression()
model.fit(X_train,y_train)
model_prediction = model.predict(X_test)

In [23]:
# Evaluating the performance of model with evaluation metrics

print('Mean squared error: ',mean_squared_error(y_test, model_prediction))
print('Mean absolute error: ',mean_absolute_error(y_test, model_prediction))
print('R2 score: ',r2_score(y_test, model_prediction))

Mean squared error:  0.4465441653985702
Mean absolute error:  0.4921902540765641
R2 score:  0.7641133663863862


In [24]:
# For testing, We create a new dataframe with values close to the any of our existing data to evaluate.

data = {'Year': [2022], 'Votes': [81], 'Duration': [117], 'Genre_mean_rating': [7.1], 'Director_encoded': [6.7], 'Actor1_encoded': [5.5], 'Actor2_encoded': [4.8], 'Actor3_encoded': [8.7]}
test_data = pd.DataFrame(data)

In [25]:
# Predict the movie rating by entered data
rating_prediction = model.predict(test_data)

# Display the predicted result from the Model
print("Predicted Rating:", round(rating_prediction[0], 1))
     

Predicted Rating: 6.8
