## Feature Engineering

### Import the relevant librarires 

In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Load the dataset


In [2]:
imdb_df = pd.read_csv('imdb_df.csv', encoding='latin1')
imdb_df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,#Gadhvi (He thought he was Gandhi),2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
1,#Yaaram,2019,110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
2,#Yaaram,2019,110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,...Aur Pyaar Ho Gaya,1997,147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
4,...Aur Pyaar Ho Gaya,1997,147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
...,...,...,...,...,...,...,...,...,...,...
11974,Zulm Ki Zanjeer,1989,125,Drama,5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
11975,Zulmi,1999,129,Action,4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
11976,Zulmi,1999,129,Drama,4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
11977,Zulm-O-Sitam,1998,130,Action,6.2,20,K.C. Bokadia,Dharmendra,Jaya Prada,Arjun Sarja


### Drop 'Name' column because it doesn't have any predictive power

In [3]:
imdb_df.drop('Name', axis = 1, inplace = True)
imdb_df.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
1,2019,110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
2,2019,110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,1997,147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
4,1997,147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


## Grouping the columns with their averge rating and then creating a new feature

In [12]:
genre_mean_rating = imdb_df.groupby('Genre')['Rating'].transform('mean')
imdb_df['Genre_mean_rating'] = genre_mean_rating

director_mean_rating = imdb_df.groupby('Director')['Rating'].transform('mean')
imdb_df['Director_encoded'] = director_mean_rating

actor1_mean_rating = imdb_df.groupby('Actor 1')['Rating'].transform('mean')
imdb_df['Actor1_encoded'] = actor1_mean_rating

actor2_mean_rating = imdb_df.groupby('Actor 2')['Rating'].transform('mean')
imdb_df['Actor2_encoded'] = actor2_mean_rating

actor3_mean_rating = imdb_df.groupby('Actor 3')['Rating'].transform('mean')
imdb_df['Actor3_encoded'] = actor3_mean_rating

### inputs and targets

In [32]:
X = imdb_df[['Year','Votes','Duration','Genre_mean_rating', 'Director_encoded',
       'Actor1_encoded', 'Actor2_encoded', 'Actor3_encoded']]

y = imdb_df['Rating']

## train, test


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Building the model

In [36]:
Model = LinearRegression()
Model.fit(X_train,y_train)
Model_pred = Model.predict(X_test)

## Evaluation

In [53]:
print('The performance evaluation og loistic regression is below: ', '\n')
print('Mena squared error: ', mean_squared_error(y_test, Model_pred))
print('Mean absolute error: ', mean_squared_error(y_test, Model_pred))
print('R2 score: ', r2_score(y_test, Model_pred))

print('Accuracy:',  2_score(y_test, Model_pred) * 100, '%' )

The performance evaluation og loistic regression is below:  

Mena squared error:  0.4465441653985704
Mean absolute error:  0.4465441653985704
R2 score:  0.7641133663863862
Accuracy: 76.41133663863862 %


## Model Testing

In [41]:
X.head(5)

Unnamed: 0,Year,Votes,Duration,Genre_mean_rating,Director_encoded,Actor1_encoded,Actor2_encoded,Actor3_encoded
0,2019,8,109,6.056744,7.0,6.85,7.0,7.0
1,2019,35,110,5.751042,4.4,5.25,4.4,4.46
2,2019,35,110,5.811087,4.4,5.25,4.4,4.46
3,1997,827,147,5.751042,5.335135,4.793617,5.73,5.93
4,1997,827,147,6.056744,5.335135,4.793617,5.73,5.93


In [42]:
y.head(5)

0    7.0
1    4.4
2    4.4
3    4.7
4    4.7
Name: Rating, dtype: float64

In [51]:
# For testing, we create a dataframe with values close to the any of our existing data to evaluate.

data = {'Year': [2019], 'Votes': [36], 'Duration': [111], 'Genre_mean_rating': [5.8], 'Director_encoded': [4.5], 'Actor1_encoded': [5.3], 'Actor2_encoded': [4.5], 'Actor3_encoded': [5.1]}
trail = pd.DataFrame(data)

In [52]:
# Predict the movie rating by entered data
rating_predicted = Model.predict(trail)

# Display the predicted result from the model
print('Predicted Rating:', rating_predicted[0])

Predicted Rating: 4.420069225210622
