In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
print('pandas version', pd.__version__)
print('numpy version', np.__version__)
print('seaborn version', sns.__version__)
import warnings
# warnings.filterwarnings('ignore')

pandas version 2.0.3
numpy version 1.24.3
seaborn version 0.12.2


In [3]:
df = pd.read_csv('IMDb Movies India.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


    In this dataset, we notice the following:
- The Year is supposed to be a number but the entries are object
- Same remarques for the Duration and Votes Column
- All the column contains null values which we need to handle accordingly 

In [38]:
def drop_null(dataset):
    features = ['Name', 'Year', 'Genre', 'Director', 'Rating', 'Actor 1', 'Actor 2', 'Actor 3']
    dataset.dropna(subset=features, inplace=True)
    dataset.reset_index(drop=True, inplace=True)
    
    
def reformat(dataset):
    dataset['Name'] = dataset['Name'].str.extract('([A-Za-z\s\'\-]+)')
    dataset['Year'] = dataset['Year'].str.strip('()').astype(float)
    dataset['Duration'] = dataset['Duration'].str.rstrip(' min').astype(float)
    
    dataset['Genre'] = dataset['Genre'].str.split(', ')
    dataset = dataset.explode('Genre')
    dataset['Genre'].fillna(dataset['Genre'].mode()[0], inplace=True)
    
    dataset['Votes'] = dataset['Votes'].astype(float)
    dataset['Director'] = dataset['Director'].str.extract('([A-Za-z\s\'\-]+)')
    dataset['Actor 1'] = dataset['Actor 1'].str.extract('([A-Za-z\s\'\-]+)')
    dataset['Actor 2'] = dataset['Actor 2'].str.extract('([A-Za-z\s\'\-]+)')
    dataset['Actor 3'] = dataset['Actor 3'].str.extract('([A-Za-z\s\'\-]+)')
    
    return dataset

def fill_missing(dataset):
    dataset['Votes'].fillna(dataset['Votes'].mean(), inplace=True)
    dataset['Rating'].fillna(dataset['Rating'].mean(), inplace=True)
    dataset['Duration'].fillna(dataset['Duration'].mean(), inplace=True)
    dataset.reset_index(drop = True, inplace = True)

def cleaning_pipeline(dataset):
    drop_null(dataset)
    dataset = reformat(dataset)
    dataset = dataset.drop_duplicates(subset=['Name'], keep=False)
    dataset.reset_index(drop=True, inplace=True)
    fill_missing(dataset)
    return dataset

In [43]:
dataset = df.copy()
dataset = cleaning_pipeline(dataset)


In [45]:
dataset.describe()

Unnamed: 0,Year,Duration,Rating,Votes
count,2183.0,2183.0,2183.0,2183.0
mean,1995.847916,123.200551,5.831012,391.024279
std,20.028,20.472116,1.425445,3614.739255
min,1931.0,45.0,1.6,5.0
25%,1984.0,117.0,4.9,11.0
50%,1999.0,123.200551,6.0,23.0
75%,2014.0,132.0,6.8,70.5
max,2021.0,300.0,9.6,101014.0


# Feature enginnering

In [56]:
dataset.drop('Name', axis = 1, inplace = True)

In [57]:
dataset

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,2019.0,109.000000,7,7.0,8.0,358,Rasika Dugal,Vivek Ghamande,Arvind Jangid
1,2004.0,96.000000,7,6.2,17.0,615,Rati Agnihotri,Gulshan Grover,Atul Kulkarni
2,2016.0,120.000000,11,5.9,59.0,135,Pankaj Berry,Anubhav Dhir,Hritu Dudani
3,2005.0,116.000000,7,7.1,1002.0,125,Shabana Azmi,Konkona Sen Sharma,Rahul Bose
4,1993.0,168.000000,7,5.6,15.0,1279,Ronit Roy,Tisca Chopra,Shakti Kapoor
...,...,...,...,...,...,...,...,...,...
2178,2000.0,140.000000,7,5.7,7.0,1190,Sunil Sukthankar,Mita Vashisht,Milind Gunaji
2179,2018.0,100.000000,7,5.7,78.0,1120,Shashank Arora,Prince Daniel,Shatakshi Gupta
2180,1996.0,123.200551,0,4.7,29.0,38,Mahesh Anand,Vikas Anand,Bob Christo
2181,2015.0,115.000000,7,6.1,408.0,701,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana


In [12]:
# def find_unique_actors(dataset):
#     actors_concatenated = pd.concat([dataset['Actor 1'], dataset['Actor 2'], dataset['Actor 3']], ignore_index=True)
    
#     actors = pd.DataFrame({'Actor' : actors_concatenated})
#     actors.drop_duplicates(inplace=True)
#     actors.reset_index(drop = True, inplace = True)

#     return actors
# unique_actors = find_unique_actors(dataset)
# actor_dict = {}
# for i, actor in enumerate(unique_actors['Actor']):
#     actor_dict[actor] = i+1
    
# dataset['Actor 1'] = dataset['Actor 1'].map(actor_dict)
# dataset['Actor 2'] = dataset['Actor 2'].map(actor_dict)
# dataset['Actor 3'] = dataset['Actor 3'].map(actor_dict)


In [13]:
# def split_genre(dataset):
#     for index, row in dataset.iterrows():
#         genres = row['Genre'].split(', ')
#         for i, genre in enumerate(genres):
#             dataset.at[index, f'Genre {i+1}'] = genre

# split_genre(df_test)


In [68]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()


# dataset['Director'] = label_encoder.fit_transform(dataset['Director'])
# dataset['Genre'] = label_encoder.fit_transform(dataset['Genre'])

# # director_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
# # genre_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
# # this is a list of the data used to encode the Director feature of the dataset


# dictionary to store mappings
encoded_mappings = {}

columns_to_encode = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

for column in columns_to_encode:
    # Fit and transform the column to numerical values
    dataset[column] = label_encoder.fit_transform(dataset[column])
    
    unique_labels = label_encoder.classes_
    
    # Create a dictionary to map numerical values to labels
    mapping = {i: label for i, label in enumerate(unique_labels)}
    
    # Add the mapping to the dictionary of mappings
    encoded_mappings[column] = mapping

# Now encoded_mappings contains a dictionary for each encoded column
# print(encoded_mappings) # This will print out a large set of mapped values, so dont activate it unless you want to visualise them at whole.

# Prediction model

In [69]:

X = dataset[[ 'Year', 'Votes', 'Duration', 'Genre','Director','Actor 1', 'Actor 2', 'Actor 3']]
Y = dataset['Rating']

In [70]:
from sklearn.model_selection import train_test_split

In [71]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42) 

In [72]:
X_train

Unnamed: 0,Year,Votes,Duration,Genre,Director,Actor 1,Actor 2,Actor 3
812,2018.0,166.0,106.000000,7,721,448,1075,1270
256,2015.0,13.0,68.000000,2,1291,1231,824,1117
2083,1990.0,12.0,135.000000,0,410,265,847,459
1710,2017.0,8.0,54.000000,7,219,1103,323,516
561,2015.0,20.0,120.000000,0,157,537,702,795
...,...,...,...,...,...,...,...,...
1638,2017.0,61.0,112.000000,15,381,922,266,642
1095,2018.0,41.0,65.000000,7,123,635,1039,938
1130,2018.0,7.0,92.000000,4,1077,972,1305,746
1294,1987.0,9.0,123.200551,13,105,233,666,168


In [73]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [74]:
linear_regression = LinearRegression()
Rnd_forest_reg = RandomForestRegressor() # Defining our model
# Rnd_forest_classifier = RandomForestClassifier()

In [75]:
linear_regression_fit = linear_regression.fit(X_train, Y_train)
linear_regression_pred = linear_regression.predict(X_test)

Rnd_forest_reg_fit = Rnd_forest_reg.fit(X_train, Y_train)
Rnd_forest_reg_pred = Rnd_forest_reg.predict(X_test)

# Rnd_forest_classifier_fit =Rnd_forest_classifier.fit(X_train, Y_train)
# Rnd_forest_classifier_pred = Rnd_forest_classifier.predict(X_test)

In [76]:
from sklearn.metrics import r2_score

In [77]:
print('R2 score: ',r2_score(Y_test, linear_regression_pred))
print('R2 score: ', r2_score(Y_test, Rnd_forest_reg_pred))

R2 score:  0.02668165258912014
R2 score:  0.30375539481177705
