In [120]:
#CodSoft Internship
#Task 2: Movie rating prediction with Python
#Build a model that predicts the rating of a movie based on features like genre, director, and actors. Use regression techniques to tackle this problem.
#The goal is to analyze historical movie data and develop a model that accurately estimates the rating given to a movie by users or critics.
#Movie rating prediction project enables you to explore data analysis, preprocessing, feature engineering, and machine learning modeling techniques. It provides insights into the factors that influence movie ratings and allows you to build a model that can estimate the ratings of movies accurately.
#Import necessary libraries
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import ydata_profiling as pandas_profiling
from plotly.subplots import make_subplots
from plotly.offline import iplot

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
sns.set_theme(palette=sns.color_palette("pastel"), style="whitegrid")

In [121]:
# Load the dataset downloaded from Kaggle.com
df = pd.read_csv('IMDb Movies India.csv', encoding= 'Latin1')

In [122]:
# View the data
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [124]:
#Generate a pandas profiling report 
report = pandas_profiling.ProfileReport(df)
display(report)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:05<00:00,  1.79it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [125]:
# Perform data cleaning
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [126]:
# Perform removing of Nan values
df = df.dropna(subset=['Year', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Rating'])

In [127]:
df = df.dropna().reset_index(drop=True)

In [128]:
# Let us change the duration and votes to numeric
df['Duration'] = df.loc[:, 'Duration'].str.replace('min', '')
df['Duration'] = pd.to_numeric(df.loc[:, 'Duration'])

In [129]:
df['Votes'] = df.loc[:, 'Votes'].str.replace(",", "")
df['Votes'] = pd.to_numeric(df.loc[:, 'Votes'])


In [130]:
#data preprocessing and removing duplicates
df.drop_duplicates(inplace=True)

In [131]:
# Cleaning genre and year columns and adding new genre column for further analysis
df['Year'] = df.loc[:, 'Year'].str.extract(r"(\d{4})")
df['Year'] = pd.to_numeric(df.loc[:, 'Year'])

In [132]:
df['M_genre'] = df.loc[:, 'Genre'].str.extract("(^\w{1,11})")
df['M_genre'].unique()

array(['Drama', 'Comedy', 'Horror', 'Action', 'Crime', 'Thriller',
       'Adventure', 'Sport', 'Biography', 'Documentary', 'Mystery',
       'Musical', 'Romance', 'Fantasy', 'Sci', 'Family', 'History',
       'Animation', 'War', 'Music'], dtype=object)

In [133]:
df['Main_genre'] = df.loc[:, 'Genre'].str.extract('(^\w{1,11})')

In [134]:
df.describe()

Unnamed: 0,Year,Duration,Rating,Votes
count,5659.0,5659.0,5659.0,5659.0
mean,1996.24757,133.439124,5.898533,2697.649585
std,19.741839,25.319939,1.381165,13651.503584
min,1931.0,21.0,1.1,5.0
25%,1983.0,119.0,5.0,30.0
50%,2002.0,135.0,6.1,131.0
75%,2013.0,150.0,6.9,922.5
max,2021.0,321.0,10.0,591417.0


In [135]:
df.shape

(5659, 12)

In [136]:
# Distribution of movies across genre
df.Genre.value_counts().reset_index()

Unnamed: 0,Genre,count
0,Drama,844
1,"Drama, Romance",332
2,"Action, Crime, Drama",329
3,"Action, Drama",206
4,"Comedy, Drama",205
...,...,...
371,"Action, Comedy, War",1
372,"Mystery, Sci-Fi",1
373,"Horror, Romance, Sci-Fi",1
374,"Romance, Musical, Drama",1


In [137]:
genres = df.Main_genre.value_counts().reset_index()
iplot(px.pie(data_frame=genres, names=genres.Main_genre, values=genres['count'], title='Number of movies by Genre', height=1050)\
    .update_traces(textinfo='value+percent'))

In [138]:
#Filling the missing values
df['Rating'].fillna(df['Rating'].mean(), inplace=True)
df['Votes'].fillna(0, inplace=True)

In [139]:
# Checking for any remaining missing values
df.isnull().sum()

Name          0
Year          0
Duration      0
Genre         0
Rating        0
Votes         0
Director      0
Actor 1       0
Actor 2       0
Actor 3       0
M_genre       0
Main_genre    0
dtype: int64

In [140]:
#feature and target separation
x = df[['Year', 'Duration', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]
y = df['Rating']

In [141]:
# Handling categorical variables through one-hot encoding
x =pd.get_dummies(x, columns=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], drop_first=True)

In [142]:
#display the transformed features
x.head()

Unnamed: 0,Year,Duration,"Genre_Action, Adventure","Genre_Action, Adventure, Biography","Genre_Action, Adventure, Comedy","Genre_Action, Adventure, Crime","Genre_Action, Adventure, Drama","Genre_Action, Adventure, Family","Genre_Action, Adventure, Fantasy","Genre_Action, Adventure, History",...,Actor 3_Zara Shah,Actor 3_Zareen Khan,Actor 3_Zarine Ali,Actor 3_Zayed Khan,Actor 3_Zeenat Aman,Actor 3_Zeeshan Khan,Actor 3_Zeishan Quadri,Actor 3_Zenobia Shroff,Actor 3_Zoya Hussain,Actor 3_Zulfi Sayed
0,2019,109,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2019,110,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1997,147,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2005,142,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2012,82,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [143]:
#Splitting the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [144]:
f'Training set size:{x_train.shape[0]}, Testing set size:{x_test.shape[0]}'

'Training set size:4527, Testing set size:1132'

In [145]:
df.groupby('Main_genre')['Rating'].mean().sort_values(ascending = False)

Main_genre
Documentary    7.608333
Music          7.466667
History        7.225000
Biography      6.697619
Sci            6.325000
Family         6.296154
Fantasy        6.251613
Drama          6.248697
Adventure      6.181905
Crime          6.124723
Musical        6.095556
Animation      6.072500
Mystery        5.854237
Comedy         5.838423
Sport          5.800000
Romance        5.598742
Action         5.511985
Thriller       5.332584
Horror         4.687500
War            4.333333
Name: Rating, dtype: float64

In [146]:
# Bar chart of average ratings by genre
genre_avg = df.groupby('Main_genre')['Rating'].mean().sort_values(ascending=False)
iplot(px.bar(data_frame=genre_avg.reset_index(), x='Main_genre', y='Rating', 
             title='Average Ratings by Genre', labels={'Main_genre': 'Genre', 'Rating': 'Average Rating'}, height=1050)
      .update_traces(texttemplate='%{y:.2f}', textposition='outside'))


In [147]:
fig = px.box(df, x='Main_genre', y='Rating', 
             title='Interactive Rating Distribution by Genre',
             points='all')

fig.update_layout(
    xaxis_title='Movie Genre',
    yaxis_title='Rating',
    title_font=dict(size=20, family='Arial', color='black'),
    xaxis_tickangle=-45,
    yaxis_tickangle=0,
)

fig.show()

In [148]:

import plotly.graph_objs as go

# Scatter plot of directors experience vs average rating
# Calculate the average rating for each director
director_avg = df.groupby('Director')['Rating'].mean().sort_values(ascending=False)

# Get the number of movies directed by each director
movie_counts = df.groupby('Director')['Name'].count()

# Create a scatter plot using plotly
data = [go.Scatter(x=director_avg.index, y=director_avg.values, mode='markers',
                   marker=dict(size=movie_counts.values * 5,  # Scale marker size by the number of movies directed
                               color='green'),  # Set marker color
                   text=[f"Movies: {num}" for num in movie_counts.values])]  # Hover text

# Customize the layout
layout = go.Layout(title="Directors Experience vs Average Rating",
                   xaxis=dict(title="Director", tickangle=45),
                   yaxis=dict(title="Average Rating"),
                   hovermode='closest')  # Show hover information for the closest point

# Create the figure and display it using iplot
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [149]:
#Filter data for movies directed by Rajkumar Hirani
Khan_movie_directors = df[df['Director']=='Rajkumar Hirani'].sort_values(by='Name')

#Create a line plot for the movies directed by Rajkumar Hirani
data = [go.Scatter(x=Khan_movie_directors['Name'], y=Khan_movie_directors['Rating'], mode='lines+markers', #Shows lines and markers                      
                    marker=dict(size=10, color='orange'),  # Set marker size and color
                    text=Khan_movie_directors['Name'],
                    line=dict(width=2))]  # Hover text
#Customize the layout
layout = go.Layout(title="Movies Directed by Rajkumar Hirani",
                   xaxis=dict(title="Movie Name", tickangle=45),
                   yaxis=dict(title="Rating"),
                   hovermode='closest') # Show hover information for the closest point

#Create the figure and display it using iplot
fig = go.Figure(data=data, layout=layout)
iplot(fig)




In [150]:
# Create a scatter plot of rating vs duration
fig = px.scatter(df, x='Duration', y='Rating', color='Main_genre',
                 title='Movie Rating vs Duration by Genre',
                 labels={'Duration': 'Duration (minutes)', 'Rating': 'Rating'},
                 hover_data=['Name'])
fig.update_traces(marker=dict(size=10, opacity=0.7, line=dict(width=1, color='black')))
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  xaxis_title='Duration (minutes)',
                  yaxis_title='Rating',
                  legend_title_text='Genre',
                  xaxis_tickangle=-45,
                  yaxis_tickangle=0)
fig.show()


In [151]:
# Create a scatter plot of rating vs votes
fig = px.scatter(df, x='Votes', y='Rating', color='Main_genre',
                 title='Movie Rating vs Votes by Genre',
                 labels={'Votes': 'Votes', 'Rating': 'Rating'},
                 hover_data=['Name'])   
fig.update_traces(marker=dict(size=10, opacity=0.7, line=dict(width=1, color='black')))
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  xaxis_title='Votes',
                  yaxis_title='Rating',
                  legend_title_text='Genre',
                  xaxis_tickangle=-45,
                  yaxis_tickangle=0)
fig.show()


In [152]:
# Create a scatter plot of rating vs year
fig = px.scatter(df, x='Year', y='Rating', color='Main_genre',
                 title='Movie Rating vs Year by Genre',
                 labels={'Year': 'Year', 'Rating': 'Rating'},
                 hover_data=['Name'])
fig.update_traces(marker=dict(size=10, opacity=0.7, line=dict(width=1, color='black')))
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  xaxis_title='Year',
                  yaxis_title='Rating',
                  legend_title_text='Genre',
                  xaxis_tickangle=-45,
                  yaxis_tickangle=0)
fig.show()


In [153]:
# Create a scatter plot of rating vs actor 1
fig = px.scatter(df, x='Actor 1', y='Rating', color='Main_genre',
                 title='Movie Rating vs Actor 1 by Genre',
                 labels={'Actor 1': 'Actor 1', 'Rating': 'Rating'},
                 hover_data=['Name'])
fig.update_traces(marker=dict(size=10, opacity=0.7, line=dict(width=1, color='black')))
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  xaxis_title='Actor 1',
                  yaxis_title='Rating',
                  legend_title_text='Genre',
                  xaxis_tickangle=-45,
                  yaxis_tickangle=0)
fig.show()
# Create a scatter plot of rating vs actor 2
fig = px.scatter(df, x='Actor 2', y='Rating', color='Main_genre',
                 title='Movie Rating vs Actor 2 by Genre',
                 labels={'Actor 2': 'Actor 2', 'Rating': 'Rating'},
                 hover_data=['Name'])
fig.update_traces(marker=dict(size=10, opacity=0.7, line=dict(width=1, color='black')))
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  xaxis_title='Actor 2',
                  yaxis_title='Rating',
                  legend_title_text='Genre',
                  xaxis_tickangle=-45,
                  yaxis_tickangle=0)
fig.show()
# Create a scatter plot of rating vs actor 3
fig = px.scatter(df, x='Actor 3', y='Rating', color='Main_genre',
                 title='Movie Rating vs Actor 3 by Genre',
                 labels={'Actor 3': 'Actor 3', 'Rating': 'Rating'},
                 hover_data=['Name'])
fig.update_traces(marker=dict(size=10, opacity=0.7, line=dict(width=1, color='black')))
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  xaxis_title='Actor 3',
                  yaxis_title='Rating',
                  legend_title_text='Genre',
                  xaxis_tickangle=-45,
                  yaxis_tickangle=0)
fig.show()
# Create a scatter plot of rating vs director
fig = px.scatter(df, x='Director', y='Rating', color='Main_genre',
                 title='Movie Rating vs Director by Genre',
                 labels={'Director': 'Director', 'Rating': 'Rating'},
                 hover_data=['Name'])
fig.update_traces(marker=dict(size=10, opacity=0.7, line=dict(width=1, color='black'))) 
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  xaxis_title='Director',
                  yaxis_title='Rating',
                  legend_title_text='Genre',
                  xaxis_tickangle=-45,
                  yaxis_tickangle=0)
fig.show()