In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#  **Summary**
 It preprocesses data by handling missing values, converting numerical fields, and encoding categorical variables. Feature engineering includes calculating the director’s success rate and the average rating of similar movies by genre. The model, trained using a Random Forest Regressor, is evaluated using RMSE and R² scores. The dataset is split into training and testing sets to ensure robust performance. Finally, the trained model is saved as a .pkl file for future use. This structured approach enhances predictive accuracy and ensures data-driven insights.

In [17]:
# Import the  Movie Rating  Prediction dataset from the specified CSV file using Pandas
df = pd.read_csv('/content/drive/MyDrive/GROWTH/IMDb Movies India.csv', encoding='ISO-8859-1')
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


# **Basic Metrics**

In [18]:
# Checking Shape of data
df.shape

(15509, 10)

In [19]:
# Columns in data
df.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

In [20]:
# Overview of the dataset structure
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


# **Data Conversation & Data Cleaning**

In [25]:
df.isna().sum()

Unnamed: 0,0
Name,0
Year,528
Duration,8269
Genre,1877
Rating,7590
Votes,7589
Director,525
Actor 1,1617
Actor 2,2384
Actor 3,3144


In [24]:
# Ensure 'Year' is a string and extract only numeric values
df['Year'] = df['Year'].astype(str).str.extract(r'(\d+)')
# Convert to float, handling errors gracefully
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Ensure 'Duration' is a string and extract only numeric values
df['Duration'] = df['Duration'].astype(str).str.extract(r'(\d+)')

# Convert to numeric, handling errors
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
# Clean 'Votes' column before conversion
df['Votes'] = df['Votes'].str.replace(',', '', regex=True)  # Remove commas
df['Votes'] = df['Votes'].str.replace(r'[^0-9.]', '', regex=True)  # Remove non-numeric characters
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')  # Convert to float, set errors to NaN if conversion fails

# Display the first few rows
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),2019.0,109.0,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,2021.0,90.0,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,2019.0,110.0,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,2010.0,105.0,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [26]:
# Handle missing values
df.fillna({'Rating': df['Rating'].median(), 'Duration': df['Duration'].median(), 'Votes': 0}, inplace=True)
df.dropna(subset=['Director', 'Genre'], inplace=True)# Convert 'Year' and 'Duration' to numeric

In [27]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,131.0,Drama,6.0,0.0,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),2019.0,109.0,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,2021.0,90.0,"Drama, Musical",6.0,0.0,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,2019.0,110.0,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,2010.0,105.0,Drama,6.0,0.0,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


# **Comment**
It extracts numeric values from the "Year" and "Duration" columns, converts them to numeric types, and cleans the "Votes" column by removing commas and non-numeric characters. Errors are handled using errors='coerce', replacing invalid values with NaN. Missing values in "Rating" and "Duration" are filled with their median values, while "Votes" is set to zero. Additionally, rows with missing "Director" or "Genre" values are dropped to maintain data consistency. Finally, df.head() previews the cleaned dataset for verification

In [28]:
# Feature Engineering
# Director Success Rate
director_avg_rating = df.groupby('Director')['Rating'].mean().to_dict()
df['Director_Success'] = df['Director'].map(director_avg_rating)

# Average Rating of Similar Movies
genre_avg_rating = df.groupby('Genre')['Rating'].mean().to_dict()
df['Genre_Avg_Rating'] = df['Genre'].map(genre_avg_rating)

# Encode Categorical Features
label_enc = LabelEncoder()
df['Genre'] = label_enc.fit_transform(df['Genre'])
df['Director'] = label_enc.fit_transform(df['Director'])

# Define Features and Target
features = ['Year', 'Duration', 'Votes', 'Director_Success', 'Genre_Avg_Rating', 'Genre', 'Director']
X = df[features]
y = df['Rating']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [29]:
# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")

# Save Model
import joblib
joblib.dump(model, "movie_rating_predictor.pkl")

print("Model saved successfully!")


RMSE: 0.6426983395737864
R² Score: 0.6387357422347543
Model saved successfully!


# **Comment**
This feature engineering approach enhances predictive accuracy by computing the **Director Success Rate**, representing the director’s average movie rating, and the **Average Rating of Similar Movies**, derived from genre-based grouping. Categorical variables are encoded, and a **Random Forest Regressor** is trained and evaluated using RMSE and R². The model is then saved for future use.

# **Conclusion**
The implemented predictive model effectively estimates movie ratings using key attributes such as year, duration, votes, director success rate, and average genre rating. Data preprocessing ensures accuracy by handling missing values and encoding categorical variables. Feature engineering enhances model performance by incorporating director-specific success metrics and genre-based rating averages. The Random Forest Regressor is trained and evaluated using RMSE and R² scores, ensuring robust predictions. The final trained model is saved for future use, enabling data-driven insights into movie rating trends. This structured approach improves predictive accuracy, making it valuable for industry applications in film analytics and recommendation systems.