In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [10]:
# Project: Movie Rating Prediction
# Objective: Predict the IMDb rating of a movie based on features like genre, director, and actors.

# Load the dataset
file_path = "IMDb Movies India.csv"  # Update if needed
df = pd.read_csv(r'C:\Users\MANSI\Documents\task\IMDb Movies India.csv\IMDb Movies India.csv', encoding="ISO-8859-1")

In [11]:
# Data Preprocessing
print("Initial dataset shape:", df.shape)

Initial dataset shape: (15509, 10)


In [12]:
# Drop rows with missing target values (Rating)
df = df.dropna(subset=["Rating"])
print("Dataset shape after dropping missing ratings:", df.shape)

Dataset shape after dropping missing ratings: (7919, 10)


In [13]:
# Fill missing values in categorical columns
categorical_cols = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
df[categorical_cols] = df[categorical_cols].fillna("Unknown")

In [14]:
# Convert 'Votes' to numeric and handle missing values
df["Votes"] = pd.to_numeric(df["Votes"], errors="coerce").fillna(0)

In [15]:
# Extract numerical and categorical features
X_numeric = df[["Votes"]]
y = df["Rating"].astype(float)

In [16]:
# Apply Ordinal Encoding to categorical features
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_categorical = ordinal_encoder.fit_transform(df[categorical_cols])

In [17]:
# Combine numerical and categorical features
X = np.hstack((X_numeric, X_categorical))

In [18]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (6335, 6)
Testing set size: (1584, 6)


In [19]:
# Model Training
print("Training the RandomForestRegressor model...")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Model training completed.")

Training the RandomForestRegressor model...
Model training completed.


In [20]:
# Model Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model Mean Absolute Error (MAE): {mae:.2f}")

Model Mean Absolute Error (MAE): 0.99


In [21]:
# Example Movie Prediction
print("\nExample Movie Prediction")
example_movie = np.array([[50000, 1, 10, 50, 100, 200]])  # Example input
predicted_rating = model.predict(example_movie)[0]
print(f"Predicted Movie Rating: {predicted_rating:.2f}")


Example Movie Prediction
Predicted Movie Rating: 4.70


In [22]:
# Conclusion
print("\nProject Summary:")
print("- The model was trained using IMDb movie data.")
print("- Features like genre, director, and actors were encoded using ordinal encoding.")
print("- A RandomForestRegressor was used to predict movie ratings.")
print(f"- The model achieved an MAE of {mae:.2f}, indicating reasonable prediction accuracy.")
print("- The model can be further improved with more features and hyperparameter tuning.")


Project Summary:
- The model was trained using IMDb movie data.
- Features like genre, director, and actors were encoded using ordinal encoding.
- A RandomForestRegressor was used to predict movie ratings.
- The model achieved an MAE of 0.99, indicating reasonable prediction accuracy.
- The model can be further improved with more features and hyperparameter tuning.
