# 🎬 Movie Rating Prediction using Regression

This notebook trains a random-forest regression model that predicts a movie's IMDb rating from its genres, director, actors, duration, and budget, using a real-world dataset.

📦 Dataset Source: [IMDb 5000 Movie Dataset](https://www.key2stats.com/data-set/view/1547)

In [None]:
!pip install pandas scikit-learn joblib

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
# Pull the IMDb 5000 movie CSV directly from key2stats into a DataFrame.
url = "https://www.key2stats.com/IMDB_5000_Movie_Dataset_1547_45.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Show the first rows as a quick sanity check on the parsed columns.
df.head()

In [None]:
# Clean and preprocess
# Guarded so the cell can be re-run safely: once `actors` exists, the raw
# actor columns have already been dropped and repeating the steps would
# raise KeyError.
if 'actors' not in df.columns:
    actor_cols = ['actor_1_name', 'actor_2_name', 'actor_3_name']
    keep_cols = ['genres', 'director_name', *actor_cols, 'duration', 'budget', 'imdb_score']
    df = (
        df[keep_cols]
        # Drop every row that is missing any feature or the target.
        .dropna()
        # Join the three actor-name columns into one comma-separated string.
        .assign(actors=lambda d: d[actor_cols[0]] + ',' + d[actor_cols[1]] + ',' + d[actor_cols[2]])
        # The individual actor columns are no longer needed once combined.
        .drop(columns=actor_cols)
    )
df.head()

In [None]:
# Feature/target layout: the regression target plus the numeric and
# categorical input columns used by the model.
target = 'imdb_score'
num_features = ['duration', 'budget']
cat_features = ['genres', 'director_name', 'actors']

# Categorical columns come first, then the numeric ones, in the feature frame.
feature_columns = [*cat_features, *num_features]
X = df[feature_columns]
y = df[target]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Pipeline setup
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at predict time); all remaining columns pass through unchanged.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_encoder, cat_features)],
    remainder='passthrough',
)

# Random forest of 100 trees with a fixed seed so fits are repeatable.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Train model
model.fit(X_train, y_train)

In [None]:
# Evaluation
# Score the fitted pipeline on the held-out test split.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Take the square root of the MSE explicitly: the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

print(f"R² Score: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")

In [None]:
# Persist the fitted pipeline (preprocessing + regressor) to disk with joblib
# so it can be reloaded later without retraining.
joblib.dump(value=model, filename='movie_rating_model.pkl')
print("Model saved as 'movie_rating_model.pkl'")