In [None]:
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the two raw CSV files that feed the rest of the notebook.
movies = pd.read_csv('data/movies.csv')
cr = pd.read_csv('data/credits.csv')

# Columns that are removed right after the merge. Any name that is absent
# from the merged frame is skipped silently because of errors='ignore'.
unused_columns = [
    'homepage', 'status', 'original_title', 'overview',
    'spoken_languages', 'tagline', 'title_y', 'movie_id',
    'production_countries', 'video', 'adult',
]

# Join each movie row with its credits row: movies.id <-> credits.movie_id.
merged = pd.merge(movies, cr, left_on='id', right_on='movie_id')
df = merged.drop(columns=unused_columns, errors='ignore')


In [None]:
def extract_names(x):
    """Return the ``name`` value of every object in a JSON-encoded list.

    Parameters
    ----------
    x : str
        JSON text expected to decode to a list of objects that each carry a
        ``"name"`` key, e.g. ``'[{"id": 1, "name": "Action"}]'``.

    Returns
    -------
    list
        The ``name`` values in their original order. An empty list is
        returned when ``x`` is not valid JSON, is not a string at all
        (``None``, ``NaN``), does not decode to an iterable of mappings, or
        when any entry lacks a ``"name"`` key.
    """
    try:
        return [d['name'] for d in json.loads(x)]
    # Only the failures that malformed input can produce are treated as
    # "no names": TypeError (non-string input, non-iterable / non-mapping
    # payload), ValueError (includes json.JSONDecodeError) and KeyError
    # (entry without "name"). KeyboardInterrupt and unrelated errors
    # propagate instead of being silently swallowed.
    except (TypeError, ValueError, KeyError):
        return []

def extract_director(x):
    """Return the name of the first crew member whose job is ``'Director'``.

    Parameters
    ----------
    x : str
        JSON text expected to decode to a list of objects that carry
        ``"job"`` and ``"name"`` keys.

    Returns
    -------
    str
        The director's name, or ``''`` when no entry has the job
        ``'Director'`` or when ``x`` cannot be parsed / has an unexpected
        structure. The result is always a string, never ``None``.
    """
    try:
        for member in json.loads(x):
            if member['job'] == 'Director':
                return member['name']
    # TypeError: non-string input or non-mapping entries; ValueError covers
    # json.JSONDecodeError; KeyError: entry without "job"/"name".
    except (TypeError, ValueError, KeyError):
        return ''
    # The crew list parsed fine but contained no director.
    return ''

# Replace each raw column with the list of names produced by extract_names.
for json_col in ('genres', 'keywords', 'production_companies'):
    df[json_col] = df[json_col].apply(extract_names)

# For the cast, only the first three names of each row are kept.
df['cast'] = df['cast'].apply(lambda raw_cast: extract_names(raw_cast)[:3])

# The director is derived from the crew column and stored in its own column.
df['director'] = df['crew'].apply(extract_director)


In [None]:
# Build one space-separated text field per movie ("soup") from the list
# columns, in this order, followed by the director (missing -> '').
soup_list_cols = ['genres', 'keywords', 'production_companies', 'cast']

soup = df[soup_list_cols[0]].apply(' '.join)
for list_col in soup_list_cols[1:]:
    soup = soup + ' ' + df[list_col].apply(' '.join)

df['soup'] = soup + ' ' + df['director'].fillna('')


In [None]:
# Map each distinct original_language value to an integer code.
le = LabelEncoder()
language_codes = le.fit_transform(df['original_language'])
df['original_language'] = language_codes

# Parse the release date; values that cannot be parsed become NaT.
release = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = release

# Calendar features derived from the parsed date.
df['release_year'] = release.dt.year
df['release_month'] = release.dt.month
# dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 means Saturday or Sunday.
df['is_weekend_release'] = release.dt.dayofweek.ge(5).astype(int)


In [None]:
# A movie is labelled successful when revenue exceeds this multiple of budget.
SUCCESS_REVENUE_RATIO = 1.5

# Keep only rows with strictly positive budget and revenue. The explicit
# .copy() makes the filtered frame independent of the original, so the
# column assignment below does not trigger pandas' SettingWithCopyWarning.
has_financials = (df['budget'] > 0) & (df['revenue'] > 0)
df = df[has_financials].copy()

# Binary target: 1 when revenue > SUCCESS_REVENUE_RATIO * budget, else 0.
df['success'] = (df['revenue'] > SUCCESS_REVENUE_RATIO * df['budget']).astype(int)


In [None]:
# Reset to a clean 0..n-1 index BEFORE deriving anything from the frame, so
# that `y`, `row_indices` and the rows of `X` all share the same labels.
# (Previously `y` was taken before the reset and kept the stale index left
# over from earlier row filtering, which would misalign any label-based
# lookup of `y` with `row_indices`.)
df = df.reset_index(drop=True)
row_indices = df.index

# Bag-of-words counts over the 'soup' text, limited to 5000 terms with
# English stop words removed, converted to a dense array.
# NOTE(review): the vectorizer and the scaler below are fit on ALL rows,
# and the train/test split happens afterwards, so test-set statistics leak
# into the features. Consider fitting them on the training rows only.
cv = CountVectorizer(max_features=5000, stop_words='english')
soup_matrix = cv.fit_transform(df['soup']).toarray()

# Numeric features; missing values are replaced with 0 before scaling.
num_cols = ['budget', 'popularity', 'runtime', 'original_language', 'release_year', 'release_month', 'is_weekend_release']
X_num = df[num_cols].fillna(0)

scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

# Final design matrix: scaled numeric columns followed by the term counts.
X = np.hstack((X_num_scaled, soup_matrix))
y = df['success']


In [None]:
# 80/20 split, stratified on the target, with a fixed seed. The row indices
# are split alongside X and y so test rows can be looked up in df later.
split_parts = train_test_split(
    X, y, row_indices,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test, idx_train, idx_test = split_parts

# Random forest with 100 trees and a fixed seed; fit() returns the estimator.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)


In [None]:
# Predict on the held-out rows and report the standard classification metrics.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("🎯 Accuracy:", test_accuracy)
print("\n📋 Classification Report:\n", report_text)

# Confusion matrix as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Put title, true label and predicted label side by side for the test rows.
# Both Series are re-indexed to 0..n-1 so they line up with the y_pred array.
titles_test = df.loc[idx_test, 'title_x'].reset_index(drop=True)
actual_test = y_test.reset_index(drop=True)

comparison_df = pd.DataFrame({
    'Movie Title': titles_test,
    'Actual': actual_test,
    'Predicted': y_pred,
})

# Boolean mask of rows where the prediction matches the true label.
is_correct = comparison_df['Actual'] == comparison_df['Predicted']

wrong_preds = comparison_df[~is_correct]
print("❌ Wrong predictions:")
print(wrong_preds.head(10))

correct = is_correct.sum()
total = len(comparison_df)
print(f"✅ Correct Predictions: {correct}/{total} ({100 * correct/total:.2f}%)")

# Count of each actual class, split by the predicted class.
ax = sns.countplot(data=comparison_df, x='Actual', hue='Predicted')
ax.set_title("Actual vs Predicted Class Distribution")
plt.show()
