In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [6]:
# cell 2: naive bayes class
class NaiveBayes:
    """Categorical Naive Bayes classifier with add-one (Laplace) smoothing.

    Every feature column is treated as a discrete category: likelihoods are
    estimated by counting exact value matches per class.

    Attributes:
        class_priors: dict mapping class label -> fraction of training rows
            with that label.
        feature_probs: nested dict ``{class: {feature_idx: {value: prob}}}``
            holding the smoothed per-class likelihood of each feature value.
        classes: sorted array of distinct labels seen in ``fit`` (``None``
            until ``fit`` has been called).
    """

    # Likelihood used for a feature value that was never seen during fit.
    _UNSEEN_LIKELIHOOD = 1e-6

    def __init__(self):
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = None

    def fit(self, X, y):
        """Estimate class priors and smoothed per-feature likelihoods.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Start from a clean slate so that refitting on different labels does
        # not leave stale priors/likelihoods from a previous fit behind.
        self.classes = np.unique(y)
        self.class_priors = {}
        self.feature_probs = {}

        # The set of distinct values of a feature does not depend on the
        # class, so compute it once per feature instead of once per class.
        values_per_feature = [np.unique(X[:, j]) for j in range(n_features)]

        for cls in self.classes:
            X_c = X[y == cls]
            total = X_c.shape[0]
            self.class_priors[cls] = total / n_samples
            self.feature_probs[cls] = {}

            for feature_idx, feature_values in enumerate(values_per_feature):
                column = X_c[:, feature_idx]
                # Add-one smoothing: one pseudo-count per distinct value.
                denominator = total + len(feature_values)
                self.feature_probs[cls][feature_idx] = {
                    value: (np.sum(column == value) + 1) / denominator
                    for value in feature_values
                }

    def predict_proba(self, X):
        """Return posterior class probabilities for each row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Array of shape (n_samples, n_classes); each row sums to 1 and the
            columns follow the order of ``self.classes``.
        """
        X = np.asarray(X)
        n_samples, n_features = X.shape
        log_joint = np.zeros((n_samples, len(self.classes)))

        for idx, sample in enumerate(X):
            for cls_idx, cls in enumerate(self.classes):
                log_prob = np.log(self.class_priors[cls])
                per_feature = self.feature_probs[cls]

                for feature_idx in range(n_features):
                    likelihood = per_feature[feature_idx].get(
                        sample[feature_idx], self._UNSEEN_LIKELIHOOD
                    )
                    log_prob += np.log(likelihood)

                log_joint[idx, cls_idx] = log_prob

        # Shift each row by its maximum before exponentiating (log-sum-exp
        # trick). Without the shift, very negative log scores underflow to 0
        # for every class and the normalisation becomes 0/0 = NaN.
        log_joint -= log_joint.max(axis=1, keepdims=True)
        probabilities = np.exp(log_joint)
        probabilities /= probabilities.sum(axis=1, keepdims=True)
        return probabilities

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        probabilities = self.predict_proba(X)
        return self.classes[np.argmax(probabilities, axis=1)]

In [None]:

# Location of the raw movie dataset, relative to the notebook's working directory.
csv_file = "../30movies_dataset.csv"

# Read the whole CSV into a DataFrame; the cells below work on `df`.
df = pd.read_csv(csv_file)
print("Dataset loaded successfully!")


MOVIE SUCCESS PREDICTION USING NAIVE BAYES
Dataset loaded successfully!


In [10]:
# Build the title vocabulary: lowercase each title, split it on whitespace,
# and gather every resulting token into a set of unique words.
vocab = {
    word
    for title in df['Title']
    for word in title.lower().split()
}
vocab_size = len(vocab)

print("VOCABULARY STATISTICS (UNIQUE WORDS IN TITLES)")
print(f"Vocabulary size: {vocab_size}")
# Set iteration order is arbitrary, so this sample is not a stable selection.
print(f"Sample words: {list(vocab)[:20]} ...")
print(f"Total records: {len(df)}")
print(f"\nColumns: {list(df.columns)}")

VOCABULARY STATISTICS (UNIQUE WORDS IN TITLES)
Vocabulary size: 57
Sample words: ['rises', 'dune', 'maverick', 'amadeus', 'road', 'whispering', 'water', 'knight', 'the', 'wall', 'last', 'laughing', 'street', 'get', 'max:', 'eternal', 'blue', 'dark', 'iron', 'gravity'] ...
Total records: 30

Columns: ['Title', 'Year', 'Genre', 'Director', 'Lead Actor', 'Production Company', 'Runtime (min)', 'Country of Origin', 'Original Language', 'Gross Revenue (million)', 'Success']


In [13]:
# Cell 5: derive the binary Success label and drop incomplete rows.
# A movie counts as successful when its gross revenue is at least the mean
# revenue; the mean is taken before any rows are dropped.
mean_revenue = df['Gross Revenue (million)'].mean()
df['Success'] = (df['Gross Revenue (million)'] >= mean_revenue).astype(int)

# Discard every row that has a missing value in any column.
df_original_size = len(df)
df = df.dropna()

n_movies = len(df)
n_successful = df['Success'].sum()
n_unsuccessful = n_movies - n_successful

print(f"\nRows after removing missing values: {n_movies} (removed {df_original_size - n_movies})")
print(f"Total movies: {n_movies}")
print(f"Successful movies (revenue >= mean): {n_successful} ({n_successful/n_movies*100:.1f}%)")
print(f"Unsuccessful movies (revenue < mean): {n_unsuccessful} ({n_unsuccessful/n_movies*100:.1f}%)")
print(f"Mean revenue: ${mean_revenue:,.2f}")


Rows after removing missing values: 30 (removed 0)
Total movies: 30
Successful movies (revenue >= mean): 9 (30.0%)
Unsuccessful movies (revenue < mean): 21 (70.0%)
Mean revenue: $416.58


In [14]:
def _build_label_encoder(frame, column, id_column):
    """Assign consecutive integer ids to the distinct values of ``column``.

    Ids follow the order of first appearance in ``frame``.

    Args:
        frame: DataFrame containing ``column``.
        column: name of the categorical column to encode.
        id_column: name given to the id column in the lookup table.

    Returns:
        (table, mapping): ``table`` is a DataFrame with one row per distinct
        value plus its id in ``id_column``; ``mapping`` is a dict from value
        to id.
    """
    table = frame[[column]].drop_duplicates().reset_index(drop=True)
    table[id_column] = table.index
    mapping = dict(zip(table[column], table[id_column]))
    return table, mapping


# One lookup table and one value -> id dict per categorical column. The same
# helper replaces six copy-pasted stanzas; names and printed lines are unchanged.
label_encoder_genre, genre_to_id = _build_label_encoder(df, 'Genre', 'GenreId')
print(f"Unique Genres: {len(genre_to_id)}")

label_encoder_director, director_to_id = _build_label_encoder(df, 'Director', 'DirectorId')
print(f"Unique Directors: {len(director_to_id)}")

label_encoder_actor, actor_to_id = _build_label_encoder(df, 'Lead Actor', 'ActorId')
print(f"Unique Lead Actors: {len(actor_to_id)}")

label_encoder_production, production_to_id = _build_label_encoder(df, 'Production Company', 'ProductionId')
print(f"Unique Production Companies: {len(production_to_id)}")

label_encoder_country, country_to_id = _build_label_encoder(df, 'Country of Origin', 'CountryId')
print(f"Unique Countries: {len(country_to_id)}")

label_encoder_language, language_to_id = _build_label_encoder(df, 'Original Language', 'LanguageId')
print(f"Unique Languages: {len(language_to_id)}")


Unique Genres: 26
Unique Directors: 24
Unique Lead Actors: 27
Unique Production Companies: 21
Unique Countries: 13
Unique Languages: 4


In [None]:
# Cell 7: applying the encodings to the dataframe.
# Each entry is (new id column, source text column, value -> id mapping).
encoding_plan = [
    ('GenreId', 'Genre', genre_to_id),
    ('DirectorId', 'Director', director_to_id),
    ('ActorId', 'Lead Actor', actor_to_id),
    ('ProductionId', 'Production Company', production_to_id),
    ('CountryId', 'Country of Origin', country_to_id),
    ('LanguageId', 'Original Language', language_to_id),
]
for id_column, source_column, mapping in encoding_plan:
    df[id_column] = df[source_column].map(mapping)

# Persist the encoded dataset (without the index) in the working directory.
df.to_csv('processed_movies_data.csv', index=False)
print("\nProcessed data saved to: 'processed_movies_data.csv'")


Processed data saved to: 'processed_movies_data.csv'


In [18]:
# Show the first ten movies with the columns most relevant to the label.
print("DATA PREVIEW")
preview_columns = ['Title', 'Year', 'Genre', 'Director', 'Success', 'Gross Revenue (million)']
print(df[preview_columns].head(10))

# Model inputs: the encoded id columns plus Year and Runtime, all passed to the
# classifier as plain values; the target is the binary Success label.
feature_columns = ['Year', 'GenreId', 'DirectorId', 'ActorId', 'ProductionId', 'Runtime (min)', 'CountryId', 'LanguageId']
X = df[feature_columns].to_numpy()
y = df['Success'].to_numpy()

# Train-test split: hold out 20% of the rows, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified, so the class balance of the small
# test set may differ from the full data; consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_rows = len(X)
print("TRAIN-TEST SPLIT")
print(f"Training samples: {len(X_train)} ({len(X_train)/n_rows*100:.1f}%)")
print(f"Testing samples: {len(X_test)} ({len(X_test)/n_rows*100:.1f}%)")
print(f"Number of features: {X_train.shape[1]}")

DATA PREVIEW
                      Title  Year                  Genre  \
0  The Grand Budapest Hotel  2014           Comedy-Drama   
1                  Parasite  2019  Thriller, Dark Comedy   
2                      1917  2019             War, Drama   
3        Mad Max: Fury Road  2015         Action, Sci-Fi   
4                La La Land  2016       Musical, Romance   
5                   Get Out  2017       Horror, Thriller   
6              The Revenant  2015       Adventure, Drama   
7                      Coco  2017   Animation, Adventure   
8                      Dune  2021      Sci-Fi, Adventure   
9        The Shape of Water  2017         Fantasy, Drama   

                Director  Success  Gross Revenue (million)  
0           Wes Anderson        0                    174.8  
1           Bong Joon-ho        0                    258.8  
2             Sam Mendes        0                    384.9  
3          George Miller        0                    378.9  
4        Damien Chaze

In [19]:

# Fit the hand-written Naive Bayes classifier on the training split.
model = NaiveBayes()
model.fit(X_train, y_train)

# Class probabilities and hard labels for the held-out rows.
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)

# Headline metrics; precision/recall/F1 use the metric functions' default settings.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("MODEL PERFORMANCE METRICS")
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Rows of the confusion matrix are actual classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print("CONFUSION MATRIX")
print(conf_matrix)
print("\nInterpretation:")
print(f"True Negatives (Correctly predicted unsuccessful): {conf_matrix[0, 0]}")
print(f"False Positives (Incorrectly predicted successful): {conf_matrix[0, 1]}")
print(f"False Negatives (Incorrectly predicted unsuccessful): {conf_matrix[1, 0]}")
print(f"True Positives (Correctly predicted successful): {conf_matrix[1, 1]}")

MODEL PERFORMANCE METRICS
Accuracy: 0.6667 (66.67%)
Precision: 0.3333
Recall: 1.0000
F1 Score: 0.5000
CONFUSION MATRIX
[[3 2]
 [0 1]]

Interpretation:
True Negatives (Correctly predicted unsuccessful): 3
False Positives (Incorrectly predicted successful): 2
False Negatives (Incorrectly predicted unsuccessful): 0
True Positives (Correctly predicted successful): 1


In [None]:
# Per-class precision / recall / F1 on the test split (printed once).
print("CLASSIFICATION REPORT")
class_report = classification_report(y_test, y_pred, target_names=['Not Successful', 'Successful'])
print(class_report)

# Reverse lookups (id -> original text) so predictions can be shown with
# human-readable category names.
id_to_genre = {v: k for k, v in genre_to_id.items()}
id_to_director = {v: k for k, v in director_to_id.items()}
id_to_actor = {v: k for k, v in actor_to_id.items()}
id_to_production = {v: k for k, v in production_to_id.items()}
id_to_country = {v: k for k, v in country_to_id.items()}
id_to_language = {v: k for k, v in language_to_id.items()}

# Rebuild a frame from the test features and attach the model outputs.
df_test = pd.DataFrame(X_test, columns=feature_columns)
df_test['success_probability'] = y_pred_proba[:, 1]  # column 1 = second class in model.classes
df_test['predicted_success'] = y_pred
df_test['actual_success'] = y_test

# Decode the id columns back to their original labels.
df_test['Genre'] = df_test['GenreId'].map(id_to_genre)
df_test['Director'] = df_test['DirectorId'].map(id_to_director)
df_test['Lead Actor'] = df_test['ActorId'].map(id_to_actor)
df_test['Production Company'] = df_test['ProductionId'].map(id_to_production)
df_test['Country'] = df_test['CountryId'].map(id_to_country)
df_test['Language'] = df_test['LanguageId'].map(id_to_language)

results_df = df_test[['Year', 'Genre', 'Director', 'Lead Actor', 'Production Company', 'Runtime (min)',
                       'Country', 'Language', 'success_probability', 'predicted_success', 'actual_success']].copy()

# 1 when the predicted label matches the actual label, else 0.
results_df['correct_prediction'] = (results_df['predicted_success'] == results_df['actual_success']).astype(int)

# Actually write the file that the message below announces; previously the
# message was printed but nothing was saved.
results_df.to_csv('movie_success_predictions_naive_bayes.csv', index=False)
print("All predictions saved to: 'movie_success_predictions_naive_bayes.csv'")



CLASSIFICATION REPORT
                precision    recall  f1-score   support

Not Successful       1.00      0.60      0.75         5
    Successful       0.33      1.00      0.50         1

      accuracy                           0.67         6
     macro avg       0.67      0.80      0.62         6
  weighted avg       0.89      0.67      0.71         6

                precision    recall  f1-score   support

Not Successful       1.00      0.60      0.75         5
    Successful       0.33      1.00      0.50         1

      accuracy                           0.67         6
     macro avg       0.67      0.80      0.62         6
  weighted avg       0.89      0.67      0.71         6

All predictions saved to: 'movie_success_predictions_naive_bayes.csv'


In [None]:

print("PREDICTION ANALYSIS")

# Boolean masks reused for the subsets below.
is_predicted_success = results_df['predicted_success'] == 1
is_actual_success = results_df['actual_success'] == 1
is_correct = results_df['correct_prediction'] == 1

successful_predicted = results_df[is_predicted_success]
successful_actual = results_df[is_actual_success]
correct_predictions = results_df[is_correct]

n_total = len(results_df)
n_correct = len(correct_predictions)
n_incorrect = n_total - n_correct

print(f"Total test samples: {n_total}")
print(f"Predicted Successful: {len(successful_predicted)}")
print(f"Actually Successful: {len(successful_actual)}")
print(f"Correct Predictions: {n_correct} ({n_correct/n_total*100:.1f}%)")
print(f"Incorrect Predictions: {n_incorrect} ({n_incorrect/n_total*100:.1f}%)")

# True positives: predicted successful and actually successful.
correct_successful = results_df[is_predicted_success & is_actual_success]
print(f"Correctly Predicted as Successful: {len(correct_successful)}")

# Up to ten test rows with the highest predicted success probability.
top_predictions = results_df.nlargest(10, 'success_probability')
summary_columns = ['Year', 'Genre', 'Director', 'Lead Actor', 'success_probability',
                   'predicted_success', 'actual_success', 'correct_prediction']
print(top_predictions[summary_columns].to_string())

top_predictions.to_csv('top_10_success_predictions.csv', index=False)
print("\nTop 10 predictions saved to: 'top_10_success_predictions.csv'")

# Split the results by whether the prediction matched the actual label.
correct_pred_df = results_df[results_df['correct_prediction'] == 1]
incorrect_pred_df = results_df[results_df['correct_prediction'] == 0]

# Mean predicted success probability within each group (skipped when empty).
if len(correct_pred_df) == 0:
    print("No correct predictions found")
else:
    print(f"Average probability for CORRECT predictions: {correct_pred_df['success_probability'].mean():.4f}")

if len(incorrect_pred_df) == 0:
    print("No incorrect predictions found")
else:
    print(f"Average probability for INCORRECT predictions: {incorrect_pred_df['success_probability'].mean():.4f}")

correct_pred_df.to_csv('correct_predictions.csv', index=False)
incorrect_pred_df.to_csv('incorrect_predictions.csv', index=False)

# Summary of the output files this notebook refers to.
for summary_line in (
    "1. processed_movies_data.csv - Processed dataset with encodings",
    "2. movie_success_predictions_naive_bayes.csv - All predictions",
    "3. top_10_success_predictions.csv - Top 10 most likely successful movies",
    "4. correct_predictions.csv - Correctly predicted movies",
    "5. incorrect_predictions.csv - Incorrectly predicted movies",
):
    print(summary_line)


PREDICTION ANALYSIS
Total test samples: 6
Predicted Successful: 3
Actually Successful: 1
Correct Predictions: 4 (66.7%)
Incorrect Predictions: 2 (33.3%)
Correctly Predicted as Successful: 1
   Year              Genre            Director         Lead Actor  success_probability  predicted_success  actual_success  correct_prediction
1  2012   Action, Thriller   Christopher Nolan     Christian Bale             0.921640                  1               1                   1
2  2018            Fantasy  Guillermo del Toro      Maribel Verdú             0.583333                  1               0                   0
3  2014       Drama, Music     Damien Chazelle       Miles Teller             0.536766                  1               0                   0
5  2017     Fantasy, Drama  Guillermo del Toro      Sally Hawkins             0.424084                  0               0                   1
0  2014          Biography         James Marsh       Kate Winslet             0.260526              