In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt

# --- Data loading ------------------------------------------------------------
# The CSV is decoded as ISO-8859-1 (Latin-1).
file_path = r"C:\Users\Hp\Desktop\BUS Assignments\capstone\IMDB Top 250 Movies.csv"
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# --- Categorical encoding ----------------------------------------------------
# One LabelEncoder per text column. The encoders are kept at module level
# because predict_box_office() reuses them to encode user-supplied names.
le_cast = LabelEncoder()
le_director = LabelEncoder()
le_writer = LabelEncoder()

for column_name, encoder in (
    ('casts', le_cast),
    ('directors', le_director),
    ('writers', le_writer),
):
    # Replace each text column in place with its integer codes.
    df[column_name] = encoder.fit_transform(df[column_name])

# --- Feature matrix and target -----------------------------------------------
features = ['casts', 'directors', 'writers', 'budget', 'rating', 'run_time']
X = df[features]
y = df['box_office']

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model fitting -----------------------------------------------------------
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# --- Hold-out evaluation -----------------------------------------------------
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
for metric_label, metric_value in (
    ("RMSE (Root Mean Squared Error)", rmse),
    ("MAE (Mean Absolute Error)", mae),
    ("R2 Score (Coefficient of Determination)", r2),
):
    print(f"  {metric_label}: {metric_value:.2f}")

# Fixed explanatory text shown after the metrics, printed one line at a time.
report_lines = (
    "\nExplanation of Performance Metrics:",
    "The metrics above provide insights into the performance of the model:",
    "- RMSE (Root Mean Squared Error): Indicates the average magnitude of the prediction errors. A lower RMSE suggests better model performance, but it can be influenced by outliers.",
    "- MAE (Mean Absolute Error): Measures the average absolute errors in the predictions. It provides a more intuitive understanding of the model's accuracy, with lower values indicating better performance.",
    "- R2 Score (Coefficient of Determination): Represents the proportion of variance in the dependent variable that is predictable from the independent variables. A higher R2 indicates a better fit of the model to the data.",
    "\nNote on Model Performance:",
    "Predicting movie box office success is inherently complex due to several factors:",
    "- Market Dynamics: The movie industry is influenced by various unpredictable elements such as audience preferences, competition, and marketing strategies.",
    "- Data Limitations: The dataset may not capture all relevant factors that contribute to a movie's success, such as social media buzz, critical reviews, and regional variations.",
    "- Feature Limitations: Despite including key features, the model may not fully account for the nuanced impact of each feature on box office performance.",
    "The high error figures reflect these challenges and uncertainties. While the model provides estimates, they should be interpreted with caution, acknowledging the unpredictability of the movie industry.",
)
for report_line in report_lines:
    print(report_line)

# --- Feature importance ------------------------------------------------------
# Pair every feature name with the importance the fitted forest assigned it.
importances = model.feature_importances_
feature_importances = {
    name: weight for name, weight in zip(features, importances)
}
print("\nFeature Importances:")
for name, weight in feature_importances.items():
    print(f"{name}: {weight:.4f}")

# --- Cross-validation --------------------------------------------------------
# NOTE: this runs on the unscaled X/y, not on the scaled training split.
# The scorer returns negated MSE, so negate before taking the square root.
scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)
rmse_scores = np.sqrt(-scores)
print("\nCross-Validation RMSE Scores:", rmse_scores)
print("Mean RMSE from Cross-Validation:", rmse_scores.mean())

# Interactive segment
def _encode_name(encoder, name, field):
    """Map *name* to the integer code that *encoder* learned for it.

    Surrounding whitespace is removed first, because the encoder matches
    labels exactly and would otherwise reject a known name such as
    ``' John G. Avildsen'`` as unseen.

    Raises:
        ValueError: if the cleaned name is not one of the encoder's labels.
    """
    cleaned = name.strip() if isinstance(name, str) else name
    try:
        return encoder.transform([cleaned])[0]
    except ValueError as err:
        raise ValueError(
            f"Unknown {field}: {cleaned!r} does not appear in the training data"
        ) from err


def predict_box_office(cast, director, writer, budget, rating, run_time):
    """Estimate box office earnings for a single movie.

    Args:
        cast: Lead cast name; must match a value seen in the 'casts' column.
        director: Director name; must match a value seen in 'directors'.
        writer: Writer name; must match a value seen in 'writers'.
        budget: Production budget (numeric).
        rating: Movie rating (numeric).
        run_time: Runtime in minutes (numeric).

    Returns:
        The model's predicted value for the 'box_office' target.

    Raises:
        ValueError: if any of the three names is unknown to its encoder.
    """
    # Encode categorical variables with the encoders fitted on the dataset
    cast_encoded = _encode_name(le_cast, cast, "cast")
    director_encoded = _encode_name(le_director, director, "director")
    writer_encoded = _encode_name(le_writer, writer, "writer")

    # Build a one-row frame with the same column order used for training
    input_data = pd.DataFrame(
        [[cast_encoded, director_encoded, writer_encoded, budget, rating, run_time]],
        columns=features,
    )

    # Apply the same scaling the model was trained with
    input_data_scaled = scaler.transform(input_data)

    # Predict and unwrap the single result
    return model.predict(input_data_scaled)[0]

# Prompt for movie details and print a prediction until the user types 'quit'
# at one of the three name prompts.
while True:
    print("\nWelcome to Box Office Predictor!")
    print("Enter details below to predict box office earnings or type 'quit' to exit.")

    # Names are stripped before use: the label encoders match text exactly,
    # so a stray leading/trailing space would make a known name look unseen
    # and would also stop ' quit' from being recognised.
    cast = input("Enter Lead Cast: ").strip()
    if cast.lower() == 'quit':
        break

    director = input("Enter Director: ").strip()
    if director.lower() == 'quit':
        break

    writer = input("Enter Writer: ").strip()
    if writer.lower() == 'quit':
        break

    try:
        # float()/int() already tolerate surrounding whitespace
        budget = float(input("Enter Budget ($): "))
        rating = float(input("Enter Movie Rating: "))
        run_time = int(input("Enter Runtime (minutes): "))

        # Predict box office earnings
        estimated_earnings = predict_box_office(cast, director, writer, budget, rating, run_time)

        print(f"\nEstimated Box Office Earnings: ${estimated_earnings:.2f}")

    except ValueError as ve:
        # Bad numeric input or a name the encoders have never seen
        print(f"Error: {ve}")
    except Exception as e:
        # Top-level boundary of the interactive session: report and keep looping
        print(f"An unexpected error occurred: {e}")



Model Performance:
  RMSE (Root Mean Squared Error): 186596454.68
  MAE (Mean Absolute Error): 107876851.32
  R2 Score (Coefficient of Determination): 0.36

Explanation of Performance Metrics:
The metrics above provide insights into the performance of the model:
- RMSE (Root Mean Squared Error): Indicates the average magnitude of the prediction errors. A lower RMSE suggests better model performance, but it can be influenced by outliers.
- MAE (Mean Absolute Error): Measures the average absolute errors in the predictions. It provides a more intuitive understanding of the model's accuracy, with lower values indicating better performance.
- R2 Score (Coefficient of Determination): Represents the proportion of variance in the dependent variable that is predictable from the independent variables. A higher R2 indicates a better fit of the model to the data.

Note on Model Performance:
Predicting movie box office success is inherently complex due to several factors:
- Market Dynamics: The mov

Enter Lead Cast:  Sylvester Stallone
Enter Director:  John G. Avildsen
Enter Writer:  Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  5
Enter Runtime (minutes):  120



Estimated Box Office Earnings: $83375084.48

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:    Sylvester Stallone
Enter Director:   John G. Avildsen
Enter Writer:    Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  10
Enter Runtime (minutes):  120


Error: y contains previously unseen labels: '  Sylvester Stallone'

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:  Sylvester Stallone
Enter Director:   John G. Avildsen
Enter Writer:  Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  4
Enter Runtime (minutes):  12


Error: y contains previously unseen labels: ' John G. Avildsen'

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:  Toni Collette
Enter Director:  Adam Elliot
Enter Writer:  Adam Elliot
Enter Budget ($):  8200000
Enter Movie Rating:  10
Enter Runtime (minutes):  120



Estimated Box Office Earnings: $170302131.17

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:  Sylvester Stallone
Enter Director:  John G. Avildsen
Enter Writer:  Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  10
Enter Runtime (minutes):  120



Estimated Box Office Earnings: $47077870.64

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:  Sylvester Stallone
Enter Director:  John G. Avildsen
Enter Writer:  Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  7
Enter Runtime (minutes):  120



Estimated Box Office Earnings: $83375084.48

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:  Sylvester Stallone
Enter Director:  John G. Avildsen
Enter Writer:  Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  1
Enter Runtime (minutes):  120



Estimated Box Office Earnings: $83375084.48

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:  Sylvester Stallone
Enter Director:  John G. Avildsen
Enter Writer:  Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  0
Enter Runtime (minutes):  300



Estimated Box Office Earnings: $282928188.08

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.


Enter Lead Cast:  Sylvester Stallone
Enter Director:  John G. Avildsen
Enter Writer:  Sylvester Stallone
Enter Budget ($):  960000
Enter Movie Rating:  3
Enter Runtime (minutes):  3



Estimated Box Office Earnings: $69859874.41

Welcome to Box Office Predictor!
Enter details below to predict box office earnings or type 'quit' to exit.
