In [1]:
# outline the process of building a predictive model to forecast the box office success of movies using regression techniques.
# Here's a step-by-step guide:

# 1. Data Understanding and Preparation:
# Explore the Data: Understand the features and their distributions, check for missing values, and identify any outliers.
# Feature Selection: Identify relevant features for predicting box office success.
# Features like genre, budget, release date, director, etc., seem pertinent.
# Data Cleaning: Handle missing values, outliers, and inconsistencies in the dataset.
# Feature Engineering: Create new features if necessary, like extracting month or season from release date.

# 2. Model Selection:
# Choose regression algorithms suitable for the task. For example, Linear Regression, Random Forest Regression, or Gradient Boosting Regression.
# Split the data into training and testing sets to evaluate the model's performance.

# 3. Model Training:
# Train the chosen regression models on the training dataset using selected features.
# Perform cross-validation to tune hyperparameters if necessary, to avoid overfitting.

# 4. Model deployment:
# Run the model and see performance with sample movies

# 5. Model Interpretation and Deployment::
# save the model so in future can do the following :
# model evaluation :using appropriate metrics such as Mean Squared Error (MSE), Root Mean Squared Error (RMSE), or R-squared (R2).
# Compare the performance of different models to select the best-performing one.
# Interpret the results to understand which features have the most significant impact on box office success.
# Deploy the chosen model for making predictions on new data

In [46]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import pickle

In [3]:
# load Data
from google.colab import files
uploaded = files.upload()

Saving movies.csv to movies.csv


In [4]:
# Load the dataset
data = pd.read_csv('movies.csv')

In [7]:
# view the data
data.head(3)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0


In [9]:
# understand the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


In [29]:
# continue exploring data
data.describe()

Unnamed: 0,year,score,votes,budget,gross,runtime
count,7668.0,7665.0,7665.0,5497.0,7479.0,7664.0
mean,2000.405451,6.390411,88108.5,35589880.0,78500540.0,107.261613
std,11.153508,0.968842,163323.8,41457300.0,165725100.0,18.581247
min,1980.0,1.9,7.0,3000.0,309.0,55.0
25%,1991.0,5.8,9100.0,10000000.0,4532056.0,95.0
50%,2000.0,6.5,33000.0,20500000.0,20205760.0,104.0
75%,2010.0,7.1,93000.0,45000000.0,76016690.0,116.0
max,2020.0,9.3,2400000.0,356000000.0,2847246000.0,366.0


In [5]:
# Feature selection
features = ['genre', 'budget', 'director', 'runtime']

In [11]:
# Data preprocessing
X = pd.get_dummies(data[features])  # One-hot encoding for categorical variables
y = data['gross']

In [13]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Check for missing values in the target variable
missing_values_indices_y = y_train.index[y_train.isnull()]
if not missing_values_indices_y.empty:
    X_train_imputed = X_train.drop(missing_values_indices_y)
    y_train_imputed = y_train.drop(missing_values_indices_y)
else:
    X_train_imputed = X_train
    y_train_imputed = y_train

# Check for missing values in the imputed target variable
missing_values_indices_y_imputed = y_train_imputed.index[y_train_imputed.isnull()]
if not missing_values_indices_y_imputed.empty:
    X_train_imputed = X_train_imputed.drop(missing_values_indices_y_imputed)
    y_train_imputed = y_train_imputed.drop(missing_values_indices_y_imputed)


In [26]:
# Handle missing values in features
feature_imputer = SimpleImputer(strategy='mean')
X_train_imputed = feature_imputer.fit_transform(X_train_imputed)
X_test_imputed = feature_imputer.transform(X_test)


In [27]:
# Model training
model = RandomForestRegressor(random_state=42)
model.fit(X_train_imputed, y_train_imputed)

In [37]:
# Choose a few sample movies for prediction
sample_movies = X_test_imputed[:3]  # Selecting the first three movies from the test set for demonstration

# Make predictions for the sample movies
sample_predictions = model.predict(sample_movies)

# Display the sample movie features and their predicted gross earnings
# Print sample movie predictions
# Print sample movie predictions
# Print sample movie predictions
# Print sample movie predictions
# Print sample movie predictions
# Print sample movie predictions
for i, movie_features in enumerate(sample_movies):
    print(f"Sample Movie {i+1}:")
    print("Features:")
    # Truncate feature names and display only relevant information
    movie_features_shortened = {features[j][:30]: movie_features[j] for j in range(len(features))}
    print(pd.DataFrame([movie_features_shortened]).to_string(index=False))  # Displaying the shortened features of the sample movie without row index
    # Displaying the predicted gross earnings in a concise format
    print(f"\nPredicted Gross Earnings: ${sample_predictions[i]:,.2f}\n")  # Displaying the predicted gross earnings with commas for thousands separator and two decimal places


Sample Movie 1:
Features:
    genre  budget  director  runtime
6900000.0   112.0       0.0      0.0

Predicted Gross Earnings: $16,610,288.88

Sample Movie 2:
Features:
       genre  budget  director  runtime
3.608534e+07   112.0       0.0      0.0

Predicted Gross Earnings: $1,006,633.46

Sample Movie 3:
Features:
       genre  budget  director  runtime
3.608534e+07    96.0       0.0      0.0

Predicted Gross Earnings: $1,668,205.55



In [38]:
# the print out would require director name, runtime and movie name can be added . The Main aim here was to do the model
# Save the model using pickle
with open('movie_box_office_model.pkl', 'wb') as f:
    pickle.dump(model, f)
