Load the Data

In [6]:
import pandas as pd
df = pd.read_csv("N:/Interships/Future Intern ML/Datasets/boxoffice.csv")

In [8]:
# basic information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2694 entries, 0 to 2693
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             2694 non-null   object
 1   domestic_revenue  2694 non-null   int64 
 2   world_revenue     2694 non-null   int64 
 3   distributor       2694 non-null   object
 4   opening_revenue   2694 non-null   int64 
 5   opening_theaters  2694 non-null   int64 
 6   budget            2694 non-null   int64 
 7   MPAA              2694 non-null   object
 8   genres            2694 non-null   object
 9   release_days      2694 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 210.6+ KB


In [10]:
# show first few rows
df.head()

Unnamed: 0,title,domestic_revenue,world_revenue,distributor,opening_revenue,opening_theaters,budget,MPAA,genres,release_days
0,The Avengers,6026491,1274885664,Warner Bros.,163620146,253,174687337,R,Animation,16
1,Titanic,169411543,1132871091,Disney,85549990,122,103948486,G,Action,103
2,Jurassic Park,107836098,583329845,Sony,55681429,3826,122104991,NC-17,Horror,89
3,Avatar,51433697,1225323391,Disney,109775324,3868,46431596,G,Horror,85
4,The Lion King,142791649,604140729,Warner Bros.,59476800,2934,203513696,R,Comedy,158


Data Cleaning

In [16]:
#pre-process the data
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
# Drop unnecessary columns
df = df.drop(columns=["title", "distributor"], errors="ignore")

# Fill missing values separately for categorical and numerical columns
for col in df.columns:
    if df[col].dtype == "object":  # Categorical columns
        df[col] = df[col].fillna(df[col].mode()[0])  # Fill with most frequent value
    else:  # Numeric columns
        df[col] = df[col].fillna(df[col].mean())  # Fill with mean

# Convert categorical columns using Label Encoding
label_cols = ["MPAA", "genres"]
for col in label_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Feature Engineering: Create ROI (Return on Investment)
df["ROI"] = (df["world_revenue"] - df["budget"]) / df["budget"]

# Apply log transformation to revenue and budget to reduce variance
df["world_revenue"] = np.log1p(df["world_revenue"])
df["budget"] = np.log1p(df["budget"])
df["ROI"] = np.log1p(df["ROI"])  # Log transform ROI too

# Remove Outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

Seperating dataset into features and labels

In [20]:
X = df.drop(columns=["world_revenue"], errors="ignore")
y = df["world_revenue"]
print(X, y)

      domestic_revenue  opening_revenue  opening_theaters     budget  MPAA  \
0              6026491        163620146               253  18.978508     4   
1            169411543         85549990               122  18.459406     0   
2            107836098         55681429              3826  18.620392     1   
3             51433697        109775324              3868  17.653491     0   
4            142791649         59476800              2934  19.131244     4   
...                ...              ...               ...        ...   ...   
2687          24564880        163608027              2929  18.322214     3   
2688          66982352         17715123               837  18.386734     3   
2690          63305093        190634982              3171  18.527184     3   
2691         271758510        112771730              2450  18.537260     1   
2692         268259149        194172443               661  18.964340     0   

      genres  release_days       ROI  
0          1            

Splitting data into Training & Testing Sets

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Training the model

In [32]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
# XGBoost Model
model = XGBRegressor(
    n_estimators=1000,  
    learning_rate=0.01,  
    max_depth=10,        
    subsample=0.9,      
    colsample_bytree=0.9, 
    reg_lambda=2,  
    random_state=42
)

model.fit(X_train, y_train)



Evaluation

In [34]:
y_pred = model.predict(X_test)
print("Model Accuracy (R² Score):", r2_score(y_test, y_pred))


Model Accuracy (R² Score): 0.9864075671834946
