In [40]:
pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
                                              0.0/380.1 kB ? eta -:--:--
     --                                       20.5/380.1 kB ? eta -:--:--
     -----------                            112.6/380.1 kB 1.3 MB/s eta 0:00:01
     -------------------                    194.6/380.1 kB 1.3 MB/s eta 0:00:01
     ------------------------               245.8/380.1 kB 1.5 MB/s eta 0:00:01
     -------------------------------------- 380.1/380.1 kB 1.7 MB/s eta 0:00:00
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
                                              0.0/233.4 kB ? eta -:--:--
     -------------------------------------- 233.4/233.4 kB 7.0 MB/s eta 0:00:00
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
                              

In [41]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


In [42]:
df = pd.read_csv('transformed_movie_data6.csv')

In [43]:
# Handle date column if it exists
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df = df.drop('Date', axis=1)

In [53]:
# Replace infinite or very large values with a finite value
finite_value = 1  # Choose a finite value appropriate for your dataset
df.replace([np.inf, -np.inf], finite_value, inplace=True)

In [54]:
# Prepare the data for modeling
X = df.drop('Opening', axis=1)
y = df['Opening']

In [55]:
# Ensure all data types are suitable for XGBoost
for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
# Define objective function for Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 1.0),
        'enable_categorical': True,
    }

    model = XGBRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse

In [65]:
# Create study and optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)


[I 2024-05-17 18:30:08,506] A new study created in memory with name: no-name-fe303ae6-aba9-4c41-8f2a-a159d0171f57
[I 2024-05-17 18:30:08,629] Trial 0 finished with value: 0.12163071157318157 and parameters: {'n_estimators': 305, 'max_depth': 5, 'learning_rate': 0.014608333869675841, 'subsample': 0.6353816703506975, 'colsample_bytree': 0.8154894741547372, 'gamma': 0.3754501579700272, 'reg_alpha': 0.14174967992945275, 'reg_lambda': 0.5978016572612107}. Best is trial 0 with value: 0.12163071157318157.
[I 2024-05-17 18:30:08,684] Trial 1 finished with value: 0.10323855250512429 and parameters: {'n_estimators': 107, 'max_depth': 8, 'learning_rate': 0.0553980401733316, 'subsample': 0.7917041758232697, 'colsample_bytree': 0.6963473414591076, 'gamma': 0.18747377466719883, 'reg_alpha': 0.2511140970998818, 'reg_lambda': 0.6873266419421765}. Best is trial 1 with value: 0.10323855250512429.
[I 2024-05-17 18:30:08,778] Trial 2 finished with value: 0.10763142006530545 and parameters: {'n_estimators'