In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the Dataset
df = pd.read_excel('data/Coffee Shop Sales.xlsx')

# Step 2: Data Preprocessing
# Calculate Total Sales as (transaction_qty * unit_price)
df['Total Sales'] = df['transaction_qty'] * df['unit_price']

# Drop any unnecessary columns
df = df.drop(['transaction_id', 'transaction_date', 'transaction_time', 'product_detail'], axis=1)

# Convert categorical variables into dummy/indicator variables
df = pd.get_dummies(df, drop_first=True)

# Define feature set X and target y
# NOTE(review): X still contains transaction_qty and unit_price, and the target
# is exactly their product, so y is a deterministic function of two features.
# Confirm this is intended before reading the scores below as predictive skill.
X = df.drop(['Total Sales'], axis=1)  # Features
y = df['Total Sales']  # Target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Model Training and Evaluation


def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, X_test : feature matrices for the two splits
    y_train, y_test : target values for the two splits

    Returns
    -------
    tuple
        ``(y_pred, rmse, r2)``: the test-set predictions, the root of
        ``mean_squared_error`` and the ``r2_score`` on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return y_pred, rmse, r2


# Dictionary to store RMSE and R-squared for each model
model_results = {}

# Linear Regression (no random component, so no seed to pin)
lr_model = LinearRegression()
y_pred_lr, rmse_lr, r2_lr = _fit_and_score(lr_model, X_train, X_test, y_train, y_test)
model_results['Linear Regression'] = {'RMSE': rmse_lr, 'R^2': r2_lr}

# The remaining estimators are given the same seed as the train/test split so
# that re-running the cell reproduces the same metrics.

# Decision Tree
dt_model = DecisionTreeRegressor(random_state=42)
y_pred_dt, rmse_dt, r2_dt = _fit_and_score(dt_model, X_train, X_test, y_train, y_test)
model_results['Decision Tree'] = {'RMSE': rmse_dt, 'R^2': r2_dt}

# Random Forest
rf_model = RandomForestRegressor(random_state=42)
y_pred_rf, rmse_rf, r2_rf = _fit_and_score(rf_model, X_train, X_test, y_train, y_test)
model_results['Random Forest'] = {'RMSE': rmse_rf, 'R^2': r2_rf}

# XGBoost
xgb_model = XGBRegressor(random_state=42)
y_pred_xgb, rmse_xgb, r2_xgb = _fit_and_score(xgb_model, X_train, X_test, y_train, y_test)
model_results['XGBoost'] = {'RMSE': rmse_xgb, 'R^2': r2_xgb}

# Step 4: Display Model Results
for model_name, metrics in model_results.items():
    print(f"{model_name}: RMSE = {metrics['RMSE']}, R^2 = {metrics['R^2']}")


Linear Regression: RMSE = 0.8039139882505983, R^2 = 0.9293120576622209
Decision Tree: RMSE = 3.2531526078706373e-14, R^2 = 1.0
Random Forest: RMSE = 0.0032960178861133876, R^2 = 0.9999988117569347
XGBoost: RMSE = 0.0001283576835521317, R^2 = 0.9999999981979373


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import optuna

import warnings
warnings.filterwarnings("ignore")

# Step 1: Read the transactions from the Excel workbook
df = pd.read_excel('data/Coffee Shop Sales.xlsx')

# Step 2: Preprocessing
# Target column: quantity sold times unit price for each row
df['Total Sales'] = df['transaction_qty'].mul(df['unit_price'])

# Remove the identifier, date/time and product-detail columns, then one-hot
# encode the categorical columns (first level of each is dropped)
df = pd.get_dummies(
    df.drop(columns=['transaction_id', 'transaction_date', 'transaction_time', 'product_detail']),
    drop_first=True,
)

# Features are every column except the target.
# NOTE(review): transaction_qty and unit_price stay in X although the target is
# their product -- confirm this is intended.
y = df['Total Sales']
X = df.drop(columns=['Total Sales'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Define the objective functions for Random Forest and XGBoost with Optuna

def rf_objective(trial):
    """Score one Optuna trial of Random Forest hyperparameters.

    Samples ``n_estimators`` (50-500), ``max_depth`` (5-20),
    ``min_samples_split`` (2-10) and ``min_samples_leaf`` (1-4) from ``trial``,
    then runs 5-fold cross-validation on the module-level ``X_train`` /
    ``y_train``.

    Returns the mean ``neg_mean_squared_error`` across the folds, so values
    closer to zero are better.
    """
    # Sampled in a fixed order; dict literals evaluate left to right
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
    }
    forest = RandomForestRegressor(random_state=42, **params)

    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )
    return np.mean(fold_scores)

def xgb_objective(trial):
    """Score one Optuna trial of XGBoost hyperparameters.

    Samples ``n_estimators`` (50-500), ``max_depth`` (3-15), ``learning_rate``
    (0.01-0.3) and ``colsample_bytree`` (0.3-1.0) from ``trial``, then runs
    5-fold cross-validation on the module-level ``X_train`` / ``y_train``.

    Returns the mean ``neg_mean_squared_error`` across the folds, so values
    closer to zero are better.
    """
    # Sampled in a fixed order; dict literals evaluate left to right
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
    }
    booster = XGBRegressor(random_state=42, **params)

    fold_scores = cross_val_score(
        booster, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )
    return np.mean(fold_scores)

# Both studies get an explicitly seeded sampler. The objectives already pin
# random_state=42 on the estimators; without a sampler seed the sequence of
# suggested hyperparameters (and so the reported best params) would still
# change from run to run.

# Step 4: Optimize Random Forest
print("Optimizing Random Forest...")
rf_study = optuna.create_study(
    direction='maximize',  # objective returns negative MSE
    sampler=optuna.samplers.TPESampler(seed=42),
)
rf_study.optimize(rf_objective, n_trials=50)

# Step 5: Optimize XGBoost
print("Optimizing XGBoost...")
xgb_study = optuna.create_study(
    direction='maximize',  # objective returns negative MSE
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgb_study.optimize(xgb_objective, n_trials=50)

# Step 6: Train the best models using the optimized hyperparameters

# Random Forest: refit on the full training split with the best trial's params
best_rf_params = rf_study.best_params
print("Best Random Forest Params: ", best_rf_params)
rf_model = RandomForestRegressor(**best_rf_params, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# XGBoost: refit on the full training split with the best trial's params
best_xgb_params = xgb_study.best_params
print("Best XGBoost Params: ", best_xgb_params)
xgb_model = XGBRegressor(**best_xgb_params, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Step 7: Evaluate the models with the optimized parameters
# Metrics are computed on the held-out test split, which the search never saw.
print("\nModel Performance After Optimization:\n")

# Random Forest
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest - RMSE: {rmse_rf}, R²: {r2_rf}")

# XGBoost
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost - RMSE: {rmse_xgb}, R²: {r2_xgb}")


[I 2024-10-07 12:28:39,462] A new study created in memory with name: no-name-56a553fa-d52f-478f-a7bf-473029dcde79


Optimizing Random Forest...


[I 2024-10-07 12:29:38,358] Trial 0 finished with value: -0.014424981676025166 and parameters: {'n_estimators': 254, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 0 with value: -0.014424981676025166.
[I 2024-10-07 12:31:32,516] Trial 1 finished with value: -0.03273287443243654 and parameters: {'n_estimators': 500, 'max_depth': 19, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0 with value: -0.014424981676025166.
[I 2024-10-07 12:32:38,114] Trial 2 finished with value: -0.013531179723020698 and parameters: {'n_estimators': 286, 'max_depth': 12, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 2 with value: -0.013531179723020698.
[I 2024-10-07 12:34:29,192] Trial 3 finished with value: -0.004655442846965939 and parameters: {'n_estimators': 486, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 3 with value: -0.004655442846965939.
[I 2024-10-07 12:35:36,614] Trial 4 finished with value: -0.12891494