In [1]:
import os

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv(r'C:\Users\17472\Desktop\kat\Online Sales Data.csv')  # Provide the correct file path

# Check the column names
print(df.columns)

# Fill missing values
df['Units Sold'] = df['Units Sold'].fillna(df['Units Sold'].mean())
df['Product Category'] = df['Product Category'].fillna(df['Product Category'].mode()[0])

# Encode categorical features
le = LabelEncoder()
df['Region'] = le.fit_transform(df['Region'])

# Define features (X) and target variable (y) for prediction
X = df[['Units Sold', 'Unit Price']]  # Features
y = df['Total Revenue']  # Target variable for prediction

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Perform GridSearchCV to find the best combination of parameters
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Display the best parameters
print(f"Best Parameters: {grid_search.best_params_}")

# Predict with the best model
y_pred_best = grid_search.best_estimator_.predict(X_test)

# Evaluate the optimized model using Mean Squared Error (MSE)
mse_best = mean_squared_error(y_test, y_pred_best)
print(f"Mean Squared Error (Best Model): {mse_best}")

# Save the best model for future use
joblib.dump(grid_search.best_estimator_, 'best_xgb_model.pkl')


Index(['Transaction ID', 'Date', 'Product Category', 'Product Name',
       'Units Sold', 'Unit Price', 'Total Revenue', 'Region',
       'Payment Method'],
      dtype='object')
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300}
Mean Squared Error (Best Model): 720.3196010721576


['best_xgb_model.pkl']