Import all necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the raw sales table into `df`.
# NOTE(review): the path is absolute and rooted at "/", so it only resolves in
# an environment where the CSV was placed there — adjust it for your setup.
DATA_PATH = "/sales prediction.csv"
df = pd.read_csv(DATA_PATH)

Data Exploration

In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,1,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,12/06/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368


In [None]:
# Summarise column dtypes and non-null counts. In the recorded run the two
# date columns load as `object` and `Postal Code` has a few missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9800 non-null   int64  
 1   Order ID       9800 non-null   object 
 2   Order Date     9800 non-null   object 
 3   Ship Date      9800 non-null   object 
 4   Ship Mode      9800 non-null   object 
 5   Customer ID    9800 non-null   object 
 6   Customer Name  9800 non-null   object 
 7   Segment        9800 non-null   object 
 8   Country        9800 non-null   object 
 9   City           9800 non-null   object 
 10  State          9800 non-null   object 
 11  Postal Code    9789 non-null   float64
 12  Region         9800 non-null   object 
 13  Product ID     9800 non-null   object 
 14  Category       9800 non-null   object 
 15  Sub-Category   9800 non-null   object 
 16  Product Name   9800 non-null   object 
 17  Sales          9800 non-null   float64
dtypes: float

Data Preprocessing

In [None]:
# Feature engineering: turn the raw order table into a numeric model matrix.
#
# Everything is derived into new names and `df` itself is left untouched, so
# this cell can be re-run on its own. (Previously the date columns were parsed
# and then dropped from `df` in place, so running the cell a second time
# raised a KeyError, and new columns were assigned onto the result of
# `dropna`, which can trigger SettingWithCopyWarning.)

# Identifier-like columns that are excluded from the feature set.
ID_COLUMNS = ['Row ID', 'Order ID', 'Customer ID', 'Customer Name',
              'Product ID', 'Product Name']
DATE_COLUMNS = ['Order Date', 'Ship Date']

# Parse the date strings day-first; anything unparseable becomes NaT.
order_date = pd.to_datetime(df['Order Date'], dayfirst=True, errors='coerce')
ship_date = pd.to_datetime(df['Ship Date'], dayfirst=True, errors='coerce')

# Keep only the rows where both dates could be parsed.
has_both_dates = order_date.notna() & ship_date.notna()
order_date = order_date[has_both_dates]
ship_date = ship_date[has_both_dates]

# Drop the raw date/identifier columns, then append the derived date features.
# The new columns are appended in the same order as before, so the resulting
# feature matrix has an unchanged column layout.
sales_features = (
    df.loc[has_both_dates]
    .drop(columns=DATE_COLUMNS + ID_COLUMNS)
    .assign(**{
        'Shipping Days': (ship_date - order_date).dt.days,
        'Order_Year': order_date.dt.year,
        'Order_Month': order_date.dt.month,
        'Order_Day': order_date.dt.day,
        'Ship_Year': ship_date.dt.year,
        'Ship_Month': ship_date.dt.month,
        'Ship_Day': ship_date.dt.day,
    })
)

# One-hot encode the remaining categorical columns. `drop_first=True` removes
# the first level of every categorical, so that level is represented by all of
# its sibling dummy columns being 0.
sales_encoded = pd.get_dummies(sales_features, drop_first=True)

# Feature matrix and regression target.
X = sales_encoded.drop(columns=['Sales'])
y = sales_encoded['Sales']

Data splitting and model fitting

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Hyper-parameters for the XGBoost regressor, gathered in one place so they
# are easy to tweak; `random_state` is fixed for reproducibility.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}
xgb_model = XGBRegressor(**xgb_params)

# Fit on the training split only; the test split stays unseen until evaluation.
xgb_model.fit(X_train, y_train)

Prediction and Evaluation

In [None]:
# Score the fitted model on the held-out test split.
# NOTE(review): the recorded run reports an R2 of about 0.06, i.e. the model
# explains very little of the variance in Sales — treat predictions as rough.
y_pred = xgb_model.predict(X_test)

mse, r2 = (
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

print(f"XGBoost MSE: {mse:.2f}")
print(f"XGBoost R2 Score: {r2:.2f}")

XGBoost MSE: 629500.43
XGBoost R2 Score: 0.06


In [None]:
# 5-fold cross-validated R2 over the full feature matrix; the estimator is
# refit on each fold, and the mean and spread of the fold scores are reported.
cv_scores = cross_val_score(
    estimator=xgb_model,
    X=X,
    y=y,
    scoring='r2',
    cv=5,
)
print(f"XGBoost 5-fold CV R2: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

XGBoost 5-fold CV R2: 0.04 ± 0.06


User-centric function (customized input)

In [None]:
def _build_feature_row(numeric_values, categorical_values, feature_columns):
    """Build a one-row feature dict aligned with the model's columns.

    Args:
        numeric_values: mapping of feature name -> numeric value, copied
            into the row as given.
        categorical_values: mapping of categorical name (e.g. ``'Region'``)
            -> the level entered by the user. The matching one-hot column is
            looked up as ``'<name>_<level>'``.
        feature_columns: iterable of the column names the model expects.

    Returns:
        dict with one entry per feature column (default 0), plus the numeric
        values, with the matching one-hot columns set to 1.
    """
    columns = list(feature_columns)

    # Every feature defaults to 0; anything the user is not asked for
    # therefore stays at 0.
    row = dict.fromkeys(columns, 0)
    row.update(numeric_values)

    known = set(columns)
    # Fallback lookup so that e.g. "east" still finds "Region_East".
    by_folded_name = {str(col).casefold(): col for col in columns}

    for prefix, level in categorical_values.items():
        wanted = f'{prefix}_{level.strip()}'
        column = wanted if wanted in known else by_folded_name.get(wanted.casefold())
        # No matching column means the row keeps all dummies of this
        # categorical at 0. With `drop_first=True` encoding that is how the
        # dropped (baseline) level looks, so a mistyped level cannot be told
        # apart from the baseline here.
        if column is not None:
            row[column] = 1

    return row


def predict_sales_from_user(model=None, feature_columns=None):
    """Prompt for one order's details and print the predicted sales amount.

    Args:
        model: fitted estimator exposing ``predict``; defaults to the
            notebook-level ``xgb_model``.
        feature_columns: column names (in training order) the model expects;
            defaults to ``X.columns``.

    Returns:
        None. The prediction is printed as ``Predicted Sales: $<amount>``.

    Raises:
        ValueError: if one of the numeric prompts is answered with text that
            ``int()`` cannot parse.

    Notes:
        Categorical answers are stripped of surrounding whitespace and matched
        case-insensitively against the one-hot columns. "Shipping Days" is
        used exactly as entered; it is not cross-checked against the order and
        ship dates.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # needed when the caller does not pass explicit values.
    if model is None:
        model = xgb_model
    if feature_columns is None:
        feature_columns = X.columns

    # Dict literals evaluate in order, so the prompts appear in this order.
    numeric_values = {
        'Shipping Days': int(input("Enter Shipping Days: ")),
        'Order_Year': int(input("Enter Order Year (e.g. 2018): ")),
        'Order_Month': int(input("Enter Order Month (1-12): ")),
        'Order_Day': int(input("Enter Order Day (1-31): ")),
        'Ship_Year': int(input("Enter Ship Year (e.g. 2018): ")),
        'Ship_Month': int(input("Enter Ship Month (1-12): ")),
        'Ship_Day': int(input("Enter Ship Day (1-31): ")),
    }

    categorical_values = {
        'Ship Mode': input("Enter Ship Mode (First Class / Same Day / Second Class / Standard Class): "),
        'Segment': input("Enter Segment (Consumer / Corporate / Home Office): "),
        'Region': input("Enter Region (Central / East / South / West): "),
        'Category': input("Enter Category (Furniture / Office Supplies / Technology): "),
        'Sub-Category': input("Enter Sub-Category (e.g. Chairs / Phones / Binders): "),
    }

    row = _build_feature_row(numeric_values, categorical_values, feature_columns)

    # Select the model's columns explicitly so the frame has exactly the
    # training-time columns in the training-time order.
    user_df = pd.DataFrame([row])[list(feature_columns)]

    predicted_sales = model.predict(user_df)[0]
    print(f"Predicted Sales: ${predicted_sales:.2f}")

# Run the interactive prompt once as a manual smoke test. This blocks until
# every question has been answered from the keyboard.
predict_sales_from_user()

Enter Shipping Days: 5
Enter Order Year (e.g. 2018): 2025
Enter Order Month (1-12): 2
Enter Order Day (1-31): 3
Enter Ship Year (e.g. 2018): 2025
Enter Ship Month (1-12): 8
Enter Ship Day (1-31): 8
Enter Ship Mode (First Class / Same Day / Second Class / Standard Class): First Class
Enter Segment (Consumer / Corporate / Home Office): Consumer
Enter Region (Central / East / South / West): East
Enter Category (Furniture / Office Supplies / Technology): Technology
Enter Sub-Category (e.g. Chairs / Phones / Binders): Phones
✅ Predicted Sales: $331.49
