In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [5]:
df = pd.read_csv('data.csv')

In [6]:
# Encode categorical features
df_encoded = df.copy()
label_cols = ['Product', 'Category', 'Customer Location', 'Payment Method']
for col in label_cols:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

X = df_encoded.drop(['Order ID', 'Date', 'Customer Name', 'Status'], axis=1)
y = df_encoded['Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.34


In [7]:
df['Status'].value_counts(normalize=True)

Status
Completed    0.352
Pending      0.340
Cancelled    0.308
Name: proportion, dtype: float64

## Add new features

In [8]:
df['Date'].head()

0    14-03-25
1    20-03-25
2    15-02-25
3    19-02-25
4    10-03-25
Name: Date, dtype: object

In [9]:
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')  # if your format is like 12-04-2023

In [10]:
# Feature: Sales per item
df['Sales Per Item'] = df['Total Sales'] / df['Quantity']

# Feature: Day of week
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Weekday'] = df['Date'].dt.day_name()

# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()
label_cols = ['Product', 'Category', 'Customer Location', 'Payment Method', 'Weekday']
for col in label_cols:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Target encoding
df_encoded['Status_encoded'] = LabelEncoder().fit_transform(df_encoded['Status'])

# Features and target
X = df_encoded[['Price', 'Quantity', 'Sales Per Item', 'Product', 'Category',
                'Customer Location', 'Payment Method', 'Weekday']]
y = df_encoded['Status_encoded']


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", clf.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.32
[[8 6 2]
 [5 3 9]
 [5 7 5]]
              precision    recall  f1-score   support

           0       0.44      0.50      0.47        16
           1       0.19      0.18      0.18        17
           2       0.31      0.29      0.30        17

    accuracy                           0.32        50
   macro avg       0.31      0.32      0.32        50
weighted avg       0.31      0.32      0.32        50



In [12]:


model = XGBClassifier(n_estimators=200, max_depth=8, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.31      0.31      0.31        16
           1       0.12      0.12      0.12        17
           2       0.39      0.41      0.40        17

    accuracy                           0.28        50
   macro avg       0.28      0.28      0.28        50
weighted avg       0.27      0.28      0.28        50

