In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import classification_report

In [2]:
# Load the e-commerce orders export; the path is relative to the notebook's
# working directory.
df=pd.read_csv('E-Commerce.csv')
# Show the frame (pandas truncates the rendering to the first and last rows).
df

Unnamed: 0,CustomerID,Gender,InvoiceDate,InvoiceNumber,ProductID,Quantity,Price,Total,OrderStatus,Country,TrafficSource,SessionDuration,DeviceCategory,Device,OS,DeliveryRating,ProductRating,Sales
0,9691,Female,1/01/2019,26088332,32,3,91.37,274.11,Completed,India,Social Media,6.54,Computer,Laptop,Windows,2,5,274.11
1,8327,Female,1/01/2019,95183269,34,1,48.24,48.24,In Process,India,Social Media,5.01,Computer,Desktop,Windows,0,0,0.00
2,6801,Male,1/01/2019,44769684,64,2,35.23,70.46,Completed,United States,Paid Advertisment,0.10,Computer,Desktop,Windows,1,2,70.46
3,4406,Male,1/02/2019,12422351,41,2,32.33,64.66,Completed,Philippines,Social Media,1.95,Mobile,Tablet,Windows,4,4,64.66
4,1966,Male,1/03/2019,84352310,20,1,81.89,81.89,Completed,Brazil,Organic Search,9.15,Computer,Laptop,Windows,5,3,81.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3594,1130,Male,12/29/2023,71920492,75,3,32.34,97.02,In Process,Germany,Organic Search,9.42,Computer,Desktop,Mac,0,0,0.00
3595,8329,Male,12/30/2023,60510134,2,1,33.11,33.11,Completed,United States,Social Media,9.78,Computer,Laptop,Windows,4,5,33.11
3596,8023,Male,12/31/2023,52697896,2,1,33.11,33.11,Completed,India,Organic Search,9.80,Computer,Laptop,Windows,3,5,33.11
3597,8307,Male,12/31/2023,61266739,55,2,73.53,147.06,In Process,South Korea,Paid Advertisment,6.53,Computer,Laptop,Windows,0,0,0.00


## Data cleaning and checking

In [3]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3599 entries, 0 to 3598
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       3599 non-null   int64  
 1   Gender           3599 non-null   object 
 2   InvoiceDate      3599 non-null   object 
 3   InvoiceNumber    3599 non-null   int64  
 4   ProductID        3599 non-null   int64  
 5   Quantity         3599 non-null   int64  
 6   Price            3599 non-null   float64
 7   Total            3599 non-null   float64
 8   OrderStatus      3599 non-null   object 
 9   Country          3599 non-null   object 
 10  TrafficSource    3599 non-null   object 
 11  SessionDuration  3599 non-null   float64
 12  DeviceCategory   3599 non-null   object 
 13  Device           3599 non-null   object 
 14  OS               3599 non-null   object 
 15  DeliveryRating   3599 non-null   int64  
 16  ProductRating    3599 non-null   int64  
 17  Sales         

In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,CustomerID,InvoiceNumber,ProductID,Quantity,Price,Total,SessionDuration,DeliveryRating,ProductRating,Sales
count,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0
mean,5526.962767,55821170.0,50.864962,2.000556,48.714571,97.394143,4.992101,2.641011,2.653515,61.185285
std,2586.425565,25681690.0,29.035862,0.813313,29.499165,74.421471,2.896605,2.181087,2.179732,75.880928
min,1111.0,11140550.0,1.0,1.0,1.22,1.22,0.01,0.0,0.0,0.0
25%,3288.0,33556990.0,26.0,1.0,26.03,37.61,2.49,0.0,0.0,0.0
50%,5546.0,56142930.0,52.0,2.0,43.5,81.46,4.93,3.0,4.0,31.26
75%,7777.0,78101600.0,76.0,3.0,74.55,147.06,7.515,5.0,5.0,98.04
max,9998.0,99963540.0,100.0,3.0,99.72,299.16,10.0,5.0,5.0,299.16


In [5]:
# List the column labels available for feature selection.
df.columns

Index(['CustomerID', 'Gender', 'InvoiceDate', 'InvoiceNumber', 'ProductID',
       'Quantity', 'Price', 'Total', 'OrderStatus', 'Country', 'TrafficSource',
       'SessionDuration', 'DeviceCategory', 'Device', 'OS', 'DeliveryRating',
       'ProductRating', 'Sales'],
      dtype='object')

In [6]:
# Count missing values per column.
df.isna().sum()

CustomerID         0
Gender             0
InvoiceDate        0
InvoiceNumber      0
ProductID          0
Quantity           0
Price              0
Total              0
OrderStatus        0
Country            0
TrafficSource      0
SessionDuration    0
DeviceCategory     0
Device             0
OS                 0
DeliveryRating     0
ProductRating      0
Sales              0
dtype: int64

In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [8]:
# Distinct values present in the Gender column.
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [9]:
# Parse InvoiceDate into datetimes. The displayed rows are written
# month/day/year (e.g. 12/29/2023), so the format is stated explicitly instead
# of being inferred. errors='coerce' is kept: a value that does not match the
# format becomes NaT rather than raising.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y', errors='coerce')

In [10]:
# Distinct values of the target column.
df['OrderStatus'].unique()

array(['Completed', 'In Process', 'Returned', 'Cancelled'], dtype=object)

## Identifying the dependent and independent variables
#### Don't drop features based on assumption. Run:
#### Crosstab + percentage
#### Feature importance check
#### (Optional) Statistical tests

In [11]:
# Target: the order status. Features: every remaining column.
y = df['OrderStatus']
x = df.drop(columns=['OrderStatus'])

##### CustomerID - Unique to each customer
##### InvoiceNumber — unique to each order
##### ProductID — may or may not help (depends on modelling strategy)

In [12]:
# Build the feature frame directly from df, leaving out the target, the
# identifier columns and Gender / Device / OS. Deriving it from df (instead of
# dropping in place from x) lets this cell be re-run without a KeyError for
# columns that were already removed.
x = df.drop(columns=['OrderStatus', 'CustomerID', 'InvoiceNumber', 'ProductID',
                     'Gender', 'Device', 'OS'])
x.columns

Index(['InvoiceDate', 'Quantity', 'Price', 'Total', 'Country', 'TrafficSource',
       'SessionDuration', 'DeviceCategory', 'DeliveryRating', 'ProductRating',
       'Sales'],
      dtype='object')

#### Gender
#####  Based on this data, Gender does not have a significant effect on OrderStatus.

In [13]:
# Share of each OrderStatus within every Gender, as percentages
# (normalize='index' makes each row sum to 100 before rounding).
g = pd.crosstab(df['Gender'], df['OrderStatus'], normalize='index').mul(100)
g.round().astype(int)

OrderStatus,Cancelled,Completed,In Process,Returned
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,13,61,19,6
Male,12,63,18,7


#### TrafficSource	
##### These percentage differences are minor, suggesting that TrafficSource has very little or no significant effect on OrderStatus.

In [14]:
# Share of each OrderStatus within every TrafficSource, as percentages
# (normalize='index' makes each row sum to 100 before rounding).
t = pd.crosstab(df['TrafficSource'], df['OrderStatus'], normalize='index').mul(100)
t.round().astype(int)

OrderStatus,Cancelled,Completed,In Process,Returned
TrafficSource,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Organic Search,12,62,19,6
Paid Advertisment,14,66,15,6
Social Media,13,61,19,7


##### After training the models and checking the feature importances (see the feature-importance section at the end of the notebook), I removed the low-importance features below and retrained to make the model more efficient.

In [15]:
# Remove DeviceCategory, Quantity and TrafficSource from the feature frame.
x = x.drop(columns=['DeviceCategory', 'Quantity', 'TrafficSource'])

In [16]:
# Remove Country from the feature frame.
x = x.drop(columns=['Country'])

In [17]:
# Remove Price from the feature frame.
x = x.drop(columns=['Price'])


In [18]:
# Remove InvoiceDate from the feature frame.
x = x.drop(columns=['InvoiceDate'])


In [19]:
# Features that remain after the drops above and are fed to the models.
x.columns

Index(['Total', 'SessionDuration', 'DeliveryRating', 'ProductRating', 'Sales'], dtype='object')

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise every remaining feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows before any train/test or
# cross-validation split, so held-out rows contribute to the scaling
# statistics; consider fitting it inside each split.
# NOTE(review): in the rows displayed when the data was loaded, Sales,
# DeliveryRating and ProductRating are 0 for "In Process" orders; they may only
# be populated once an order completes, i.e. leak the target. TODO confirm.
scaler = StandardScaler().fit(x)
X = scaler.transform(x)


### Stratified K-Fold cross-validation

In [21]:
def get_score(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and return its score on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``.
    x_train, y_train : features and labels passed to ``model.fit``.
    x_test, y_test : features and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(x_test, y_test)`` returns.
    """
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [22]:
# Candidate classifiers to compare, keyed by the name used in reports.
# random_state is pinned on the estimators that use randomness so the scores
# are reproducible after Restart & Run All.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=42),
}
# One (initially empty) list of per-fold accuracies for each model name.
model_scores = {name: [] for name in models}


In [23]:
from sklearn.model_selection import StratifiedKFold

# Start from empty score lists so that re-running this cell does not append a
# second set of fold scores to the ones already collected.
model_scores = {name: [] for name in models}

# Stratified folds keep the OrderStatus class proportions similar in each split.
skf = StratifiedKFold(n_splits=4)
for train_index, test_index in skf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    for name, model in models.items():
        model.fit(x_train, y_train)
        # Predict once and reuse the predictions for both the accuracy (the
        # fraction of correct labels, which is what a classifier's .score()
        # reports) and the per-class report.
        y_pred = model.predict(x_test)
        score = float(np.mean(y_pred == y_test.to_numpy()))
        model_scores[name].append(score)
        print(f"\n{name} Classification Report:")
        # zero_division=0: report 0 instead of warning when a class is never predicted.
        print(classification_report(y_test, y_pred, zero_division=0))



Logistic Regression Classification Report:
              precision    recall  f1-score   support

   Cancelled       0.00      0.00      0.00       114
   Completed       1.00      1.00      1.00       562
  In Process       0.49      1.00      0.66       166
    Returned       0.00      0.00      0.00        58

    accuracy                           0.81       900
   macro avg       0.37      0.50      0.41       900
weighted avg       0.72      0.81      0.75       900


Random Forest Classification Report:
              precision    recall  f1-score   support

   Cancelled       0.28      0.27      0.28       114
   Completed       1.00      1.00      1.00       562
  In Process       0.43      0.50      0.46       166
    Returned       0.19      0.12      0.15        58

    accuracy                           0.76       900
   macro avg       0.48      0.47      0.47       900
weighted avg       0.75      0.76      0.75       900


SVM Classification Report:
              precis

In [24]:
# Per-fold accuracies: one row per fold, one column per model.
result_df = pd.DataFrame(data=model_scores)
result_df


Unnamed: 0,Logistic Regression,Random Forest,SVM,Decision Tree
0,0.808889,0.758889,0.808889,0.752222
1,0.808889,0.754444,0.808889,0.743333
2,0.807778,0.76,0.807778,0.757778
3,0.808676,0.770857,0.808676,0.770857


#### Logistic Regression & SVM: 🎯 Only predict Completed and In Process; the other classes are never predicted. ❌
#### Random Forest: 👍 Predicts all classes, decent recall & macro F1. ✅
#### Decision Tree: Close to Random Forest, does better on Returned. ✅

In [25]:
# Final model: a class-weighted random forest trained on every labelled row.
# It is fitted on the full scaled matrix X rather than on x_train / y_train,
# which would be whatever split an earlier cell last left in the kernel (only
# the last cross-validation fold on a top-to-bottom run).
# random_state is pinned so the saved model can be reproduced.
final_model = RandomForestClassifier(class_weight='balanced', random_state=42)
final_model.fit(X, y)

In [26]:
import joblib

# The model is trained on StandardScaler output, so the fitted scaler is saved
# next to it; without it, new raw data cannot be transformed the same way at
# prediction time.
joblib.dump(scaler, 'final_scaler.pkl')
# Last expression, so the notebook displays the path of the saved model.
joblib.dump(final_model, 'final_rf_model.pkl')

['final_rf_model.pkl']

##### The cells below are the earlier experiments on a single train/test split, done before the stratified k-fold comparison. Their scores were not very good, so they are kept here for comparison with the cross-validated results above, from which the best model was chosen and saved.

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# stratify=y keeps the (imbalanced) OrderStatus proportions the same in both
# parts; random_state makes the split, and the scores below, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [28]:
# Baseline random forest on the single train/test split.
# random_state is pinned so the score and feature importances are reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)


In [29]:
# Accuracy of the random forest on the held-out split.
rf.score(x_test,y_test)

0.7861111111111111

In [30]:
# Support vector classifier with default settings on the single train/test split.
svm = SVC()
svm.fit(x_train,y_train)

In [31]:
# Accuracy of the SVM on the held-out split.
svm.score(x_test,y_test)

0.8203703703703704

In [32]:
# Logistic regression with class_weight='balanced' on the single train/test split.
# NOTE(review): the other models fitted on this split use default class
# weights, so this score is not a like-for-like comparison. Confirm this is intended.
lr = LogisticRegression(max_iter=1000,class_weight='balanced')
lr.fit(x_train,y_train)

In [33]:
# Baseline decision tree on the single train/test split.
# random_state is pinned so the score is reproducible.
dt = tree.DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)

In [34]:
# Accuracy of the logistic regression on the held-out split.
lr.score(x_test,y_test)

0.7527777777777778

In [35]:
# Accuracy of the decision tree on the held-out split.
dt.score(x_test,y_test)

0.7685185185185185

### Feature importance to eliminate irrelevant variables

In [36]:
# Random-forest feature importances labelled by feature name, largest first.
(
    pd.Series(data=rf.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
)

DeliveryRating     0.230013
SessionDuration    0.223017
Total              0.198160
Sales              0.174814
ProductRating      0.173995
dtype: float64