In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

### Data Preprocessing:

In [71]:
# Read the customer retail CSV into a DataFrame and preview its first five rows
data = pd.read_csv("customer_retail_data.csv")
data.head(n=5)

Unnamed: 0,Customer ID,First Name,Last Name,Age,Gender,Location,Product Category,Product,Price,Quantity,Purchase Date,Payment method,Order Status
0,1,Parker,Villarreal,56,Male,New Nancychester,Home Appliances,Microwave,759.96,9,12-05-22,PayPal,Delivered
1,2,Nancy,Randall,32,Male,Andreaside,Beauty,Face Cream,310.16,3,19-06-22,Google Pay,Shipped
2,3,Dennis,Patterson,61,Female,Stacybury,Books,Fiction,190.64,4,24-10-22,Debit Card,Delivered
3,4,Christopher,Peterson,19,Other,Port Tammystad,Home Appliances,Refrigerator,145.75,6,14-03-23,Credit Card,Delivered
4,5,Jessica,Hogan,63,Female,New Michael,Clothing,Shirt,998.5,7,04-08-22,Debit Card,Shipped


In [72]:
# Count missing values per column (isna is the canonical spelling of isnull)
data.isna().sum()

Customer ID         0
First Name          0
Last Name           0
Age                 0
Gender              0
Location            0
Product Category    0
Product             0
Price               0
Quantity            0
Purchase Date       0
Payment method      0
Order Status        0
dtype: int64

In [73]:
# Remove the columns that are not used as model features.
# The frame is reassigned rather than mutated in place, and errors="ignore" lets the
# cell be re-run after the columns are already gone without raising a KeyError.
UNUSED_COLUMNS = ["Customer ID", "First Name", "Last Name", "Location", "Purchase Date"]
data = data.drop(columns=UNUSED_COLUMNS, errors="ignore")

In [74]:
# Encode categorical variables as integer codes.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`, so any
# column's codes can be mapped back to the original labels with
# label_encoders[column].inverse_transform(...). Re-fitting a single shared encoder
# would keep only the mapping of the last column processed.
CATEGORICAL_COLUMNS = ['Gender', 'Product Category', 'Product', 'Payment method']

label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    labelencoder = LabelEncoder()
    data[column] = labelencoder.fit_transform(data[column])
    label_encoders[column] = labelencoder
# After the loop `labelencoder` is the encoder fitted on 'Payment method'.

In [75]:
# Preview the first five rows after column removal and categorical encoding
data.head(n=5)

Unnamed: 0,Age,Gender,Product Category,Product,Price,Quantity,Payment method,Order Status
0,56,1,4,14,759.96,9,4,Delivered
1,32,1,0,5,310.16,3,3,Shipped
2,61,0,1,6,190.64,4,2,Delivered
3,19,2,4,19,145.75,6,1,Delivered
4,63,0,2,21,998.5,7,2,Shipped


In [76]:
# Separate the target from the predictors, then hold out 30% of the rows for testing
y = data['Order Status']
X = data.drop(columns=['Order Status'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Building the Decision Tree Model:

In [77]:
# Initialize the model.
# random_state is fixed so the fitted tree, and therefore the reported accuracy, is
# the same on every run; without it the tree can differ from run to run.
clf = DecisionTreeClassifier(criterion="gini", random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's performance.
# With four roughly balanced classes, chance-level accuracy is about 0.25, so read
# the score against that baseline.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.25004444444444446


### Interpretation of the Decision Tree:

In [78]:
# NOTE(review): the tree visualisation below is commented out, so this section
# currently produces no output. Presumably it was disabled because the unpruned tree
# is too large to draw in reasonable time — TODO confirm. If it is re-enabled,
# plot_tree accepts a max_depth argument that limits how many levels are drawn.
# from sklearn.tree import plot_tree
# import matplotlib.pyplot as plt

# # Visualize the decision tree
# plt.figure(figsize=(15,10))
# plot_tree(clf, filled=True, feature_names=X.columns)
# plt.show()

### Evaluating the Performance of the Decision Tree Model:

In [79]:
# Per-class precision, recall and F1 on the held-out test split
print(classification_report(y_test, y_pred))

# Confusion matrix as a labelled table: rows are the actual classes, columns the
# predicted ones. labels=clf.classes_ pins the row/column order to the same class
# names used for the axis labels.
confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=clf.classes_),
    index=pd.Index(clf.classes_, name="actual"),
    columns=pd.Index(clf.classes_, name="predicted"),
)
confusion

              precision    recall  f1-score   support

   Cancelled       0.25      0.25      0.25     22539
   Delivered       0.25      0.25      0.25     22637
    Returned       0.25      0.25      0.25     22429
     Shipped       0.25      0.25      0.25     22395

    accuracy                           0.25     90000
   macro avg       0.25      0.25      0.25     90000
weighted avg       0.25      0.25      0.25     90000

[[5671 5654 5646 5568]
 [5604 5649 5729 5655]
 [5555 5565 5645 5664]
 [5671 5650 5535 5539]]
