In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def score(x_train, x_test, y_train, y_test, y_pred, algo):
    """Print a short evaluation summary for a fitted estimator.

    Parameters
    ----------
    x_train, y_train : training features and labels, passed to ``algo.score``.
    x_test, y_test : held-out features and labels, passed to ``algo.score``;
        ``y_test`` is also compared against ``y_pred``.
    y_pred : predictions that are compared against ``y_test``.
    algo : fitted estimator exposing ``score(X, y)``.

    Prints the train and test scores (rounded to two decimals) followed by
    the mean squared error, mean absolute error and R-squared between
    ``y_test`` and ``y_pred``. Nothing is returned.
    """
    # Error metrics are computed first, in the order they are reported.
    error_metrics = {
        "Mean Squared Error": mean_squared_error(y_test, y_pred),
        "Mean Absolute Error": mean_absolute_error(y_test, y_pred),
        "R-squared Score": r2_score(y_test, y_pred),
    }
    # The estimator's own score() on each split, rounded for display.
    split_scores = {
        "Train": round(algo.score(x_train, y_train), 2),
        "Test": round(algo.score(x_test, y_test), 2),
    }
    for split_name, value in split_scores.items():
        print(f"{split_name} score: {value}")
    for label, value in error_metrics.items():
        print(f"{label}: {value:.2f}")

In [None]:
# Location of the raw dataset. This is a machine-specific absolute path, so
# it has to be adjusted when the notebook runs anywhere else.
csv_path = r"C:\Users\jam\Desktop\ML\Commerce-Customer-Behavior-Sales-Analysis-Prediction\ecommerce_customer_behavior_dataset.csv"
df = pd.read_csv(csv_path)
# Show three randomly chosen rows as a quick sanity check.
df.sample(n=3)

In [53]:
# Show the column labels of df.
# NOTE(review): the recorded output below has no Order_ID, Customer_ID or Date
# and already contains Month and Weekend, although the following cells still
# drop / create those columns. It looks like it was captured from an
# out-of-order run; re-run the notebook top to bottom to confirm.
df.columns

Index(['Age', 'Gender', 'City', 'Product_Category', 'Unit_Price', 'Quantity',
       'Discount_Amount', 'Total_Amount', 'Payment_Method', 'Device_Type',
       'Session_Duration_Minutes', 'Pages_Viewed', 'Is_Returning_Customer',
       'Delivery_Time_Days', 'Customer_Rating', 'Month', 'Weekend'],
      dtype='object')

In [None]:
# Remove the identifier columns from the frame.
# errors="ignore" keeps this cell safe to re-run: once the columns are gone a
# plain drop would raise KeyError, because df is overwritten in place.
df = df.drop(columns=["Order_ID", "Customer_ID"], errors="ignore")

In [None]:
# (rows, columns) of df after the identifier columns were dropped.
df.shape

In [None]:
# Discard every row that has at least one missing value; df is overwritten.
df = df.dropna()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def encode(feature):
    """Label-encode one column of the global ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on ``df[feature]``, the column is
    overwritten with the encoded values, and the frequency of each encoded
    value is printed. The fitted encoder is not kept and nothing is returned.
    """
    encoded = LabelEncoder().fit_transform(df[feature])
    df[feature] = encoded
    print(df[feature].value_counts())

In [None]:
# Label-encode the City column in place and print the encoded value counts.
encode("City")

In [None]:
# Label-encode the Gender column in place and print the encoded value counts.
encode("Gender")

In [None]:
# Label-encode the Product_Category column in place and print the encoded value counts.
encode('Product_Category')

In [None]:
# Label-encode the Device_Type column in place and print the encoded value counts.
encode('Device_Type')

In [None]:
# Label-encode the Is_Returning_Customer column in place and print the encoded value counts.
encode('Is_Returning_Customer')

In [None]:
# Label-encode the Payment_Method column in place and print the encoded value counts.
encode("Payment_Method")

In [None]:
# Parse the ISO-formatted date strings, then derive calendar features from them.
df["Date"] = pd.to_datetime(df["Date"], format='%Y-%m-%d')
date_parts = df["Date"].dt
df["Month"] = date_parts.month
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
df["Weekend"] = date_parts.dayofweek.ge(5).astype(int)

In [None]:
# Remove the Date column; Month and Weekend were derived from it in the previous cell.
df = df.drop(columns=['Date'])

In [None]:
# (rows, columns) of df after encoding and the date-feature step.
df.shape

In [None]:
# Print dtypes and non-null counts for every column.
df.info()

In [None]:
# Histogram of every column in df, one subplot per column.
col = df.columns
colors = plt.cm.tab20.colors

# Size the grid from the number of columns (ceiling division) instead of a
# fixed 5x4 layout, so a frame with more than 20 columns cannot index past
# the available axes. With up to 20 columns this yields the same 5x4 / 20x15
# figure only when 17-20 columns are present; fewer columns give fewer rows.
n_hist_cols = 4
n_hist_rows = max(1, -(-len(col) // n_hist_cols))
fig, axs = plt.subplots(nrows=n_hist_rows, ncols=n_hist_cols,
                        figsize=(20, 3 * n_hist_rows))
axs = axs.flatten()
# A dedicated loop variable keeps `col` from being overwritten by its own
# elements while it is being iterated.
for i, col_name in enumerate(col):
    axs[i].hist(df[col_name], bins=30, color=colors[i % len(colors)], edgecolor='black')
    axs[i].set_title(col_name)

plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Pairwise correlations between all columns of df, annotated to two decimals.
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax_corr)
ax_corr.set_title('Correlation matrix')
plt.show()


In [None]:
# Column labels of df after preprocessing.
df.columns

In [None]:
import math

target = 'Customer_Rating'
# Every column except the rating gets its own panel.
numeric_cols = [name for name in df.columns if name != target]

# Four panels per row; the row count is rounded up to fit all columns.
n_cols = 4
n_rows = math.ceil(len(numeric_cols) / n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows*4))
axes = axes.flatten()

# Options shared by every panel: boxes are grouped and coloured by the rating
# value, and no per-panel legend is drawn.
shared_box_options = dict(
    x=target,
    hue=target,
    data=df,
    palette='Set3',
    dodge=False,
    legend=False,
)
for panel, feature in zip(axes, numeric_cols):
    sns.boxplot(y=feature, ax=panel, **shared_box_options)
    panel.set_title(f'{feature} vs {target}')

plt.tight_layout()
plt.show()

In [None]:
# Label column name(s) as a list; it stays empty if df has no 'Customer_Rating' column.
target = [name for name in df.columns if name == 'Customer_Rating']


In [None]:
# Features: every column except the one(s) listed in `target`.
x = df.drop(columns=target)
# Labels: the target column(s) flattened into a 1-D array.
y = df[target].to_numpy().ravel()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler

In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
x_train , x_test , y_train , y_test = train_test_split(x,y, test_size= 0.3 , random_state=4)
# These print the shapes of the full x and y, not of the train/test splits.
print(x.shape)
print(y.shape)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split.
s = StandardScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)
# Peek at one scaled training row (row index 5, all columns).
x_train[5,:]

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [54]:
# Grid-search n_neighbors over 1..49 with 5-fold cross-validation on the
# training split, then print the best parameter setting found.
p_grid = {"n_neighbors":np.arange(1,50)}
knn= KNeighborsClassifier()
knn_cv=GridSearchCV(knn,p_grid,cv=5)
knn_cv.fit(x_train,y_train)
print(knn_cv.best_params_)

{'n_neighbors': np.int64(41)}


In [55]:
# Refit KNN on the training split using the parameters selected by the grid
# search (knn_cv) instead of a hand-copied constant, which would silently go
# stale whenever the data or the search grid changes.
knn = KNeighborsClassifier(**knn_cv.best_params_)
knn.fit(x_train,y_train)
# Predictions for the held-out split; y_pred is reused by later cells.
y_pred = knn.predict(x_test)

In [56]:
# Print train/test scores and error metrics for the KNN classifier.
# NOTE(review): score() also prints MSE/MAE/R-squared, which treat the
# predicted class labels as numbers; confirm that is intended.
score(x_train,x_test,y_train,y_test,y_pred,knn)

Train score: 0.41
Test score: 0.36
Mean Squared Error: 1.75
Mean Absolute Error: 0.94
R-squared Score: -0.45


In [None]:
# Confusion matrix of the true test labels against the current y_pred.
xx=confusion_matrix(y_test,y_pred)

In [None]:
# Render the confusion matrix computed in the previous cell.
xx_dis = ConfusionMatrixDisplay(confusion_matrix=xx)
xx_dis.plot()
plt.show()

In [58]:
from sklearn.linear_model import LogisticRegression

In [59]:
# Fit a logistic-regression classifier (lbfgs solver, at most 1000 iterations)
# on the scaled training split.
lr = LogisticRegression(max_iter=1000,solver='lbfgs')
lr.fit(x_train,y_train)
# Overwrites y_pred with the logistic-regression predictions for the test split.
y_pred= lr.predict(x_test)

In [60]:
# Print train/test scores and error metrics for the logistic-regression model.
score(x_train,x_test,y_train,y_test,y_pred,lr)

Train score: 0.38
Test score: 0.36
Mean Squared Error: 1.79
Mean Absolute Error: 0.94
R-squared Score: -0.48


In [None]:
from sklearn.tree import DecisionTreeClassifier,plot_tree

In [None]:
# Decision tree limited to depth 4, using entropy as the split criterion.
# NOTE(review): no random_state is passed; consider pinning one so repeated
# fits are reproducible.
dtree= DecisionTreeClassifier(criterion="entropy",max_depth=4)
dtree.fit(x_train,y_train)

In [None]:
# Column labels of df (these include the target column Customer_Rating).
df.columns

In [None]:
# Label the tree nodes with the columns the model was actually trained on.
# The names are taken from the feature frame `x` (df without the target), in
# training order. A hand-typed copy of df.columns would also contain
# 'Customer_Rating', giving 17 names for 16 features and mislabelling the
# Month and Weekend nodes.
col = list(x.columns)
plot_tree(dtree,feature_names=col)
plt.show()

In [None]:
# Overwrites y_pred with the decision-tree predictions for the test split.
y_pred = dtree.predict(x_test)

In [None]:
# Confusion matrix of the true test labels against the decision-tree
# predictions, rendered as a plot.
xx=confusion_matrix(y_test,y_pred)
dis = ConfusionMatrixDisplay(xx)
dis.plot()
plt.show()

In [57]:
# Print train/test scores and error metrics for the decision tree.
score(x_train,x_test,y_train,y_test,y_pred,dtree)

Train score: 0.39
Test score: 0.35
Mean Squared Error: 1.75
Mean Absolute Error: 0.94
R-squared Score: -0.45
