In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [2]:
df = pd.read_csv("shop_smart_ecommerce.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [12]:
X = df.drop(columns = ['Revenue'])
y = df['Revenue'].astype(int)

y

0        0
1        0
2        0
3        0
4        0
        ..
12325    0
12326    0
12327    0
12328    0
12329    0
Name: Revenue, Length: 12330, dtype: int64

In [16]:
num_feature = X.select_dtypes(include = ['int64','float64']).columns
cat_feature = X.select_dtypes(include = ['object','category']).columns

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y , test_size=0.2, random_state=42 , stratify = y
)


In [22]:
preprocessor = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), num_feature),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_feature)
    ]
)



In [28]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    max_depth=6,              # prevents deep overfitting
    min_samples_leaf=30,      # smooths decision boundaries
    class_weight="balanced",  # handles imbalance
    random_state=42
)

In [29]:
pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", dt)
    ]
)

In [30]:
pipe.fit(X_train, y_train)

In [33]:
y_pred = pipe.predict(X_test)


#   f1_score, classification_report, confusion_matrix

print("f1_score = ",f1_score(y_pred,y_test))
print("confusion_matrix = \n",confusion_matrix(y_pred,y_test))
print("classification_report = ",classification_report(y_pred,y_test))


f1_score =  0.6278381046396841
confusion_matrix = 
 [[1771   64]
 [ 313  318]]
classification_report =                precision    recall  f1-score   support

           0       0.85      0.97      0.90      1835
           1       0.83      0.50      0.63       631

    accuracy                           0.85      2466
   macro avg       0.84      0.73      0.77      2466
weighted avg       0.85      0.85      0.83      2466



In [37]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "model__max_depth": [4, 6, 8],
    "model__min_samples_leaf": [20, 30, 50]
}

gride = GridSearchCV(
    pipe,
    param_grid,
    scoring = "f1",
    cv = 5,
    n_jobs=-1
)

gride.fit(X_train, y_train)

print("Best f1 : ",gride.best_score_)
print("Best param : ",gride.best_params_)

Best f1 :  0.6343735129725652
Best param :  {'model__max_depth': 4, 'model__min_samples_leaf': 50}
