In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Clickstream_customer_analysis\data\train_data (1).csv") 

print("Shape of the dataset:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst 5 rows:\n", df.head())
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isnull().sum())
print("\nStatistical Summary:\n", df.describe())


Shape of the dataset: (132379, 14)

Columns: ['year', 'month', 'day', 'order', 'country', 'session_id', 'page1_main_category', 'page2_clothing_model', 'colour', 'location', 'model_photography', 'price', 'price_2', 'page']

First 5 rows:
    year  month  day  order  country  session_id  page1_main_category  \
0  2008      6   22     21       29       15648                    3   
1  2008      5   19      6       29       10018                    2   
2  2008      7   15      2       29       19388                    3   
3  2008      5    2      2       29        7181                    2   
4  2008      6    9     16       29       13493                    2   

  page2_clothing_model  colour  location  model_photography  price  price_2  \
0                  C20      13         1                  2     48        1   
1                  B26      13         3                  1     57        1   
2                  C13       9         5                  1     48        1   
3            

In [2]:
required_columns = ['converted', 'revenue']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    print("Missing columns:", missing_columns)
else:
    print("All required columns are present.")


Missing columns: ['converted', 'revenue']


In [3]:
import numpy as np

np.random.seed(42)

df['converted'] = np.random.choice([0, 1], size=len(df), p=[0.9, 0.1])
df['revenue'] = df['converted'].apply(lambda x: round(np.random.uniform(5, 50), 2) if x == 1 else 0.0)

print(df[['converted', 'revenue']].head())
print("\nTarget Distribution:\n", df['converted'].value_counts())
print("\nRevenue Summary:\n", df['revenue'].describe())


   converted  revenue
0          0     0.00
1          1    22.22
2          0     0.00
3          0     0.00
4          0     0.00

Target Distribution:
 converted
0    119380
1     12999
Name: count, dtype: int64

Revenue Summary:
 count    132379.000000
mean          2.726596
std           9.217200
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          50.000000
Name: revenue, dtype: float64


In [4]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()

le = LabelEncoder()

df_encoded['page2_clothing_model'] = le.fit_transform(df_encoded['page2_clothing_model'])

print("Unique Encoded Values for 'page2_clothing_model':", df_encoded['page2_clothing_model'].nunique())
print(df_encoded[['page2_clothing_model']].head())


Unique Encoded Values for 'page2_clothing_model': 216
   page2_clothing_model
0                    88
1                    60
2                    80
3                    45
4                    66


In [5]:
from sklearn.preprocessing import StandardScaler
df_scaled = df_encoded.copy()

numerical_cols = ['month', 'day', 'order', 'country', 'page1_main_category',
                  'page2_clothing_model', 'colour', 'location', 'model_photography',
                  'price', 'price_2', 'page']

scaler = StandardScaler()

df_scaled[numerical_cols] = scaler.fit_transform(df_scaled[numerical_cols])

print(df_scaled[numerical_cols].head())


      month       day     order   country  page1_main_category  \
0  0.314174  0.848597  0.831323  0.286643             0.523645   
1 -0.438805  0.508811 -0.283182  0.286643            -0.349717   
2  1.067153  0.055762 -0.580383  0.286643             0.523645   
3 -0.438805 -1.416646 -0.580383  0.286643            -0.349717   
4  0.314174 -0.623811  0.459822  0.286643            -0.349717   

   page2_clothing_model    colour  location  model_photography     price  \
0              0.091727  1.597940 -1.319159           1.686940  0.335888   
1             -0.383512  1.597940 -0.152333          -0.592789  1.053629   
2             -0.044055  0.654173  1.014493          -0.592789  0.335888   
3             -0.638105 -0.997418  0.431080          -0.592789 -0.062857   
4             -0.281675  0.654173  1.014493          -0.592789  1.053629   

    price_2      page  
0 -0.977227  0.292634  
1 -0.977227  0.292634  
2 -0.977227 -0.723941  
3  1.023304 -0.723941  
4 -0.977227  0.292634  


 Feature Engineering

In [7]:
session_click_counts = df.groupby('session_id').size().rename("session_length")

df = df.merge(session_click_counts, on='session_id', how='left')

df['num_clicks'] = df['session_length']

print(df[['session_id', 'session_length', 'num_clicks']].head())


   session_id  session_length  num_clicks
0       15648              84          84
1       10018               9           9
2       19388              10          10
3        7181               6           6
4       13493              15          15


In [8]:
category_clicks = df.groupby(['session_id', 'page1_main_category']).size().unstack(fill_value=0)

category_clicks.columns = [f'category_{int(col)}_clicks' for col in category_clicks.columns]

df = df.merge(category_clicks, on='session_id', how='left')

print(df[[col for col in df.columns if 'category_' in col]].head())


   category_1_clicks  category_2_clicks  category_3_clicks  category_4_clicks
0                 35                 11                 36                  2
1                  2                  7                  0                  0
2                  3                  0                  5                  2
3                  0                  5                  1                  0
4                  0                 14                  1                  0


In [9]:
df['is_bounce'] = df['session_length'].apply(lambda x: 1 if x == 1 else 0)

exit_pages = df.groupby('session_id')['page'].max().reset_index()
exit_pages.rename(columns={'page': 'exit_page'}, inplace=True)
df = df.merge(exit_pages, on='session_id', how='left')

repeats = df.groupby(['session_id', 'page2_clothing_model']).size().reset_index(name='count')
repeats_flag = repeats[repeats['count'] > 1]['session_id'].unique()
df['repeated_views'] = df['session_id'].apply(lambda x: 1 if x in repeats_flag else 0)

print(df[['is_bounce', 'exit_page', 'repeated_views']].head())


   is_bounce  exit_page  repeated_views
0          0          4               1
1          0          2               1
2          0          3               1
3          0          2               1
4          0          2               1


In [11]:
drop_cols = ['year', 'session_id', 'converted', 'revenue']

X = df.drop(columns=drop_cols)

y_classification = df['converted']

y_regression = df['revenue']

print(" Final Features Shape:", X.shape)
print(" Classification Target Shape:", y_classification.shape)
print(" Regression Target Shape:", y_regression.shape)


 Final Features Shape: (132379, 21)
 Classification Target Shape: (132379,)
 Regression Target Shape: (132379,)


In [12]:
from sklearn.model_selection import train_test_split

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_classification, test_size=0.2, stratify=y_classification, random_state=42)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_regression, test_size=0.2, random_state=42)

print("Classification Train Shape:", X_train_clf.shape, y_train_clf.shape)
print("Classification Test Shape:", X_test_clf.shape, y_test_clf.shape)

print("Regression Train Shape:", X_train_reg.shape, y_train_reg.shape)
print("Regression Test Shape:", X_test_reg.shape, y_test_reg.shape)


Classification Train Shape: (105903, 21) (105903,)
Classification Test Shape: (26476, 21) (26476,)
Regression Train Shape: (105903, 21) (105903,)
Regression Test Shape: (26476, 21) (26476,)


In [16]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

df['page2_clothing_model'] = LabelEncoder().fit_transform(df['page2_clothing_model'])

drop_cols = ['year', 'session_id', 'converted', 'revenue']
X = df.drop(columns=drop_cols)
y_classification = df['converted']

from sklearn.model_selection import train_test_split

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_classification, test_size=0.2, random_state=42, stratify=y_classification
)

smote = SMOTE(random_state=42)
X_train_clf_bal, y_train_clf_bal = smote.fit_resample(X_train_clf, y_train_clf)

print(" After SMOTE:")
print("Class 0:", sum(y_train_clf_bal == 0))
print("Class 1:", sum(y_train_clf_bal == 1))
print("Balanced X shape:", X_train_clf_bal.shape)


 After SMOTE:
Class 0: 95504
Class 1: 95504
Balanced X shape: (191008, 21)


MODEL TRAINING

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

log_reg.fit(X_train_clf_bal, y_train_clf_bal)

y_pred_logreg = log_reg.predict(X_test_clf)

print(" Confusion Matrix:\n", confusion_matrix(y_test_clf, y_pred_logreg))
print(" Classification Report:\n", classification_report(y_test_clf, y_pred_logreg))
print("ROC-AUC Score:", roc_auc_score(y_test_clf, log_reg.predict_proba(X_test_clf)[:, 1]))


 Confusion Matrix:
 [[23321   555]
 [ 2528    72]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94     23876
           1       0.11      0.03      0.04      2600

    accuracy                           0.88     26476
   macro avg       0.51      0.50      0.49     26476
weighted avg       0.82      0.88      0.85     26476

ROC-AUC Score: 0.4969030535974329


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_clf.fit(X_train_clf_bal, y_train_clf_bal)

y_pred_rf = rf_clf.predict(X_test_clf)
y_proba_rf = rf_clf.predict_proba(X_test_clf)[:, 1]

print("Confusion Matrix:\n", confusion_matrix(y_test_clf, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test_clf, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test_clf, y_proba_rf))


Confusion Matrix:
 [[22006  1870]
 [ 2398   202]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91     23876
           1       0.10      0.08      0.09      2600

    accuracy                           0.84     26476
   macro avg       0.50      0.50      0.50     26476
weighted avg       0.82      0.84      0.83     26476

ROC-AUC Score: 0.4994577029395467


In [19]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

xgb_clf.fit(X_train_clf_bal, y_train_clf_bal)

y_pred_xgb = xgb_clf.predict(X_test_clf)
y_proba_xgb = xgb_clf.predict_proba(X_test_clf)[:, 1]

print("Confusion Matrix:\n", confusion_matrix(y_test_clf, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test_clf, y_pred_xgb))
print("\nROC-AUC Score:", roc_auc_score(y_test_clf, y_proba_xgb))


Confusion Matrix:
 [[20300  3576]
 [ 2217   383]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.88     23876
           1       0.10      0.15      0.12      2600

    accuracy                           0.78     26476
   macro avg       0.50      0.50      0.50     26476
weighted avg       0.82      0.78      0.80     26476


ROC-AUC Score: 0.49852146345863885


In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

class_0 = sum(y_train_clf == 0)
class_1 = sum(y_train_clf == 1)
scale_pos_weight = class_0 / class_1
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

xgb_model_weighted = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

xgb_model_weighted.fit(X_train_clf, y_train_clf)
y_pred_weighted = xgb_model_weighted.predict(X_test_clf)

print("Confusion Matrix:\n", confusion_matrix(y_test_clf, y_pred_weighted))
print("\nClassification Report:\n", classification_report(y_test_clf, y_pred_weighted))
print("ROC-AUC Score:", roc_auc_score(y_test_clf, y_pred_weighted))


scale_pos_weight: 9.18
Confusion Matrix:
 [[14532  9344]
 [ 1581  1019]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.61      0.73     23876
           1       0.10      0.39      0.16      2600

    accuracy                           0.59     26476
   macro avg       0.50      0.50      0.44     26476
weighted avg       0.82      0.59      0.67     26476

ROC-AUC Score: 0.500283870510458


REGRESSION

In [26]:
import numpy as np

np.random.seed(42)

price_weight = 0.6
session_length_weight = 0.2
click_weight = 0.2

df['revenue'] = df['converted'] * (
    (df['price'] * price_weight) +
    (df['session_length'] * session_length_weight) +
    (df['num_clicks'] * click_weight) +
    np.random.normal(loc=0, scale=2, size=len(df))  
)

df['revenue'] = df['revenue'].clip(lower=0).round(2)

print(df[['converted', 'price', 'session_length', 'num_clicks', 'revenue']].head())
print("\nRevenue Summary:\n", df['revenue'].describe())


   converted  price  session_length  num_clicks  revenue
0          0     48              84          84     0.00
1          1     57               9           9    37.52
2          0     48              10          10     0.00
3          0     43               6           6     0.00
4          0     57              15          15     0.00

Revenue Summary:
 count    132379.000000
mean          3.182146
std          10.140807
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         109.180000
Name: revenue, dtype: float64


In [27]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
all_values = pd.concat([X_train_reg['page2_clothing_model'], X_test_reg['page2_clothing_model']])
le.fit(all_values)

X_train_reg['page2_clothing_model'] = le.transform(X_train_reg['page2_clothing_model'])
X_test_reg['page2_clothing_model'] = le.transform(X_test_reg['page2_clothing_model'])


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, df['revenue'], test_size=0.2, random_state=42
)

lr_model = LinearRegression()
lr_model.fit(X_train_reg, y_train_reg)

y_pred_reg = lr_model.predict(X_test_reg)

mae = mean_absolute_error(y_test_reg, y_pred_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


MAE: 5.6989
MSE: 100.0389
RMSE: 10.0019
R² Score: 0.0090


In [29]:
df_converted = df[df['converted'] == 1].copy()

print("Shape of converted dataset:", df_converted.shape)
print("Revenue summary:\n", df_converted['revenue'].describe())


Shape of converted dataset: (12999, 25)
Revenue summary:
 count    12999.000000
mean        32.406290
std         10.010638
min          9.370000
25%         25.320000
50%         31.230000
75%         38.060000
max        109.180000
Name: revenue, dtype: float64
