# PERSON B — MODELLING STEP (Logistic Regression)

## 1. Load the data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load preprocessed data
X = pd.read_csv("../data/features.csv")
y = pd.read_csv("../data/target.csv").squeeze()

# Drop leakage column
if 'days_since_last_purchase' in X.columns:
    X = X.drop(columns=['days_since_last_purchase'])

# Validation Check
print(f"Features Shape: {X.shape}")
print(f"Target Shape: {y.shape}")
display(X.head())

Features Shape: (1200, 8)
Target Shape: (1200,)


Unnamed: 0,age,income,total_orders,avg_order_value,estimated_spend,spend_ratio,sentiment_score,risk_score
0,58,74592,22,133.88,2945.36,0.039486,1,1
1,61,131482,42,161.12,6767.04,0.051467,2,0
2,50,138907,50,219.59,10979.5,0.079042,2,0
3,44,64446,22,258.07,5677.54,0.088098,0,2
4,62,115392,32,204.42,6541.44,0.056689,0,2


## 2. Train/Test split

In [3]:
# Standard split for training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train Size: {X_train.shape[0]}")
print(f"Test Size: {X_test.shape[0]}")

Train Size: 960
Test Size: 240


## 3. Train logistic regression

In [4]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    penalty="l2",
    C=1.0,
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)

model.fit(X_train, y_train)

## 4. Evaluation

In [5]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC:  {roc_auc_score(y_test, y_prob):.4f}")

Accuracy: 0.9417
F1-Score: 0.9176
ROC-AUC:  0.9650
