In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# --- 1. Load Processed Data ---
# (PATH CORRECTION): We're in 'notebooks/', so we go up ('../') 
# and into 'data/'. I'm also using your exact filename.
data = pd.read_csv('../data/lead_data_processed.csv')

# --- 2. Define Features (X) and Target (y) ---
# 'converted' is our target variable (the thing we want to predict)
# All other columns are our features
X = data.drop('converted', axis=1) 
y = data['converted']

# --- 3. Split Data ---
# We use 80% for training and 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


# --- 4. Train Baseline Model: Logistic Regression ---
print("\n--- Training Logistic Regression (Baseline) ---")
log_model = LogisticRegression(max_iter=1000) # Increase max_iter for convergence
log_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_log = log_model.predict(X_test)

# Evaluate the model
print("Logistic Regression - Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log))


# --- 5. Train Challenger Model: XGBoost ---
print("\n--- Training XGBoost Classifier (Challenger) ---")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Classifier - Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))


# --- 6. Save the Best Model ---
# (PATH CORRECTION): We're in 'notebooks/', so we go up ('../')
# and into 'model/' to save the final model file.

# We'll assume XGBoost is better, as it usually is for this type of data.
# If your Logistic Regression report was better, change 'xgb_model' to 'log_model'.
model_to_save = xgb_model 
model_path = '../model/model.pkl'

with open(model_path, 'wb') as file:
    pickle.dump(model_to_save, file)

print(f"\n--- Best model (XGBoost) saved to {model_path} ---")

Training set shape: (800, 16)
Test set shape: (200, 16)

--- Training Logistic Regression (Baseline) ---
Logistic Regression - Test Set Performance:
Accuracy: 0.9100
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85        64
           1       0.92      0.95      0.93       136

    accuracy                           0.91       200
   macro avg       0.90      0.89      0.89       200
weighted avg       0.91      0.91      0.91       200


--- Training XGBoost Classifier (Challenger) ---
XGBoost Classifier - Test Set Performance:
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        64
           1       1.00      1.00      1.00       136

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


--- Best model (XGBoost

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
