In [1]:
import joblib
import numpy as np, pandas as pd

In [2]:
import warnings

# Silence Python warnings for the rest of the session through the supported
# filter API instead of replacing the standard-library `warnings.warn` function.
warnings.filterwarnings("ignore")

# Ignore divide by zero, overflow, and invalid value warnings.
# `np.seterr` returns the previous settings, which is what this cell displays.
np.seterr(divide='ignore', over='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
# Load the serialized bundle of pre-processed data.
# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted source.
data = joblib.load("../data/pre_processed_data.joblib")

# Unpack the four entries used by the rest of the notebook.
X_train_df, X_test_df, y_train, y_test = (
    data[key] for key in ("X_train_df", "X_test_df", "y_train", "y_test")
)

# Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [5]:
# Baseline logistic regression: L-BFGS solver, class weights balanced against
# the label frequencies, fixed seed, and a raised iteration cap.
lr_base = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=2000,
    random_state=2025,
)

In [6]:
# Fit the baseline model on the training split.
lr_base.fit(X_train_df, y_train)

In [7]:
# Column 1 of predict_proba holds the probability of the positive class (label 1).
y_proba = lr_base.predict_proba(X_test_df)[:, 1]
# Hard labels from the estimator's default decision rule.
y_pred = lr_base.predict(X_test_df)

In [8]:
# Baseline test-set metrics: per-class report from the hard labels,
# ROC-AUC from the positive-class probabilities.
lr_base_report = classification_report(y_test, y_pred)
print(lr_base_report)
lr_base_roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", lr_base_roc_auc)

              precision    recall  f1-score   support

           0       0.90      0.58      0.71      4021
           1       0.17      0.57      0.26       591

    accuracy                           0.58      4612
   macro avg       0.53      0.57      0.48      4612
weighted avg       0.81      0.58      0.65      4612

ROC-AUC: 0.6064641175284915


In [9]:
# Candidate decision thresholds to compare.
thresholds = [0.25, 0.30, 0.35, 0.40]

# Positive-class probabilities on the test split (recomputed from the fitted baseline model).
y_proba = lr_base.predict_proba(X_test_df)[:, 1]

# NOTE(review): the thresholds are scored against the test split; choosing one
# from this output makes later test metrics optimistic. Consider comparing them
# on a validation split or out-of-fold training predictions instead.
rule = 18 * '-'
for threshold in thresholds:
    # Predict positive when the probability reaches the candidate threshold.
    y_pred_t = (y_proba >= threshold).astype(int)
    print(f"\n{rule}Threshold = {threshold} {rule}")
    print(classification_report(y_test, y_pred_t))


------------------Threshold = 0.25 ------------------
              precision    recall  f1-score   support

           0       0.92      0.02      0.03      4021
           1       0.13      0.99      0.23       591

    accuracy                           0.14      4612
   macro avg       0.52      0.50      0.13      4612
weighted avg       0.82      0.14      0.06      4612


------------------Threshold = 0.3 ------------------
              precision    recall  f1-score   support

           0       0.93      0.06      0.10      4021
           1       0.13      0.97      0.23       591

    accuracy                           0.17      4612
   macro avg       0.53      0.51      0.17      4612
weighted avg       0.82      0.17      0.12      4612


------------------Threshold = 0.35 ------------------
              precision    recall  f1-score   support

           0       0.93      0.14      0.24      4021
           1       0.14      0.93      0.24       591

    accuracy      

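At thresholds of 0.25 and 0.30 the model labels almost every sample as positive (positive recall ≥ 0.97, negative recall ≤ 0.06).  
At threshold 0.35, positive recall stays high (0.93) while negative recall rises to 0.14, so the model only begins to separate the classes. Note that these thresholds were compared on the test set, so metrics reported at the chosen threshold are optimistic.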

In [10]:
# Operating point used for the final logistic-regression predictions.
FINAL_THRESHOLD = 0.35

# Label a sample positive when its predicted probability reaches the threshold.
meets_threshold = y_proba >= FINAL_THRESHOLD
y_pred_final = meets_threshold.astype(int)

print(classification_report(y_test, y_pred_final))
# ROC-AUC is computed from the probabilities, not the thresholded labels.
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.93      0.14      0.24      4021
           1       0.14      0.93      0.24       591

    accuracy                           0.24      4612
   macro avg       0.53      0.53      0.24      4612
weighted avg       0.83      0.24      0.24      4612

ROC-AUC: 0.6064641175284915


In [11]:
from sklearn.metrics import average_precision_score

# Average precision (area under the precision-recall curve) for the baseline model.
lr_base_pr_auc = average_precision_score(y_test, y_proba)
print("PR-AUC:", lr_base_pr_auc)


PR-AUC: 0.1810564520460643


## Conclusion

I trained a Logistic Regression model on an imbalanced dataset.  
Since the positive class is rare, I focused mainly on **recall** as the evaluation metric.

Recall was prioritized because missing a positive case is more costly than predicting extra false positives. Precision was monitored but was not the main focus, as false negatives are more critical in this problem.  
I also used ROC AUC and PR AUC to assess overall model performance.


### Model Performance

- **ROC-AUC ≈ 0.61**  
  The model performs slightly better than random guessing but remains weak.

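- **PR-AUC ≈ 0.18**  
  This is only modestly above the ≈ 0.13 baseline set by the positive-class prevalence in the test set (591 / 4612), so the model has limited ability to separate positive and negative cases.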

### Recall-Based Evaluation

- Using the default threshold (0.5), recall for the positive class was moderate (~0.57), meaning many positives were missed.
- After testing multiple thresholds, I selected **0.35** as the final threshold.
- At this threshold, recall for the positive class increased to **~0.93**, but precision dropped, resulting in many false positives.

This shows that higher recall can be achieved only by accepting more false positives.


### Final Outcome

- The model is suitable when **recall is the priority** and missing positive cases is costly.
- Achieving high recall results in many false positives due to low precision.
- The model is not suitable when high precision is required.
- Overall, the Logistic Regression model shows limited performance on this imbalanced dataset.

GridSearchCV was not used because the Logistic Regression model showed limited predictive power. Threshold tuning only moves the operating point along the same ROC curve: it improved recall but cannot change ROC-AUC. Since the ranking performance itself is weak (ROC-AUC ≈ 0.61), further hyperparameter tuning was judged unlikely to help.

---


# Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with 300 trees, balanced class weights, a fixed seed,
# and all available CPU cores (n_jobs=-1).
rf_model = RandomForestClassifier(
    random_state=2025,
    n_jobs=-1,
    n_estimators=300,
    class_weight="balanced",
)

In [13]:
# Fit the forest on the training split.
rf_model.fit(X_train_df, y_train)

In [14]:
# Positive-class probabilities (column 1) for the test split.
y_proba_rf = rf_model.predict_proba(X_test_df)[:, 1]

In [15]:
# Threshold-independent ranking metrics for the Random Forest.
rf_roc_auc = roc_auc_score(y_test, y_proba_rf)
print("RF ROC-AUC:", rf_roc_auc)
rf_pr_auc = average_precision_score(y_test, y_proba_rf)
print("RF PR-AUC:", rf_pr_auc)


RF ROC-AUC: 0.5941621630265136
RF PR-AUC: 0.1799262344977539


In [16]:

# Importances reported by the fitted forest, one value per training column.
# NOTE(review): impurity-based importances can favor continuous / high-cardinality
# features — consider permutation importance to confirm this ranking.
importances = rf_model.feature_importances_

# Pair each column name with its importance and rank from most to least important.
feature_importance_df = pd.DataFrame(
    dict(feature=X_train_df.columns, importance=importances)
).sort_values(by="importance", ascending=False)

feature_importance_df.head()

Unnamed: 0,feature,importance
4,prem_final_log,0.152462
0,polholder_age,0.127824
3,vehicl_agepurchase,0.098671
2,vehicl_age,0.090391
1,policy_age_log,0.065089


In [17]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Keep the 15 highest-ranked features from the Random Forest importance table.
top_15_features = feature_importance_df["feature"].head(15).tolist()

# Restrict both splits to the selected columns.
X_train_top15 = X_train_df.loc[:, top_15_features]
X_test_top15 = X_test_df.loc[:, top_15_features]

# Same logistic-regression configuration as the baseline, refit on the reduced feature set.
lr_top15 = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=2000,
    random_state=2025,
).fit(X_train_top15, y_train)

# Positive-class probabilities on the reduced test split.
y_proba_top15 = lr_top15.predict_proba(X_test_top15)[:, 1]

# Threshold-independent ranking metrics.
print("Top-15 LR ROC-AUC:", roc_auc_score(y_test, y_proba_top15))
print("Top-15 LR PR-AUC:", average_precision_score(y_test, y_proba_top15))


Top-15 LR ROC-AUC: 0.6037575150089778
Top-15 LR PR-AUC: 0.18437957922715598


In [18]:
# Apply the same decision threshold chosen for the full-feature model
# (FINAL_THRESHOLD, defined once in the earlier final-threshold cell) so the
# two logistic-regression variants are compared at the same operating point.
y_pred_top15_final = (y_proba_top15 >= FINAL_THRESHOLD).astype(int)

# Final evaluation; `classification_report` and `roc_auc_score` are already
# imported earlier in the notebook, so they are not re-imported here.
print(classification_report(y_test, y_pred_top15_final))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_top15))


              precision    recall  f1-score   support

           0       0.93      0.07      0.14      4021
           1       0.13      0.96      0.23       591

    accuracy                           0.19      4612
   macro avg       0.53      0.52      0.18      4612
weighted avg       0.83      0.19      0.15      4612

ROC-AUC: 0.6037575150089778


### Findings

- Using all features, the model achieved high recall for the positive class (0.93) with low precision (0.14).
- After selecting the top 15 features, recall increased slightly (0.96), but precision and accuracy decreased.
- ROC-AUC remained almost the same in both cases, showing that overall model strength did not change.
- Feature selection simplified the model but did not significantly improve performance.

**Conclusion:**  
The model is suitable for high-recall use cases, but precision remains low.

In [19]:
# Random Forest with the same settings as the full-feature forest,
# refit on the top-15 feature subset.
rf_top15 = RandomForestClassifier(
    random_state=2025,
    n_jobs=-1,
    n_estimators=300,
    class_weight="balanced",
).fit(X_train_top15, y_train)

# Positive-class probabilities on the reduced test split.
y_proba_rf_top15 = rf_top15.predict_proba(X_test_top15)[:, 1]

# Threshold-independent ranking metrics.
rf_top15_roc_auc = roc_auc_score(y_test, y_proba_rf_top15)
print("RF Top-15 ROC-AUC:", rf_top15_roc_auc)
rf_top15_pr_auc = average_precision_score(y_test, y_proba_rf_top15)
print("RF Top-15 PR-AUC:", rf_top15_pr_auc)

RF Top-15 ROC-AUC: 0.5893919444069229
RF Top-15 PR-AUC: 0.17559511547366247


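Although tuning could slightly improve Random Forest performance, it is unlikely to lead to meaningful improvement: Logistic Regression and Random Forest both plateau at ROC-AUC ≈ 0.59–0.61 and PR-AUC ≈ 0.18, with all features or only the top 15, which suggests the main limitation comes from the data, not the model settings.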