In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import (train_test_split, StratifiedKFold,
                                     cross_val_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, roc_auc_score, classification_report
from sklearn.utils import resample
from joblib import dump

# Load cleaned data
df = pd.read_csv('../data/clean/clean_deposits.csv')

# Create target: top 10% by rate
df['is_recommend'] = (df['rate'] >= df['rate'].quantile(0.9)).astype(int)

# Features
features = ['rate', 'term_months', 'min_amount', 'risk_level', 'goal_accumulation']
X = df[features]
y = df['is_recommend']

# ===== Stratified K-Fold Cross Validation =====
print('Cross-Validation Precision Scores:')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in [('LogisticRegression', LogisticRegression(class_weight='balanced', max_iter=1000)),
                    ('DecisionTree', DecisionTreeClassifier(max_depth=3, class_weight='balanced', random_state=42))]:
    scores = cross_val_score(model, X, y, cv=cv, scoring='precision')
    print(f"{name}: mean={scores.mean():.3f}, std={scores.std():.3f}")

# ===== Bootstrap Evaluation =====n
print('\nBootstrap Evaluation (DecisionTree):')
boot_precisions = []
for i in range(100):
    X_samp, y_samp = resample(X, y, stratify=y, random_state=i)
    model = DecisionTreeClassifier(max_depth=3, class_weight='balanced', random_state=42)
    model.fit(X_samp, y_samp)
    p = precision_score(y_samp, model.predict(X_samp))
    boot_precisions.append(p)
print(f"Bootstrap mean precision={np.mean(boot_precisions):.3f}, std={np.std(boot_precisions):.3f}")

# ===== Threshold Tuning for Logistic Regression =====n
print('\nThreshold Tuning (LogisticRegression):')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)
logreg.fit(X_train, y_train)
proba = logreg.predict_proba(X_test)[:,1]
best_thr, best_prec = 0.5, 0
for thr in np.linspace(0.1, 0.9, 9):
    preds = (proba >= thr).astype(int)
    p = precision_score(y_test, preds)
    if p > best_prec:
        best_prec, best_thr = p, thr
print(f"Best threshold={best_thr:.2f}, precision={best_prec:.3f}")

# ===== Final Model Pipeline and Save =====
import os
# Ensure models directory exists
os.makedirs('models', exist_ok=True)

print('\nTraining final DecisionTree pipeline and saving...')
pipe = Pipeline([
    ('clf', DecisionTreeClassifier(max_depth=3, class_weight='balanced', random_state=42))
])
pipe.fit(X_train, y_train)
dump(pipe, 'models/decision_tree_rec.joblib')
print('Model saved to models/decision_tree_rec.joblib')

Cross-Validation Precision Scores:
LogisticRegression: mean=0.783, std=0.296
DecisionTree: mean=1.000, std=0.000

Bootstrap Evaluation (DecisionTree):
Bootstrap mean precision=1.000, std=0.000

Threshold Tuning (LogisticRegression):
Best threshold=0.20, precision=1.000

Training final DecisionTree pipeline and saving...
Model saved to models/decision_tree_rec.joblib
