# Train Credit Intelligence Models

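This notebook generates synthetic data, trains the payment-priority, spending-pattern, and utilization-predictor models,
and saves evaluation figures (confusion matrices, feature importances, and regression diagnostics) as PNG files. Run cells in order.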

In [None]:
# Setup imports and paths
import sys, os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Add project root so `app` package is importable
sys.path.append('..')
from app.ml.data_generator import CreditDataGenerator
from app.ml.models import CreditIntelligenceModels

# All plots below are saved into a `figures` folder, resolved relative to the
# current working directory; create it up front (no error if it already exists).
figures_dir = 'figures'
os.makedirs(figures_dir, exist_ok=True)
print('Setup complete')

In [None]:
# Generate synthetic data
# A single generator (seed=42) produces all three datasets; the call order is
# kept fixed because later calls presumably consume the same random stream.
gen = CreditDataGenerator(seed=42)

# Inputs for the two classifiers: payment-priority scenarios and spending-pattern samples
payment_data = gen.generate_payment_priority_data(n_scenarios=2000)
pattern_data = gen.generate_spending_pattern_data(n_samples=3000)

# Transaction-level data used for utilization predictor (regression)
txn_data = gen.generate_transaction_data(n_users=200, n_months=6)

# Report the size of each generated dataset, then preview the first one.
for shape_label, generated in (
    ('Payment data shape:', payment_data),
    ('Spending pattern data shape:', pattern_data),
    ('Transaction data shape:', txn_data),
):
    print(shape_label, generated.shape)
payment_data.head()

In [None]:
# Preview the first five rows of the spending-pattern dataset
pattern_data.head(n=5)

In [None]:

# Preview the first five rows of the transaction-level dataset
txn_data.head(n=5)

In [None]:
# Instantiate models manager (use trained_models dir under app/ml)
models = CreditIntelligenceModels(model_dir='../app/ml/trained_models')

# Each classifier goes through the same three steps, in this order:
# train on its dataset, persist scalers/encoders, then print the returned metrics.
training_runs = (
    ('Payment priority metrics:', models.train_payment_priority_model, payment_data),
    ('Spending pattern metrics:', models.train_spending_pattern_model, pattern_data),
)
collected_metrics = []
for metrics_label, train_fn, training_frame in training_runs:
    run_metrics = train_fn(training_frame)
    models.save_scalers_and_encoders()
    print(metrics_label, run_metrics)
    collected_metrics.append(run_metrics)

# Keep the per-model metric variables available under their usual names.
pp_metrics, sp_metrics = collected_metrics

In [None]:
# Evaluate classifiers and the utilization regressor; add extra charts
# Apply seaborn's white-grid theme to every figure drawn from here on.
sns.set_theme(style='whitegrid')

# --- Payment Priority evaluation ---
# Scores the trained payment-priority classifier on a 20% hold-out slice and
# saves a confusion matrix plus a feature-importance chart under figures/.
# NOTE(review): the split below (test_size=0.2, random_state=42) is assumed to
# reproduce the split used inside `train_payment_priority_model`. If it does
# not, some "test" rows were seen during training and the accuracy is
# optimistic — confirm against the training code.
features_pp = ['balance','credit_limit','utilization','interest_rate','minimum_payment','days_until_due','available_funds','total_owed']
X_pp = payment_data[features_pp]
y_pp = payment_data['priority']

# Reuse the scaler stored by training. Only fit a replacement when it is really
# missing, and say so: a freshly fitted scaler may not match what the model saw.
scaler_pp = models.scalers.get('payment_priority')
if scaler_pp is None:
    print("Warning: no 'payment_priority' scaler found; fitting a new one for evaluation only")
    scaler_pp = StandardScaler().fit(X_pp)
X_pp_scaled = scaler_pp.transform(X_pp)

X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(
    X_pp_scaled, y_pp, test_size=0.2, random_state=42
)
y_pred_pp = models.payment_priority_model.predict(X_test_pp)
acc_pp = accuracy_score(y_test_pp, y_pred_pp)

# Pin the label order explicitly so the matrix rows/columns and the heatmap
# tick labels are guaranteed to refer to the same classes.
labels_pp = sorted(set(y_test_pp) | set(y_pred_pp))
cm_pp = confusion_matrix(y_test_pp, y_pred_pp, labels=labels_pp)

plt.figure(figsize=(6,4))
sns.heatmap(cm_pp, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels_pp, yticklabels=labels_pp)
plt.title(f'Payment Priority Confusion Matrix (acc={acc_pp:.3f})')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('figures/payment_priority_confusion.png', dpi=150)
plt.show()
print('\nClassification report for Payment Priority:')
print(classification_report(y_test_pp, y_pred_pp))

# Feature importances for payment priority
# Only estimators exposing `feature_importances_` can be charted. Check for the
# attribute instead of swallowing every exception, so real plotting errors surface.
importances_pp = getattr(models.payment_priority_model, 'feature_importances_', None)
if importances_pp is None:
    print('Could not plot payment priority importances:', 'model has no feature_importances_ attribute')
else:
    plt.figure(figsize=(8,3))
    sns.barplot(x=importances_pp, y=features_pp)
    plt.title('Payment Priority Feature Importances')
    plt.tight_layout()
    plt.savefig('figures/payment_priority_feature_importances.png', dpi=150)
    plt.show()

# --- Spending Pattern evaluation ---
# Scores the trained spending-pattern classifier on a 20% hold-out slice and
# saves a confusion matrix plus a feature-importance chart under figures/.
# NOTE(review): the split below is assumed to reproduce the one used inside
# `train_spending_pattern_model`; confirm, otherwise the accuracy is optimistic.
features_sp = ['credit_limit','monthly_spending','utilization','transaction_frequency','avg_transaction_amount','groceries_pct','dining_pct','shopping_pct']
X_sp = pattern_data[features_sp]
y_sp = pattern_data['spending_pattern']

# Reuse the encoder/scaler stored by training. Only fit replacements when they
# are really missing, and say so: refitted objects may not match the model.
encoder_sp = models.encoders.get('spending_pattern')
if encoder_sp is None:
    print("Warning: no 'spending_pattern' encoder found; fitting a new one for evaluation only")
    encoder_sp = LabelEncoder().fit(y_sp)
y_sp_enc = encoder_sp.transform(y_sp)

scaler_sp = models.scalers.get('spending_pattern')
if scaler_sp is None:
    print("Warning: no 'spending_pattern' scaler found; fitting a new one for evaluation only")
    scaler_sp = StandardScaler().fit(X_sp)
X_sp_scaled = scaler_sp.transform(X_sp)

X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split(
    X_sp_scaled, y_sp_enc, test_size=0.2, random_state=42
)
y_pred_sp = models.spending_pattern_model.predict(X_test_sp)
acc_sp = accuracy_score(y_test_sp, y_pred_sp)

# Encoded ids 0..n-1 map one-to-one onto encoder_sp.classes_. Passing the full
# id list keeps the matrix and the report aligned with the class names even
# when a class is absent from the test slice or from the predictions.
class_ids_sp = list(range(len(encoder_sp.classes_)))
class_names_sp = [str(name) for name in encoder_sp.classes_]
cm_sp = confusion_matrix(y_test_sp, y_pred_sp, labels=class_ids_sp)

plt.figure(figsize=(6,4))
sns.heatmap(cm_sp, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names_sp, yticklabels=class_names_sp)
plt.title(f'Spending Pattern Confusion Matrix (acc={acc_sp:.3f})')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('figures/spending_pattern_confusion.png', dpi=150)
plt.show()
print('\nClassification report for Spending Pattern:')
print(classification_report(y_test_sp, y_pred_sp, labels=class_ids_sp, target_names=class_names_sp))

# Feature importances for spending pattern
# Only estimators exposing `feature_importances_` can be charted. Check for the
# attribute instead of swallowing every exception, so real plotting errors surface.
importances_sp = getattr(models.spending_pattern_model, 'feature_importances_', None)
if importances_sp is None:
    print('Could not plot spending pattern importances:', 'model has no feature_importances_ attribute')
else:
    plt.figure(figsize=(8,3))
    sns.barplot(x=importances_sp, y=features_sp)
    plt.title('Spending Pattern Feature Importances')
    plt.tight_layout()
    plt.savefig('figures/spending_pattern_feature_importances.png', dpi=150)
    plt.show()

# --- Utilization predictor (regression) ---
# Train using transaction-level data and show regression diagnostics
print('Training utilization predictor (may take a moment)...')
up_metrics = models.train_utilization_predictor(txn_data)
models.save_scalers_and_encoders()
print('Utilization predictor metrics:', up_metrics)

# Recreate monthly aggregated dataset (same steps as training function)
# NOTE(review): this mirrors the training pipeline by hand; if
# `train_utilization_predictor` changes its feature engineering, this must be
# updated to match or the metrics below stop describing the trained model.
card_keys = ['user_id', 'card_id']
df = txn_data.copy()
df['month'] = pd.to_datetime(df['transaction_date']).dt.to_period('M')

# One row per (user, card, month): total spend, transaction count, the last
# utilization value recorded in the month, and the card's credit limit.
monthly = df.groupby(card_keys + ['month']).agg({
    'amount': 'sum',
    'transaction_date': 'count',
    'utilization_at_transaction': 'last',
    'credit_limit': 'first'
}).reset_index()
monthly.columns = ['user_id','card_id','month','monthly_spending','transaction_count','utilization','credit_limit']
monthly = monthly.sort_values(card_keys + ['month'])

# Build the per-card grouping once and derive all lag/lead features from it.
# The target is next month's utilization, so each card's last month has no
# label and is dropped; a missing spending trend (first month) becomes 0.
per_card = monthly.groupby(card_keys)
monthly = monthly.assign(
    next_utilization=per_card['utilization'].shift(-1),
    prev_utilization=per_card['utilization'].shift(1),
    spending_trend=per_card['monthly_spending'].pct_change(),
)
monthly = (
    monthly
    .dropna(subset=['next_utilization'])
    .assign(spending_trend=lambda frame: frame['spending_trend'].fillna(0))
)

features_up = ['utilization','monthly_spending','transaction_count','credit_limit','spending_trend']
X_up = monthly[features_up]
y_up = monthly['next_utilization']

# Reuse the scaler stored by training. Only fit a replacement when it is really
# missing, and say so: a freshly fitted scaler may not match what the model saw.
scaler_up = models.scalers.get('utilization_predictor')
if scaler_up is None:
    print("Warning: no 'utilization_predictor' scaler found; fitting a new one for evaluation only")
    scaler_up = StandardScaler().fit(X_up)
X_up_scaled = scaler_up.transform(X_up)

# NOTE(review): assumed to reproduce the split used during training — confirm.
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(
    X_up_scaled, y_up, test_size=0.2, random_state=42
)
y_pred_up = models.utilization_predictor.predict(X_test_up)

# r2_score / mean_absolute_error / mean_squared_error come from the import cell.
r2 = r2_score(y_test_up, y_pred_up)
mae = mean_absolute_error(y_test_up, y_pred_up)
mse = mean_squared_error(y_test_up, y_pred_up)
print(f'Utilization predictor R2: {r2:.3f}, MAE: {mae:.3f}, MSE: {mse:.3f}')

# Scatter plot: true vs predicted
# The dashed identity line marks perfect predictions. Its end points come from
# the data, so it spans the points whatever scale utilization is on and also
# covers values above 100.
diag_lo = min(y_test_up.min(), y_pred_up.min())
diag_hi = max(y_test_up.max(), y_pred_up.max())
plt.figure(figsize=(6,4))
plt.scatter(y_test_up, y_pred_up, alpha=0.5)
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], '--', color='red')
plt.xlabel('True Next Utilization (%)')
plt.ylabel('Predicted Next Utilization (%)')
plt.title('Utilization: True vs Predicted')
plt.tight_layout()
plt.savefig('figures/utilization_true_vs_pred.png', dpi=150)
plt.show()

# Residuals histogram
# Positive residual = the model under-predicted next month's utilization.
residuals = y_test_up - y_pred_up
plt.figure(figsize=(6,4))
sns.histplot(residuals, bins=40, kde=True)
plt.title('Utilization Prediction Residuals')
plt.xlabel('Residual (True - Pred)')
plt.tight_layout()
plt.savefig('figures/utilization_residuals.png', dpi=150)
plt.show()

# Feature importances for utilization predictor (regressor)
# Only estimators exposing `feature_importances_` can be charted. Check for the
# attribute instead of swallowing every exception, so real plotting errors surface.
importances_up = getattr(models.utilization_predictor, 'feature_importances_', None)
if importances_up is None:
    print('Could not plot utilization importances:', 'model has no feature_importances_ attribute')
else:
    plt.figure(figsize=(8,3))
    sns.barplot(x=importances_up, y=features_up)
    plt.title('Utilization Predictor Feature Importances')
    plt.tight_layout()
    plt.savefig('figures/utilization_feature_importances.png', dpi=150)
    plt.show()

How to use:

- Open this notebook from `credit-intelligence-service/notebooks/train_models.ipynb`.
- Run all cells. Figures are saved under `credit-intelligence-service/notebooks/figures/`.
- Use the saved PNG files for screenshots in your appendix.