### Model Training and Testing 

In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so the
# `src` package can be imported from this notebook.
system_path = os.path.abspath(os.pardir)
if sys.path.count(system_path) == 0:
    sys.path.append(system_path)

%load_ext autoreload
%autoreload 2

In [5]:
from src.data_processing import load_data
from src.train import split_data, train_models

In [52]:
# Load processed data
data = load_data("../data/processed/final_data.csv")
# Drop the identifier column and the leftover CSV index column so neither is
# used as a feature. errors="ignore" handles each column independently:
# whichever of the two is present gets dropped, and a missing one does not
# raise KeyError.
data = data.drop(columns=["CustomerId", "Unnamed: 0"], errors="ignore")
data

Successfully loaded data from ../data/processed/final_data.csv. Shape: (3742, 26)


Unnamed: 0,mode_ProviderId_ProviderId_6,mode_PricingStrategy_2,avg_transaction_dayofweek,mode_ProviderId_ProviderId_5,unique_PricingStrategy_count,avg_transaction_amount,avg_transaction_day,unique_ProviderId_count,mode_ChannelId_ChannelId_2,mode_ChannelId_ChannelId_3,transaction_count,total_transaction_amount,avg_value,min_transaction_year,unique_ChannelId_count,std_transaction_amount,max_transaction_year,avg_transaction_hour,avg_transaction_month,mode_ProviderId_ProviderId_1,mode_PricingStrategy_4,unique_ProductCategory_count,total_value,is_high_risk
0,0.0,1.0,-0.529395,0.0,4.184641,-0.088217,-0.273141,2.167925,0.0,1.0,0.964082,-0.022752,-0.101792,-0.829377,2.304899,-0.109030,0.703142,-0.228171,0.602668,0.0,0.0,0.924070,-0.018139,0
1,1.0,1.0,0.635377,0.0,-0.756642,-0.090744,0.554422,-1.382737,0.0,1.0,-0.243141,-0.062842,-0.109037,-0.829377,-1.404749,-0.140432,-1.422187,-1.941755,1.038381,0.0,0.0,-1.153977,-0.092838,1
2,0.0,1.0,-0.535687,0.0,0.890452,-0.057828,0.163884,1.280259,0.0,1.0,0.128312,0.020976,-0.066973,-0.829377,2.304899,0.036788,0.703142,0.308814,0.310282,0.0,0.0,3.002117,0.012344,0
3,1.0,1.0,-0.094138,0.0,-0.756642,-0.081798,-0.093946,-1.382737,0.0,1.0,-0.253459,-0.062474,-0.100078,-0.829377,-1.404749,-0.140432,-1.422187,-2.455398,1.038381,0.0,0.0,-1.153977,-0.092470,1
4,0.0,1.0,-0.119294,1.0,0.890452,-0.074518,-0.247965,0.392594,0.0,1.0,0.035449,-0.028833,-0.085291,-0.829377,0.450075,-0.083724,0.703142,-0.967603,0.647742,0.0,0.0,-0.114953,-0.045407,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3737,0.0,1.0,-1.553168,0.0,-0.756642,-0.092265,-0.526192,0.392594,1.0,0.0,-0.222505,-0.062849,-0.107514,1.205724,0.450075,-0.130531,0.703142,1.910570,-0.922326,0.0,0.0,-0.114953,-0.092094,0
3738,0.0,1.0,-1.553168,0.0,-0.756642,-0.019177,-0.526192,-1.382737,0.0,1.0,-0.243141,-0.054008,-0.037365,1.205724,-1.404749,-0.103939,0.703142,2.167392,-0.922326,1.0,0.0,-1.153977,-0.084000,0
3739,0.0,1.0,-0.823653,0.0,-0.756642,-0.084082,-0.382110,0.392594,1.0,0.0,-0.222505,-0.060829,-0.098985,1.205724,0.450075,-0.105116,0.703142,-1.428111,-0.922326,0.0,0.0,0.924070,-0.089991,0
3740,0.0,1.0,-0.823653,1.0,-0.756642,-0.063906,-0.382110,-1.382737,0.0,1.0,-0.253459,-0.061370,-0.082160,1.205724,-1.404749,-0.140432,0.703142,-1.428111,-0.922326,0.0,0.0,-1.153977,-0.091365,0


In [53]:
def main():
    """Run the training workflow on the notebook-level ``data`` frame.

    Separates the ``is_high_risk`` column (target) from the remaining columns
    (features), splits both with ``split_data`` and hands the four resulting
    partitions to ``train_models``.

    Returns:
        Whatever ``train_models`` returns.
    """
    target_column = "is_high_risk"
    features = data.drop(target_column, axis=1)
    target = data[target_column]

    # Split data
    partitions = split_data(features, target)
    X_train, X_test, y_train, y_test = partitions

    # Train models
    return train_models(X_train, y_train, X_test, y_test)


# Runs the workflow when this code executes as the top-level module.
if __name__ == "__main__":
    main()

INFO:src.train:Data split into train (2993) and test (749) sets
INFO:src.train:
Evaluation metrics for logistic_regression:
INFO:src.train:accuracy: 0.9426
INFO:src.train:precision: 0.9424
INFO:src.train:recall: 0.9066
INFO:src.train:f1: 0.9242
INFO:src.train:roc_auc: 0.9785
INFO:src.train:
Classification Report:
INFO:src.train:              precision    recall  f1-score   support

           0       0.94      0.97      0.95       460
           1       0.94      0.91      0.92       289

    accuracy                           0.94       749
   macro avg       0.94      0.94      0.94       749
weighted avg       0.94      0.94      0.94       749

INFO:src.train:
Evaluation metrics for random_forest:
INFO:src.train:accuracy: 0.9786
INFO:src.train:precision: 0.9892
INFO:src.train:recall: 0.9550
INFO:src.train:f1: 0.9718
INFO:src.train:roc_auc: 0.9968
INFO:src.train:
Classification Report:
INFO:src.train:              precision    recall  f1-score   support

           0       0.97     

### Most Important Features for Prediction

In [55]:
import joblib
import numpy as np

def _extract_importances(estimator):
    """Return the per-feature importance array of ``estimator``, or None.

    ``feature_importances_`` is used when the estimator has it; otherwise
    ``coef_`` is used. A 1-D ``coef_`` is returned unchanged (signed values);
    a multi-dimensional ``coef_`` is reduced to one value per column with the
    L2 norm along axis 0. Returns None when neither attribute exists.
    """
    if hasattr(estimator, "feature_importances_"):
        return estimator.feature_importances_
    if hasattr(estimator, "coef_"):
        coef = estimator.coef_
        return coef if coef.ndim == 1 else np.linalg.norm(coef, axis=0)
    return None


# Load the trained best model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
best_model = joblib.load("../model/best_model.pkl")

# Feature names: every column of `data` except the target
if hasattr(data, "columns"):
    feature_names = data.drop("is_high_risk", axis=1).columns
else:
    feature_names = [f"feature_{i}" for i in range(data.shape[1] - 1)]

# Try the model itself first; if it exposes nothing and has `steps`
# (pipeline-like), try its last step instead.
importances = _extract_importances(best_model)
if importances is None and hasattr(best_model, "steps"):
    importances = _extract_importances(best_model.steps[-1][1])

if importances is not None:
    # The names come from `data`, the importances from the saved model. If the
    # two disagree in length the names cannot be trusted, so fall back to
    # positional names instead of printing wrong labels or hitting IndexError.
    if len(feature_names) != len(importances):
        print(
            f"Warning: model reports {len(importances)} features but data has "
            f"{len(feature_names)} feature columns; using positional feature names."
        )
        feature_names = [f"feature_{i}" for i in range(len(importances))]

    # Get top 10 features (ranked by absolute value, printed with their sign)
    indices = np.argsort(np.abs(importances))[::-1][:10]
    print("------  Top 10 important features of the best model -------")
    print("------------------------------------------------------------")
    for rank, idx in enumerate(indices, 1):
        print(f"{rank}. {feature_names[idx]}: {importances[idx]:.4f}")
else:
    print("Best model does not provide feature importances or coefficients.")


------  Top 10 important features of the best model -------
------------------------------------------------------------
1. max_transaction_year: 0.7735
2. avg_transaction_month: 0.1468
3. avg_transaction_day: 0.0532
4. total_value: 0.0067
5. transaction_count: 0.0066
6. avg_transaction_dayofweek: 0.0030
7. total_transaction_amount: 0.0021
8. avg_transaction_amount: 0.0020
9. std_transaction_amount: 0.0017
10. avg_transaction_hour: 0.0015


### Predict Using Sample Data

In [47]:
data_test = [0.0,-0.823653,0.0,-0.756642,	-0.081798,	-0.382110,	-1.382737,	0.0,	1.0,	-0.253459,	-0.062474,	-0.100078,	1.205724,	-1.404749,	-0.140432,	0.703142,	-0.914468,	-0.922326,	1.0,	0.0,	-1.153977,	-0.092470,	0]

In [48]:
# Show how many values the sample vector holds.
len(data_test)

23

In [49]:
import joblib
import numpy as np

# Restore the persisted model from its pickle file
model = joblib.load("../model/best_model.pkl")

# Nest the flat sample in an outer list so the array is 2-D with a single row
data_array = np.array([data_test])

print(model)
# Binary label (0 or 1) predicted for the sample
predicted_high_risk = model.predict(data_array)
# Probability of the positive class: column 1 of predict_proba's output
risk_probabilities = model.predict_proba(data_array)[:, 1]

print(f"Credit risk for data is {predicted_high_risk}")
print(f"Credit risk probablity for data is {risk_probabilities}")

GradientBoostingClassifier(subsample=0.8)
Credit risk for data is [1]
Credit risk probablity for data is [0.98613075]


In [56]:
# Display the loaded model as the cell's output.
model