In [9]:
# scripts/train_model.py
import os

import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from ingest import load_latest_data
from transform import clean_data

# Load and clean one data chunk
df = load_latest_data()
df = clean_data(df)
df = df.fillna(0)

# Separate features and target
X = df.drop(columns=["loan_status"])
y = df["loan_status"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build pipeline with scaling
model = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Train model
model.fit(X_train, y_train)

# Save to file
joblib.dump(model, r"C:\Users\phiri\Documents\Projects_Mumu\Credit_Risk_Pipeline\models\credit_model.pkl")
print("Model trained and saved.")


Loading data from: C:\Users\phiri\Documents\Projects_Mumu\Credit_Risk_Pipeline\data\incoming\credit_data_2025-05-22.csv
Loaded 3259 records.
Model trained and saved.


In [11]:
# Preview the first 25 rows of df (positional slice, same result as head(25)).
df.iloc[:25]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,41,75000,0.0,1.0,4,1,20000,11.14,0,0.27,0,11
1,39,21600,1.0,0.0,3,0,6000,6.03,0,0.28,0,12
2,36,76464,0.0,8.0,3,2,20000,13.22,0,0.26,1,12
3,50,76500,0.0,1.0,4,1,20000,11.99,0,0.26,0,17
4,38,21600,1.0,1.0,1,2,4750,15.23,0,0.22,1,16
5,36,21600,1.0,0.0,5,0,2100,8.49,0,0.1,0,12
6,41,21600,1.0,1.0,2,2,3500,13.61,0,0.16,0,12
7,36,22000,2.0,0.0,1,0,8000,6.99,0,0.36,0,17
8,36,84000,0.0,0.0,1,1,20000,9.64,1,0.24,0,16
9,40,73100,0.0,0.0,3,3,20000,14.54,1,0.23,1,11


In [None]:
# Write df to CSV without the index column, using a comma separator and a
# dot as the decimal mark. `DataFrame.to_` is not a pandas method and raised
# AttributeError; `to_csv` is the writer that matches these arguments.
df.to_csv(r"C:\Users\phiri\Documents\Projects_Mumu\Credit_Risk_Pipeline\saved_data.csv", index=False, sep=",", decimal=".")