In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the loan-approval dataset; the path is relative to the working directory.
LoanDf = pd.read_csv("Loan_Approval.csv")

# Preview the first five rows (the default for head) as a quick check on the load.
LoanDf.head()


Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22,female,Master,71948,0,RENT,35000,PERSONAL,16.02,0.49,3,561,No,1
1,21,female,High School,12282,0,OWN,1000,EDUCATION,11.14,0.08,2,504,Yes,0
2,25,female,High School,12438,3,MORTGAGE,5500,MEDICAL,12.87,0.44,3,635,No,1
3,23,female,Bachelor,79753,0,RENT,35000,MEDICAL,15.23,0.44,2,675,No,1
4,24,male,Master,66135,1,RENT,35000,MEDICAL,14.27,0.53,4,586,No,1


In [69]:
# Count missing values per column (isna performs the same check as isnull).
LoanDf.isna().sum()

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

## Preprocessing

In [70]:
# Per-column missing-value counts; every count in the output below is zero.
LoanDf.isna().sum()

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

In [71]:
# Row-by-row mask that is True wherever the target column is missing.
LoanDf["loan_status"].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
44995    False
44996    False
44997    False
44998    False
44999    False
Name: loan_status, Length: 45000, dtype: bool

In [72]:
# Print each column's dtype and non-null count to see which columns are
# numeric and which are object (string) typed.
LoanDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  int64  
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  int64  
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  int64  
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  int64  
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [73]:
# (rows, columns) of the loaded frame: 45000 rows and 14 columns.
LoanDf.shape

(45000, 14)

In [74]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the output below shows a maximum person_age of 144 and a maximum
# person_emp_exp of 125, which look like data-entry outliers; confirm whether
# they should be filtered or capped before modelling.
LoanDf.describe()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
count,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80319.05,5.410333,9583.157556,11.006606,0.139725,5.867489,632.608756,0.222222
std,6.045108,80422.5,6.063532,6314.886691,2.978808,0.087212,3.879702,50.435865,0.415744
min,20.0,8000.0,0.0,500.0,5.42,0.0,2.0,390.0,0.0
25%,24.0,47204.0,1.0,5000.0,8.59,0.07,3.0,601.0,0.0
50%,26.0,67048.0,4.0,8000.0,11.01,0.12,4.0,640.0,0.0
75%,30.0,95789.25,8.0,12237.25,12.99,0.19,8.0,670.0,0.0
max,144.0,7200766.0,125.0,35000.0,20.0,0.66,30.0,850.0,1.0


In [75]:
import statsmodels.api as sm

In [76]:
import warnings
# Install a blanket "ignore" filter with no category or module restriction.
# NOTE(review): this hides every warning for the rest of the session, which
# presumably includes solver-convergence and deprecation warnings from the
# modelling libraries; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [77]:
from functools import partial

In [87]:
from pathlib import Path

from dotenv import load_dotenv

# Registry settings are kept in a dotenv file outside the notebook so that
# secrets never have to be typed into a cell.
env_path = Path("../../.env-live")

# load_dotenv returns True only when it found at least one variable, so the
# success message now reflects what actually happened, and a missing or empty
# file is reported instead of being skipped silently.
if env_path.exists() and load_dotenv(dotenv_path=env_path):
    print('envs Loaded')
else:
    print(f'envs NOT loaded: {env_path} is missing or empty')

# Kept after the dotenv load on purpose.
# NOTE(review): presumably the registry package may read its settings from the
# environment at import time; confirm before moving this import to the top.
from jrjModelRegistry.jrjModelRegistry import registerAJrjModel

In [79]:
def generalRegressionPredictor(self, transformedData):
    """Return the predictions of a fitted model for already-transformed data.

    Written with an explicit ``self`` so it can be bound to a model object
    with ``functools.partial`` and used as that model's prediction entry point.

    Args:
        self: Any object exposing a ``predict`` method.
        transformedData: Input forwarded unchanged to ``self.predict``.

    Returns:
        Whatever ``self.predict(transformedData)`` returns.
    """
    predictions = self.predict(transformedData)
    return predictions

In [81]:
# loan_status is already encoded as 1/0, so the target needs no mapping.
# Collect every object-typed column (excluding the target) for encoding.
object_columns = LoanDf.select_dtypes(include=['object']).columns
categorical_cols = [name for name in object_columns if name != 'loan_status']

# One-hot encode the categorical columns, dropping the first level of each
# so the resulting dummy columns are not perfectly collinear.
LoanDf_encoded = pd.get_dummies(LoanDf, columns=categorical_cols, drop_first=True)

# Split the encoded frame into the target vector and the feature matrix.
y = LoanDf_encoded['loan_status']
X = LoanDf_encoded.drop(columns='loan_status')

In [89]:
# Keep the first five encoded feature rows as a list of {column: value} records.
sample_data = X.iloc[:5].to_dict(orient='records')

In [82]:
# Sum the per-column missing counts into one total for the whole feature matrix.
print("Total NaNs in features:", X.isna().sum().sum())


Total NaNs in features: 0


In [83]:
# Confirm the target vector has no missing labels before splitting.
print("NaNs in y before split:", y.isna().sum())

NaNs in y before split: 0


In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8833333333333333
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      6990
           1       0.75      0.73      0.74      2010

    accuracy                           0.88      9000
   macro avg       0.83      0.83      0.83      9000
weighted avg       0.88      0.88      0.88      9000



In [90]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# The original cell raised NameError because `dt`, `score` and `report` were
# never created. Train the decision tree on the same split used for the
# logistic regression and compute the values the metadata refers to.
# NOTE(review): default hyperparameters with a fixed seed are an assumption;
# adjust if a specific tree configuration was intended.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

dt_pred = dt.predict(X_test)
score = accuracy_score(y_test, dt_pred)
# output_dict=True yields a nested dict rather than a formatted text table.
report = classification_report(y_test, dt_pred, output_dict=True)

# Metadata record handed to the model registry together with the fitted tree.
dt_metadata = {
    "modelName": "LoanApprovalBinaryCalssifiactionModelDt",
    "version": "1.0.1",
    "params": dt.get_params(),  # All model hyperparameters
    "score": float(score),         # Accuracy
    "modelLibrary": "sklearn.tree.DecisionTreeClassifier",
    "libraryMetadata": {
        "feature_importances": dt.feature_importances_.tolist(),
        "n_features": int(dt.n_features_in_),
        "n_classes": int(dt.n_classes_),
        "classes": dt.classes_.tolist(),
        "depth": int(dt.get_depth()),
        "n_leaves": int(dt.get_n_leaves()),
        "classification_report": report
    },
    "sampleData": {
        # NOTE(review): the original referenced an undefined name here;
        # `sample_data` (first five encoded feature rows) is used instead.
        # Confirm this matches the format the registry expects.
        "dataForTransfer": sample_data
    }
}

In [None]:
# NOTE(review): `model1Fit`, `evaluationBinaryCalssifiactionTransformer`,
# `evaluationBinaryCalssifiactionSampleData` and the metric variables used
# below are not defined in the cells visible here; they must exist in the
# kernel before this cell can run.

# Attach the input transformer and a prediction entry point to the fitted
# model; partial() binds model1Fit as the `self` argument of the predictor.
model1Fit.transformer = evaluationBinaryCalssifiactionTransformer
model1Fit.mainPredictor = partial(generalRegressionPredictor, model1Fit)

# Registry record: identity, fitted parameters, evaluation metrics and
# statsmodels fit statistics.
model1_metadata = {
    "modelName": "LoanApprovalBinaryCalssificationModel",
    "version": "1.0.1",
    "params": model1Fit.params.to_dict(),
    "score": accuracy,
    "otherEvaluationMetrics": {
        "accuracy": accuracy,
        "recall": recall,
        "precision": precision,
        "sensitivity": sensitivity,
        "specificity": specificity,
        "f1Score": f1Score,
        "roc_auc": roc_auc,
    },
    "modelLibrary": "statsmodels.api.Logit",
    "libraryMetadata": {
        "pvalues": model1Fit.pvalues.to_dict(),
        "pseudo_r_squared": float(model1Fit.prsquared),
        "llf": float(model1Fit.llf),
        "aic": float(model1Fit.aic),
        "bic": float(model1Fit.bic),
    },
    "sampleData": {
        "dataForTransfer": evaluationBinaryCalssifiactionSampleData,
    },
}

registerAJrjModel(model1Fit, model1_metadata)

In [None]:
# Register the decision-tree model together with its metadata record.
# NOTE(review): unlike the model1Fit registration, no `transformer` or
# `mainPredictor` attribute is attached to `dt` first; confirm whether the
# registry requires them.
registerAJrjModel(dt, dt_metadata)

In [None]:
import os

# SECURITY: this cell previously embedded live S3 and MongoDB credentials as
# string literals. Secrets must never be stored in a notebook; the values are
# now read from the process environment (populated from the dotenv file loaded
# earlier). The credentials that were committed here should be treated as
# compromised and rotated.
# A setting that is absent from the environment resolves to None.
JRJ_MODEL_REGISTRY_S3_ENDPOINT = os.environ.get("JRJ_MODEL_REGISTRY_S3_ENDPOINT")
JRJ_MODEL_REGISTRY_S3_REGION = os.environ.get("JRJ_MODEL_REGISTRY_S3_REGION")
JRJ_MODEL_REGISTRY_S3_KEY_ID = os.environ.get("JRJ_MODEL_REGISTRY_S3_KEY_ID")
JRJ_MODEL_REGISTRY_S3_KEY_SECRET = os.environ.get("JRJ_MODEL_REGISTRY_S3_KEY_SECRET")
JRJ_MODEL_REGISTRY_S3_BUCKET_NAME = os.environ.get("JRJ_MODEL_REGISTRY_S3_BUCKET_NAME")
JRJ_MODEL_REGISTRY_S3_ZIP_PASSWORD = os.environ.get("JRJ_MODEL_REGISTRY_S3_ZIP_PASSWORD")
JRJ_MONGODB_MODEL_REGISTRY = os.environ.get("JRJ_MONGODB_MODEL_REGISTRY")
