### Rahil Shaikh
### mohdrahilshaikh360@apsit.edu.in

## Model Selection 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE



In [2]:
train_data = pd.read_excel('train_data.xlsx')

In [3]:
train_data.head()

Unnamed: 0,customer_id,transaction_date,sub_grade,term,home_ownership,cibil_score,total_no_of_acc,annual_inc,int_rate,purpose,loan_amnt,application_type,installment,verification_status,account_bal,emp_length,loan_status
0,10608026,2014-01-01,C5,36 months,MORTGAGE,665,9,70000.0,16.24,debt_consolidation,7200,Individual,253.99,Verified,4648,11,0
1,10235120,2014-01-01,E5,36 months,MORTGAGE,660,8,65000.0,23.4,home_improvement,6000,Individual,233.52,Source Verified,14051,11,1
2,10705805,2014-01-01,D2,36 months,MORTGAGE,660,7,73000.0,17.57,other,8000,Individual,287.5,Verified,14885,11,0
3,11044991,2014-01-01,B4,36 months,MORTGAGE,690,5,118000.0,12.85,debt_consolidation,10000,Individual,336.22,Source Verified,7542,2,1
4,10161054,2014-01-01,C3,60 months,MORTGAGE,665,5,63000.0,14.98,debt_consolidation,10000,Individual,237.8,Verified,6318,11,0


In [4]:
train_data.isnull().sum()

customer_id            0
transaction_date       0
sub_grade              0
term                   0
home_ownership         0
cibil_score            0
total_no_of_acc        0
annual_inc             0
int_rate               0
purpose                0
loan_amnt              0
application_type       0
installment            0
verification_status    0
account_bal            0
emp_length             0
loan_status            0
dtype: int64

### There are not any missing Values


## As Mention In My EDA We have to normalize the Income and Loan Amount to Handle Outlier as there may have been some people with higher loan and income

In [5]:
# 2. Encode categorical variables
categorical_columns = ['purpose', 'home_ownership', 'verification_status', 'application_type']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le


### I am selecting the features 
- income and loan amounts 
- interest 
- debt_to_income_ratio 

to predict the target output

In [6]:

# Normalize the selected features 
scaler = StandardScaler()
numeric_columns = ['annual_inc', 'loan_amnt', 'account_bal', 'cibil_score']
train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns])


### I will be using income to debt ratio as a feature

In [7]:
# Create derived features
train_data['debt_to_income_ratio'] = train_data['installment'] / (train_data['annual_inc'] + 1e-6)


In [8]:
# Prepare data for modeling
X = train_data.drop(['loan_status', 'customer_id', 'transaction_date','term','sub_grade','home_ownership','total_no_of_acc','purpose','application_type','verification_status','installment',"account_bal","emp_length"], axis=1)
y = train_data['loan_status']

In [9]:
X

Unnamed: 0,cibil_score,annual_inc,int_rate,loan_amnt,debt_to_income_ratio
0,-0.848174,-0.022944,16.24,-0.875364,-11070.706527
1,-1.066790,-0.105580,23.40,-1.007359,-2211.801145
2,-1.066790,0.026638,17.57,-0.787367,10792.276454
3,0.244907,0.770368,12.85,-0.567375,436.440387
4,-0.848174,-0.138635,14.98,-0.567375,-1715.311090
...,...,...,...,...,...
113700,-0.629558,-0.303908,16.29,-0.567375,-1161.572795
113701,-1.066790,-0.518763,17.27,-0.894613,-484.634547
113702,0.244907,-0.766673,16.29,-0.567375,-460.444757
113703,-0.192326,0.142330,15.31,-0.278636,3088.372307


In [10]:
# Handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [11]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 

In [14]:
import joblib
models={
    "Logisitic Regression":LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
    "Adaboost":AdaBoostClassifier(),
    "Xgboost":XGBClassifier()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')
    
    if list(models.keys())[i] == "Random Forest":
        joblib.dump(model, 'random_forest_model.pkl')
        print("Random Forest model saved!")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logisitic Regression
Model performance for Training set
- Accuracy: 0.6365
- F1 score: 0.6348
- Precision: 0.6596
- Recall: 0.5684
- Roc Auc Score: 0.6367
----------------------------------
Model performance for Test set
- Accuracy: 0.6361
- F1 score: 0.6345
- Precision: 0.6498
- Recall: 0.5719
- Roc Auc Score: 0.6353


Decision Tree
Model performance for Training set
- Accuracy: 0.9956
- F1 score: 0.9956
- Precision: 0.9990
- Recall: 0.9923
- Roc Auc Score: 0.9956
----------------------------------
Model performance for Test set
- Accuracy: 0.7462
- F1 score: 0.7462
- Precision: 0.7465
- Recall: 0.7367
- Roc Auc Score: 0.7461


Random Forest
Model performance for Training set
- Accuracy: 0.9956
- F1 score: 0.9956
- Precision: 0.9939
- Recall: 0.9973
- Roc Auc Score: 0.9956
----------------------------------
Model performance for Test set
- Accuracy: 0.8024
- F1 score: 0.8022
- Precision: 0.7796
- Recall: 0.8369
- Roc Auc Score: 0.8028


Random Forest model saved!
Gradient Boost
Model 



Adaboost
Model performance for Training set
- Accuracy: 0.7253
- F1 score: 0.7253
- Precision: 0.7297
- Recall: 0.7182
- Roc Auc Score: 0.7253
----------------------------------
Model performance for Test set
- Accuracy: 0.7258
- F1 score: 0.7258
- Precision: 0.7229
- Recall: 0.7220
- Roc Auc Score: 0.7258


Xgboost
Model performance for Training set
- Accuracy: 0.8273
- F1 score: 0.8261
- Precision: 0.7819
- Recall: 0.9092
- Roc Auc Score: 0.8270
----------------------------------
Model performance for Test set
- Accuracy: 0.8074
- F1 score: 0.8061
- Precision: 0.7597
- Recall: 0.8927
- Roc Auc Score: 0.8084




### Chosen Model: RANDOM FOREST

Model performance for Training set
- Accuracy: 0.9956
- F1 score: 0.9956
- Precision: 0.9937
- Recall: 0.9976
- Roc Auc Score: 0.9956
----------------------------------
Model performance for Test set
- Accuracy: 0.8022
- F1 score: 0.8020
- Precision: 0.7790
- Recall: 0.8374
- Roc Auc Score: 0.8026

The Random Forest model shows a solid balance between accuracy, precision, recall, F1 score, and ROC AUC score on both the training and test sets. 
Its performance on the test set (accuracy: 0.8022, F1 score: 0.8020, ROC AUC: 0.8026) making it a reliable choice for the task.

### Predicting on Xlsx

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Load the saved Random Forest model
model = joblib.load('random_forest_model.pkl')

# Load the test data from the Excel file
test_data = pd.read_excel('test_data.xlsx')

test_data['debt_to_income_ratio'] = test_data['installment'] / (test_data['annual_inc'] + 1e-6)

# Drop the same columns in test data that were dropped in train data
test_features = test_data.drop(['customer_id', 'transaction_date', 'term', 'sub_grade', 'home_ownership', 
                                'total_no_of_acc', 'purpose', 'application_type', 'verification_status', 
                                'installment', 'account_bal', 'emp_length', 'loan_status'], axis=1)

test_features


Unnamed: 0,cibil_score,annual_inc,int_rate,loan_amnt,debt_to_income_ratio
0,690,120000.0,13.67,20000,0.005670
1,700,61277.0,19.99,9450,0.005731
2,660,92000.0,19.99,12700,0.003657
3,670,150000.0,14.46,25000,0.005734
4,660,68000.0,18.99,16000,0.006102
...,...,...,...,...,...
8450,690,65000.0,13.67,11000,0.003909
8451,660,43000.0,25.65,14975,0.010355
8452,705,52000.0,16.99,18000,0.008601
8453,740,72000.0,19.99,30000,0.011037


In [23]:
# Assuming the last column is the target variable, so we separate it:
X_test_new = test_features
y_test_new = test_data['loan_status']

# Predict using the loaded model
y_test_pred_new = model.predict(X_test_new)

# Calculate the accuracy on the new test data
accuracy = accuracy_score(y_test_new, y_test_pred_new)

print(f"Accuracy on the new test data: {accuracy:.4f}")

Accuracy on the new test data: 0.6387
