In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay compact.
# NOTE(review): this is a blanket filter, so pandas and scikit-learn diagnostics
# are hidden as well — consider restricting it to specific warning categories.
warnings.filterwarnings("ignore")

In [38]:
# Read the training table and drop the Loan_ID column, which is not used as a feature.
# The drop is chained (no inplace=True) so `data` is bound exactly once per run.
data = pd.read_csv('train.csv').drop(columns='Loan_ID')

# Keep only rows without missing values. The explicit copy makes data_NA an
# independent frame, so column edits in later cells neither trigger
# SettingWithCopyWarning nor depend on whether pandas returned a view.
data_NA = data.dropna().copy()

In [39]:
# Convert the Dependents column to integers: values are first rendered as
# strings, the literal '3+' is rewritten to '3', and the result is cast to int64.
# `assign` returns a new frame instead of writing through attribute access
# (data_NA.Dependents = ...), and the vectorised `.str.replace` with
# regex=False treats '+' as a plain character rather than a regex quantifier.
data_NA = data_NA.assign(
    Dependents=(
        data_NA['Dependents']
        .astype('string')
        .str.replace('3+', '3', regex=False)
        .astype('int64')
    )
)

In [40]:
# Feature matrix: the six columns fed to every model in this notebook.
# `.copy()` detaches it from data_NA so the dtype changes applied to X_selected
# afterwards operate on an independent frame (no chained-assignment warning).
X_selected = data_NA[['Credit_History',  'ApplicantIncome', 'CoapplicantIncome', 'Dependents', 'LoanAmount','Loan_Amount_Term']].copy()

# Target vector: Loan_Status encoded as 0 for 'N' and 1 for 'Y'.
y = data_NA['Loan_Status'].map({'N':0, 'Y': 1})

In [41]:
# Cast the two integer-valued features to float64 in a single call.
# `astype` with a column->dtype mapping returns a new frame, which avoids
# item assignment on a frame that was derived from data_NA by column selection.
X_selected = X_selected.astype({'ApplicantIncome': 'float64', 'Dependents': 'float64'})

In [42]:
# Hold out 20% of the rows for testing; the split is stratified on the target
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [43]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()

# Rebuild DataFrames around the scaled arrays, keeping both the column names and
# the original row index so the scaled frames stay aligned with y_train / y_test
# (without `index=` they would get a fresh 0..n-1 index).
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

**Here we build simple baseline models and then tune them with GridSearchCV. We use Logistic Regression, Random Forest, and SVM.**

In [45]:
## Random Forest

In [44]:
# Baseline random forest with default hyperparameters (seeded for reproducibility).
# `fit` returns the estimator itself, so construction and training are chained.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Evaluate the baseline on the held-out split.
pred_rf = rf_clf.predict(X_test_scaled)
print(classification_report(y_test, pred_rf, digits=5))

              precision    recall  f1-score   support

           0    0.73333   0.73333   0.73333        30
           1    0.87879   0.87879   0.87879        66

    accuracy                        0.83333        96
   macro avg    0.80606   0.80606   0.80606        96
weighted avg    0.83333   0.83333   0.83333        96



In [12]:
# Hyperparameter search space for the random forest (a single grid).
param_rf = [
    dict(
        max_depth=[5, 9, 14, 21],
        min_samples_split=range(2, 30, 5),  # 2, 7, 12, 17, 22, 27
        min_samples_leaf=range(2, 30, 5),   # 2, 7, 12, 17, 22, 27
        bootstrap=[True, False],
        # Candidate class weightings, from balanced to favouring class 0.
        class_weight=[
            {0: 0.5, 1: 0.5},
            {0: 0.6, 1: 0.4},
            {0: 0.7, 1: 0.3},
            {0: 0.8, 1: 0.2},
        ],
    )
]

In [13]:
# Exhaustive 5-fold grid search over param_rf, scored by F1 and run on all cores.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_rf,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid_rf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 12.8min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'bootstrap': [True, False],
                          'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4},
                                           {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}],
                          'max_depth': [5, 9, 14, 21],
                          'min_samples_leaf': range(2, 30, 5),
                          'min_samples_split': range(2, 30, 5)}],
             scoring='f1', verbose=True)

In [14]:
# Show the best hyperparameter combination found by the random-forest search
# together with its cross-validated score (the search was run with scoring='f1').
grid_rf.best_params_, grid_rf.best_score_

({'bootstrap': True,
  'class_weight': {0: 0.5, 1: 0.5},
  'max_depth': 5,
  'min_samples_leaf': 7,
  'min_samples_split': 22},
 0.8735682180658257)

In [15]:
# Refit the tuned random forest with a larger ensemble (500 trees).
# The tuned values are taken directly from grid_rf.best_params_ rather than being
# retyped by hand: hand-copied numbers go stale whenever the search is re-run,
# and the manual copy also left out the selected class_weight.
best_rf = RandomForestClassifier(n_estimators=500, random_state=42)
best_rf.set_params(**grid_rf.best_params_)
best_rf.fit(X_train_scaled, y_train)

# Evaluate the tuned forest on the held-out split.
pred_best_rf = best_rf.predict(X_test_scaled)
print(classification_report(y_test, pred_best_rf, digits=4))

              precision    recall  f1-score   support

           0     0.8889    0.5333    0.6667        30
           1     0.8205    0.9697    0.8889        66

    accuracy                         0.8333        96
   macro avg     0.8547    0.7515    0.7778        96
weighted avg     0.8419    0.8333    0.8194        96



Accuracy remained the same (0.8333), but after tuning the F1 score of the positive class increased slightly (0.8788 → 0.8889), while the F1 score of the negative class dropped (0.7333 → 0.6667).

In [46]:
## Logistic Regression

In [16]:
# Hyperparameter search space for logistic regression (a single grid).
param_log = [
    dict(
        C=[0.01, 0.1, 1, 10, 100, 1000],
        solver=['liblinear', 'newton-cg', 'sag', 'saga', 'lbfgs'],
        # Candidate class weightings, from balanced to favouring class 0.
        class_weight=[
            {0: 0.5, 1: 0.5},
            {0: 0.6, 1: 0.4},
            {0: 0.7, 1: 0.3},
            {0: 0.8, 1: 0.2},
        ],
    )
]

In [17]:
# Exhaustive 5-fold grid search over param_log, scored by F1 and run on all cores.
# max_iter is raised to 10000 on the base estimator.
grid_log = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=10000),
    param_log,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid_log.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.6s finished


GridSearchCV(cv=5,
             estimator=LogisticRegression(max_iter=10000, random_state=42),
             n_jobs=-1,
             param_grid=[{'C': [0.01, 0.1, 1, 10, 100, 1000],
                          'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4},
                                           {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}],
                          'solver': ['liblinear', 'newton-cg', 'sag', 'saga',
                                     'lbfgs']}],
             scoring='f1', verbose=True)

In [18]:
# Show the best hyperparameter combination found by the logistic-regression search
# together with its cross-validated score (the search was run with scoring='f1').
grid_log.best_params_, grid_log.best_score_

({'C': 0.01, 'class_weight': {0: 0.6, 1: 0.4}, 'solver': 'newton-cg'},
 0.8735682180658257)

In [19]:
# Take the tuned logistic regression straight from the search.
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refitted on the whole training set — fitting it again is redundant.
best_log = grid_log.best_estimator_

# Evaluate the tuned model on the held-out split.
best_log_pred = best_log.predict(X_test_scaled)
print(classification_report(y_test, best_log_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8889    0.5333    0.6667        30
           1     0.8205    0.9697    0.8889        66

    accuracy                         0.8333        96
   macro avg     0.8547    0.7515    0.7778        96
weighted avg     0.8419    0.8333    0.8194        96



In [20]:
# Baseline logistic regression: default hyperparameters apart from the seed
# and a raised iteration cap. `fit` returns the estimator, so the calls are chained.
log_clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Evaluate the baseline on the held-out split.
pred_log = log_clf.predict(X_test_scaled)
print(classification_report(y_test, pred_log, digits=5))

              precision    recall  f1-score   support

           0    0.89474   0.56667   0.69388        30
           1    0.83117   0.96970   0.89510        66

    accuracy                        0.84375        96
   macro avg    0.86295   0.76818   0.79449        96
weighted avg    0.85103   0.84375   0.83222        96



The simple (untuned) logistic regression worked better on the test set: accuracy 0.8438 versus 0.8333 for the tuned model.

In [48]:
# Baseline support-vector classifier with default kernel settings;
# probability estimates are enabled and the seed is fixed.
svc_clf = SVC(probability=True, random_state=42).fit(X_train_scaled, y_train)

# Evaluate the baseline on the held-out split.
svc_pred = svc_clf.predict(X_test_scaled)
print(classification_report(y_test, svc_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8182    0.6000    0.6923        30
           1     0.8378    0.9394    0.8857        66

    accuracy                         0.8333        96
   macro avg     0.8280    0.7697    0.7890        96
weighted avg     0.8317    0.8333    0.8253        96



In [21]:
# Hyperparameter search space for the SVM: two separate grids, one per kernel.
param_svc = [
    # RBF kernel: regularisation strength and the gamma heuristic.
    dict(
        kernel=['rbf'],
        C=[0.01, 0.1, 1, 10, 100],
        gamma=['scale', 'auto'],
    ),
    # Polynomial kernel: degree, regularisation strength and class weighting.
    dict(
        kernel=['poly'],
        degree=range(2, 15, 3),  # 2, 5, 8, 11, 14
        C=[0.01, 0.1, 1, 10, 100],
        class_weight=[
            {0: 0.5, 1: 0.5},
            {0: 0.6, 1: 0.4},
            {0: 0.7, 1: 0.3},
            {0: 0.8, 1: 0.2},
        ],
    ),
]

In [22]:
# Exhaustive 5-fold grid search over param_svc, scored by F1 and run on all cores.
# (The variable name is kept as-is because later cells refer to it.)
greed_svc = GridSearchCV(
    SVC(random_state=42, probability=True),
    param_svc,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
greed_svc.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 110 candidates, totalling 550 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 534 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 550 out of 550 | elapsed:   19.5s finished


GridSearchCV(cv=5, estimator=SVC(probability=True, random_state=42), n_jobs=-1,
             param_grid=[{'C': [0.01, 0.1, 1, 10, 100],
                          'gamma': ['scale', 'auto'], 'kernel': ['rbf']},
                         {'C': [0.01, 0.1, 1, 10, 100],
                          'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4},
                                           {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}],
                          'degree': range(2, 15, 3), 'kernel': ['poly']}],
             scoring='f1', verbose=2)

In [23]:
# Show the best hyperparameter combination found by the SVM search
# together with its cross-validated score (the search was run with scoring='f1').
greed_svc.best_params_, greed_svc.best_score_

({'C': 0.1, 'class_weight': {0: 0.6, 1: 0.4}, 'degree': 5, 'kernel': 'poly'},
 0.872579745228648)

In [24]:
# Take the tuned SVM straight from the search.
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refitted on the whole training set — fitting it again is redundant.
best_svc = greed_svc.best_estimator_

# Evaluate the tuned model on the held-out split.
best_svc_pred = best_svc.predict(X_test_scaled)
print(classification_report(y_test, best_svc_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8095    0.5667    0.6667        30
           1     0.8267    0.9394    0.8794        66

    accuracy                         0.8229        96
   macro avg     0.8181    0.7530    0.7730        96
weighted avg     0.8213    0.8229    0.8129        96



**We can conclude that the simple Logistic Regression won. We will take it as the final model.**

In [49]:
# Make pipeline and save the model

In [50]:
from sklearn.pipeline  import Pipeline
from sklearn.compose import ColumnTransformer

In [51]:
# Bundle scaling and the chosen classifier into one estimator so that unscaled
# feature values can be passed directly to fit/predict.
#
# The classifier step is a fresh, unfitted LogisticRegression built from
# log_clf's hyperparameters. Passing log_clf itself would make Pipeline.fit
# refit that same object in place, overwriting the baseline evaluated earlier.
#
# NOTE(review): the step key "rf" is a leftover name — the step holds the
# logistic regression. It is kept unchanged because the key is part of the
# saved pipeline's interface (e.g. named_steps["rf"]).
full_pipeline_with_predictor = Pipeline([
        ("preparation", StandardScaler()),
        ("rf", LogisticRegression(**log_clf.get_params()))
    ])

# Fit on the unscaled training split and predict the unscaled test split.
full_pipeline_with_predictor.fit(X_train, y_train)
pipe_pred = full_pipeline_with_predictor.predict(X_test)

In [52]:
# Report the pipeline's test-set metrics from its predictions (pipe_pred).
print(classification_report(y_test, pipe_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8947    0.5667    0.6939        30
           1     0.8312    0.9697    0.8951        66

    accuracy                         0.8438        96
   macro avg     0.8630    0.7682    0.7945        96
weighted avg     0.8510    0.8438    0.8322        96



In [53]:
import dill as pickle
filename_norm = 'model_loan.pk' # file name used to save the model

In [54]:
# Serialise the fitted pipeline to disk. `pickle` here is the alias under which
# dill was imported in the preceding cell.
with open(filename_norm, mode='wb') as model_file:
    pickle.dump(full_pipeline_with_predictor, model_file)