# Models

# Table of contents:
* 1 [Preparation](#intro-bullet)
* 2 [Logistic Regression](#first-bullet)
* 3 [Random Forest](#second-bullet)
* 3 [Balanced Random Forest](#third-bullet)
* 4 [Naive Bayes](#fourth-bullet)
* 5 [KNN](#fifth-bullet)
* 6 [Decision Tree](#sixth-bullet)
* 7 [Stacking](#seventh-bullet)
* 8 [Boosting](#eigth-bullet)
    * 8.1 [LigthGBM](#nineth-bullet)
    * 8.2 [Gradient Boosting](#tenth-bullet)
    * 8.3 [Histogram Gradient Boosting 1](#11-bullet)
* 9 [Bagging](#12-bullet)
* 10 [Plotting](#13-bullet)

## 1. Preparation <a class="anchor" id="intro-bullet"></a>

In [None]:
import pandas as pd
import numpy as np
import numpy as mean
import numpy as std
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, roc_auc_score, confusion_matrix
%matplotlib inline

## REMARK
the clf/model/rf or other names assigned to names of classifiers are estimator instances. There are firsty fitted to the model; that is, it must learn from the model. This is done by passing our training set to the **fit** method. Then we use **predic** or **predict_proba** to predict values of TARGET column or probability of such value

In [None]:
# Feature set 1: basic feature engineering.
tmp_data1 = pd.read_csv("./featureData1.csv")

# Hold out 20% of the rows for testing and renumber both parts from 0.
# NOTE: no random_state is passed, so the split differs from run to run.
train_data1, test_data1 = train_test_split(tmp_data1, test_size=0.2)
test_data1 = test_data1.reset_index(drop=True)
train_data1 = train_data1.reset_index(drop=True)

# Feature set 2: more detailed feature engineering.
tmp_data2 = pd.read_csv("./featureData2.csv")

# Same 80/20 split, drawn independently of the split of feature set 1.
train_data2, test_data2 = train_test_split(tmp_data2, test_size=0.2)
test_data2 = test_data2.reset_index(drop=True)
train_data2 = train_data2.reset_index(drop=True)

In [None]:
# Display the training part of the second feature set as the cell output.
train_data2

In [None]:
# Remove the "Unnamed: 0" column from all four frames.
train_data1, test_data1, train_data2, test_data2 = (
    frame.drop(columns='Unnamed: 0')
    for frame in (train_data1, test_data1, train_data2, test_data2)
)

## 2. Logistic Regression <a class="anchor" id="first-bullet"></a>

In [None]:
from sklearn.linear_model import LogisticRegression
def LogRegModel(train, test):
    """Train a class-weighted logistic regression and evaluate it on ``test``.

    Both frames must contain the ``TARGET`` label column and the
    ``SK_ID_CURR`` identifier column; every remaining column is used as a
    feature.  A confusion-matrix figure is shown and the ROC AUC and F1
    score measured on ``test`` are printed.

    Args:
        train: DataFrame used for fitting.
        test: DataFrame used for evaluation.

    Returns:
        tuple: ``(roc_auc, log_reg_pred)`` where ``log_reg_pred`` is the
        second column of ``predict_proba`` for every row of ``test``.
    """
    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the target and the row identifier.  The identifier used to be
    # left in as a feature here, although the comment said it was removed
    # and the other model functions in this file drop it.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Hand-picked class weights: class 1 counts far more than class 0,
    # presumably to offset class imbalance -- confirm against the data.
    weights = {0: 0.0878, 1: 0.9122}
    log_reg = LogisticRegression(C=0.0001, class_weight=weights)
    log_reg.fit(train, labels)

    # Second probability column (class 1 when the labels are 0/1).
    log_reg_pred = log_reg.predict_proba(test)[:, 1]
    hard_pred = log_reg.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    # Computed once and reused for both the report and the return value.
    roc_auc = roc_auc_score(test_labels, log_reg_pred)
    print("ROC", roc_auc)
    print('F1 score: %f' % f1)
    return roc_auc, log_reg_pred

In [None]:
# Evaluate logistic regression on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
LRScore1 = LogRegModel(train_data1,test_data1)
LRScore2 = LogRegModel(train_data2,test_data2)

## 3. Random Forest <a class="anchor" id="second-bullet"></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

def RanForModel(train, test):
    """Train a random forest and evaluate it on ``test``.

    Both frames must contain the ``TARGET`` label column and the
    ``SK_ID_CURR`` identifier column; every remaining column is used as a
    feature.  A confusion-matrix figure is shown and the F1 score and ROC
    AUC measured on ``test`` are printed.

    Args:
        train: DataFrame used for fitting.
        test: DataFrame used for evaluation.

    Returns:
        tuple: ``(roc_auc, ran_for_pred)`` where ``ran_for_pred`` is the
        second column of ``predict_proba`` for every row of ``test``.
    """
    # 'sqrt' is what 'auto' meant for classifiers; newer scikit-learn
    # releases reject 'auto', while 'sqrt' is accepted by old and new ones.
    rf = RandomForestClassifier(n_estimators=100,
                                max_depth=10, min_samples_split=20,
                                min_samples_leaf=6,
                                max_features='sqrt')

    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the target and the row identifier.  The identifier used to be
    # left in as a feature here, although the comment said it was removed
    # and the other model functions in this file drop it.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    rf.fit(X=train, y=labels)
    # Second probability column (class 1 when the labels are 0/1).
    ran_for_pred = rf.predict_proba(test)[:, 1]

    hard_pred = rf.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('F1 score: %f' % f1)
    print('Train/Test split results:')
    # Computed once and reused for both the report and the return value.
    roc_auc = roc_auc_score(test_labels, ran_for_pred)
    print("ROC", roc_auc)
    return roc_auc, ran_for_pred
   


In [None]:
# Evaluate the random forest on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
RFScore1 = RanForModel(train_data1,test_data1)
RFScore2 = RanForModel(train_data2,test_data2)


## 4. Balanced Random Forest <a class="anchor" id="third-bullet"></a>

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

def imbalancedRanFor(train, test):
    """Train a balanced random forest and evaluate it on ``test``.

    Both frames must contain the ``TARGET`` label column and the
    ``SK_ID_CURR`` identifier column; every remaining column is used as a
    feature.  A confusion-matrix figure is shown and the F1 score and ROC
    AUC measured on ``test`` are printed.

    Args:
        train: DataFrame used for fitting.
        test: DataFrame used for evaluation.

    Returns:
        tuple: ``(roc_auc, ran_for_pred)`` where ``ran_for_pred`` is the
        second column of ``predict_proba`` for every row of ``test``.
    """
    rf = BalancedRandomForestClassifier(class_weight='balanced')

    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the target and the row identifier.  The identifier used to be
    # left in as a feature here, although the comment said it was removed
    # and the other model functions in this file drop it.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    rf.fit(X=train, y=labels)
    # Second probability column (class 1 when the labels are 0/1).
    ran_for_pred = rf.predict_proba(test)[:, 1]

    hard_pred = rf.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('F1 score: %f' % f1)
    print('Train/Test split results:')
    # Computed once and reused for both the report and the return value.
    roc_auc = roc_auc_score(test_labels, ran_for_pred)
    print("ROC", roc_auc)

    return roc_auc, ran_for_pred

In [None]:
# Evaluate the balanced random forest on both feature sets; each result is
# a (ROC AUC, predicted probabilities) tuple.
imbRFScore1 =imbalancedRanFor(train_data1,test_data1)
imbRFScore2 =imbalancedRanFor(train_data2,test_data2)


## 5. Naive Bayesian<a class="anchor" id="fourth-bullet"></a>

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score

def NaiveBayModel(train,test):
    """Fit a Bernoulli naive Bayes classifier and evaluate it on ``test``.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  Shows a confusion-matrix figure, prints the
    ROC AUC and F1 score on ``test`` and returns
    ``(roc_auc, second predict_proba column)``.
    """

    def _draw_confusion_matrix(matrix):
        # Render the 2x2 matrix and write each count inside its cell.
        figure, axes = plt.subplots(figsize=(8, 8))
        axes.imshow(matrix)
        axes.grid(False)
        axes.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
        axes.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
        axes.set_ylim(1.5, -0.5)
        for row in (0, 1):
            for col in (0, 1):
                axes.text(col, row, matrix[row, col],
                          ha='center', va='center', color='red')
        plt.show()

    # Labels first, then the feature matrices without id and target.
    y_train = train['TARGET']
    y_test = test['TARGET']
    non_features = ['TARGET', 'SK_ID_CURR']
    x_train = train.drop(columns=non_features)
    x_test = test.drop(columns=non_features)

    classifier = BernoulliNB()
    classifier.fit(X=x_train, y=y_train)

    class1_proba = classifier.predict_proba(x_test)[:, 1]
    predicted = classifier.predict(x_test)
    f1 = f1_score(y_test, predicted)

    _draw_confusion_matrix(confusion_matrix(y_test, predicted))

    print('Train/Test split results:')
    auc = roc_auc_score(y_test, class1_proba)
    print("ROC", auc)
    print('F1 score: %f' % f1)
    return auc, class1_proba
  


In [None]:
# Evaluate naive Bayes on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
NBScore1 = NaiveBayModel(train_data1,test_data1)
NBScore2 = NaiveBayModel(train_data2,test_data2)


## 6. K-Nearest Neighbors <a class="anchor" id="fifth-bullet"></a>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

def KNNModel(train,test):
    """Fit a 3-nearest-neighbours classifier and evaluate it on ``test``.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  Shows a confusion-matrix figure, prints the
    ROC AUC and F1 score on ``test`` and returns
    ``(roc_auc, second predict_proba column)``.
    """
    knn = KNeighborsClassifier(n_neighbors=3)

    # Split each frame into labels and features.
    train_y, test_y = train['TARGET'], test['TARGET']
    train_x = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test_x = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    knn.fit(X=train_x, y=train_y)
    proba = knn.predict_proba(test_x)[:, 1]
    hard = knn.predict(test_x)
    f1 = f1_score(test_y, hard)
    matrix = confusion_matrix(test_y, hard)

    # Confusion matrix with the count written in every cell.
    _, axis = plt.subplots(figsize=(8, 8))
    axis.imshow(matrix)
    axis.grid(False)
    axis.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    axis.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    axis.set_ylim(1.5, -0.5)
    for r, c in ((0, 0), (0, 1), (1, 0), (1, 1)):
        axis.text(c, r, matrix[r, c], ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    auc = roc_auc_score(test_y, proba)
    print("ROC", auc)
    print('F1 score: %f' % f1)
    return auc, proba

In [None]:
# Evaluate KNN on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
KNNScore1 = KNNModel(train_data1,test_data1)
KNNScore2 = KNNModel(train_data2,test_data2)



## 6. Decision Tree <a class="anchor" id="sixth-bullet"></a>

In [None]:
from sklearn.tree import DecisionTreeClassifier
def DTModel(train,test):
    """Fit a class-weighted decision tree and evaluate it on ``test``.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  Shows a confusion-matrix figure, prints the
    ROC AUC and F1 score on ``test`` and returns
    ``(roc_auc, second predict_proba column)``.
    """
    tree = DecisionTreeClassifier(class_weight={0:0.0878, 1:0.9122})

    # Labels, then features without the identifier and the target.
    y_fit = train['TARGET']
    y_eval = test['TARGET']
    dropped = ['TARGET', 'SK_ID_CURR']
    x_fit = train.drop(columns=dropped)
    x_eval = test.drop(columns=dropped)

    tree.fit(X=x_fit, y=y_fit)
    scores = tree.predict_proba(x_eval)[:, 1]
    decisions = tree.predict(x_eval)
    f1 = f1_score(y_eval, decisions)
    counts = confusion_matrix(y_eval, decisions)

    # Confusion matrix with the count written in every cell.
    figure, panel = plt.subplots(figsize=(8, 8))
    panel.imshow(counts)
    panel.grid(False)
    panel.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    panel.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    panel.set_ylim(1.5, -0.5)
    for actual in range(2):
        for predicted in range(2):
            panel.text(predicted, actual, counts[actual, predicted],
                       ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    auc = roc_auc_score(y_eval, scores)
    print("ROC", auc)
    print('F1 score: %f' % f1)

    return auc, scores

In [None]:
# Evaluate the decision tree on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
DTScore1 = DTModel(train_data1,test_data1)
DTScore2 = DTModel(train_data2,test_data2)


In [None]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
def extra(train, test):
    """Evaluate an under-sampling + decision-tree pipeline.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  The pipeline is first cross-validated on
    ``train`` (micro-averaged F1, 10 folds repeated 3 times), then fitted
    on all of ``train`` and evaluated on ``test``.  A confusion-matrix
    figure is shown and the scores are printed.

    Args:
        train: DataFrame used for cross-validation and fitting.
        test: DataFrame used for the final evaluation.

    Returns:
        tuple: ``(roc_auc, model_pred)`` where ``model_pred`` is the second
        column of ``predict_proba`` for every row of ``test``.
    """
    # Imported locally: `re` is not imported anywhere in the file before
    # this definition, so relying on the global name raised NameError.
    import re

    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the ids and target.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Strip special characters from the column names.  The same renaming is
    # applied to both frames so that the feature names seen at fit time and
    # at prediction time agree (previously only `train` was renamed).
    def _clean(name):
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train = train.rename(columns=_clean)
    test = test.rename(columns=_clean)

    # Random under-sampling followed by a decision tree.
    steps = [('under', RandomUnderSampler()), ('model', DecisionTreeClassifier())]
    pipeline = Pipeline(steps=steps)

    # Cross-validated estimate on the training data.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, train, labels, scoring='f1_micro', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('F1 Score: %.3f' % score)

    # Refit on the whole training set and score the held-out test set.
    pipeline.fit(train, labels)
    model_pred = pipeline.predict_proba(test)[:, 1]
    hard_pred = pipeline.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    roc_auc = roc_auc_score(test_labels, model_pred)
    print("ROC", roc_auc)
    print('F1 score: %f' % f1)

    return roc_auc, model_pred

In [None]:
# Evaluate the under-sampling pipeline on both feature sets; each result is
# a (ROC AUC, predicted probabilities) tuple.
extra1 = extra(train_data1,test_data1)
extra2 = extra(train_data2,test_data2)

## 7. Stacking <a class="anchor" id="seventh-bullet"></a>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from matplotlib import pyplot
import re
def fun2(train, test):
    """Train a stacking ensemble and evaluate it on ``test``.

    Base models: a decision tree, a class-weighted logistic regression and
    a balanced random forest.  The meta learner is another balanced random
    forest, trained with 2-fold internal cross-validation.  ``TARGET`` is
    the label column; ``TARGET`` and ``SK_ID_CURR`` are removed from the
    features.  A confusion-matrix figure is shown and the ROC AUC and F1
    score on ``test`` are printed.

    Args:
        train: DataFrame used for fitting.
        test: DataFrame used for evaluation.

    Returns:
        tuple: ``(roc_auc, model_pred)`` where ``model_pred`` is the second
        column of ``predict_proba`` for every row of ``test``.
    """
    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the ids and target.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Strip special characters from the column names.  The same renaming is
    # applied to both frames so that the feature names seen at fit time and
    # at prediction time agree (previously only `train` was renamed).
    def _clean(name):
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train = train.rename(columns=_clean)
    test = test.rename(columns=_clean)

    # Same class weights as LogRegModel: class 1 gets the large weight.
    # The mapping used here before was reversed (0 -> 0.9122, 1 -> 0.0878).
    weights = {0: 0.0878, 1: 0.9122}

    # Base (level-0) models.
    level0 = [
        ('dt', DecisionTreeClassifier()),
        ('lr', LogisticRegression(C=0.0001, class_weight=weights)),
        ('sth', BalancedRandomForestClassifier()),
    ]
    # Meta (level-1) learner.
    level1 = BalancedRandomForestClassifier()

    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=2)
    model.fit(train, labels)

    # Score every row of the test set.
    model_pred = model.predict_proba(test)[:, 1]
    hard_pred = model.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    roc_auc = roc_auc_score(test_labels, model_pred)
    print("ROC", roc_auc)
    print("F1 score ", f1)

    return roc_auc, model_pred

In [None]:
# Evaluate the stacking ensemble on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
StackingScore1 = fun2(train_data1,test_data1)
StackingScore2 = fun2(train_data2,test_data2)


## 8. Boosting <a class="anchor" id="eigth-bullet"></a>

## 8.1 LightGBM <a class="anchor" id="nineth-bullet"></a>

In [None]:
import lightgbm as lgb
import re
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

def PlainLightGBM(train, test):
    """Train a class-balanced LightGBM classifier and evaluate it on ``test``.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  The model is cross-validated on ``train``
    (ROC AUC, 5 folds repeated twice), then fitted on all of ``train`` and
    evaluated on ``test``.  The scores are printed and a confusion-matrix
    figure is shown.

    Args:
        train: DataFrame used for cross-validation and fitting.
        test: DataFrame used for the final evaluation.

    Returns:
        tuple: ``(roc_auc, model_pred)`` where ``model_pred`` is the second
        column of ``predict_proba`` for every row of ``test``.
    """
    model = lgb.LGBMClassifier(class_weight='balanced')

    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the ids and target.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Strip special characters from the column names of both frames.
    def _clean(name):
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train = train.rename(columns=_clean)
    test = test.rename(columns=_clean)

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
    n_scores = cross_val_score(model, train, labels, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')

    model.fit(train, labels)
    model_pred = model.predict_proba(test)[:, 1]

    hard_pred = model.predict(test)
    f1 = f1_score(test_labels, hard_pred)

    print('Train/Test split results:')
    roc_auc = roc_auc_score(test_labels, model_pred)
    print("ROC", roc_auc)
    print('F1 score: %f' % f1)
    # The cross-validation above scores with 'roc_auc', so the value is
    # labelled as such (it used to be reported as an F1 score).  Using the
    # array's own mean() avoids depending on the global name `mean`, which
    # the first import cell binds to the numpy module.
    print('Average CV ROC AUC: %f' % n_scores.mean())
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()
    return roc_auc, model_pred

    


In [None]:
# Evaluate LightGBM on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
LGBMScore1=PlainLightGBM(train_data1,test_data1)
LGBMScore2=PlainLightGBM(train_data2,test_data2)


## 8.2 Gradient Boosting  <a class="anchor" id="tenth-bullet"></a>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# make predictions using gradient boosting for classification
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import re

def myBooster(train, test):
    """Train a gradient boosting classifier and evaluate it on ``test``.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  The model is cross-validated on ``train``
    (ROC AUC, 2 shuffled stratified folds), then fitted on all of ``train``
    and evaluated on ``test``.  A confusion-matrix figure is shown and the
    scores are printed.

    Args:
        train: DataFrame used for cross-validation and fitting.
        test: DataFrame used for the final evaluation.

    Returns:
        tuple: ``(roc_auc, model_pred)`` where ``model_pred`` is the second
        column of ``predict_proba`` for every row of ``test``.
    """
    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the ids and target.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Strip special characters from the column names.  The same renaming is
    # applied to both frames so that the feature names seen at fit time and
    # at prediction time agree (previously only `train` was renamed).
    def _clean(name):
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train = train.rename(columns=_clean)
    test = test.rename(columns=_clean)

    model = GradientBoostingClassifier()
    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
    n_scores = cross_val_score(model, train, labels, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')

    # Refit on the whole training set and score every row of the test set.
    model.fit(train, labels)
    model_pred = model.predict_proba(test)[:, 1]
    hard_pred = model.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    roc_auc = roc_auc_score(test_labels, model_pred)
    print("ROC", roc_auc)
    print('F1 score: %f' % f1)
    # The cross-validation above scores with 'roc_auc', so the value is
    # labelled as such (it used to be reported as an F1 score).  Using the
    # array's own mean() avoids depending on the global name `mean`, which
    # the first import cell binds to the numpy module.
    print('Average CV ROC AUC: %f' % n_scores.mean())
    return roc_auc, model_pred

In [None]:
# Evaluate gradient boosting; the result is a (ROC AUC, predicted
# probabilities) tuple.  Only the first feature set is run; the call for
# the second one is commented out.
GBScore1=myBooster(train_data1,test_data1)
# GBScore2=myBooster(train_data2,test_data2)

## 8.3 Histogram Gradient Boosting  <a class="anchor" id="11-bullet"></a>

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

def myHistBooster(train, test):
    """Train a histogram gradient boosting classifier and evaluate it.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  The model is cross-validated on ``train``
    (ROC AUC, 5 folds repeated 3 times), then fitted on all of ``train``
    and evaluated on ``test``.  A confusion-matrix figure is shown and the
    scores are printed.

    Args:
        train: DataFrame used for cross-validation and fitting.
        test: DataFrame used for the final evaluation.

    Returns:
        tuple: ``(roc_auc, model_pred)`` where ``model_pred`` is the second
        column of ``predict_proba`` for every row of ``test``.
    """
    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the ids and target.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Strip special characters from the column names.  The same renaming is
    # applied to both frames so that the feature names seen at fit time and
    # at prediction time agree (previously only `train` was renamed).
    def _clean(name):
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train = train.rename(columns=_clean)
    test = test.rename(columns=_clean)

    model = HistGradientBoostingClassifier()
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, train, labels, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')

    # Refit on the whole training set and score every row of the test set.
    model.fit(train, labels)
    model_pred = model.predict_proba(test)[:, 1]
    hard_pred = model.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    roc_auc = roc_auc_score(test_labels, model_pred)
    print("ROC", roc_auc)
    print('F1 score: %f' % f1)
    # The cross-validation above scores with 'roc_auc', so the value is
    # labelled as such (it used to be reported as an F1 score).  Using the
    # array's own mean() avoids depending on the global name `mean`, which
    # the first import cell binds to the numpy module.
    print('Average CV ROC AUC: %f' % n_scores.mean())
    return roc_auc, model_pred
    

In [None]:
# Evaluate histogram gradient boosting on both feature sets; each result is
# a (ROC AUC, predicted probabilities) tuple.
HB1Score1 = myHistBooster(train_data1,test_data1)
HB1Score2 = myHistBooster(train_data2,test_data2)

## 9. Bagging <a class="anchor" id="12-bullet"></a>

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection 
from sklearn.ensemble import BaggingClassifier 
def bagging(train, test):
    """Train a bagging ensemble of logistic regressions and evaluate it.

    ``TARGET`` is the label column; ``TARGET`` and ``SK_ID_CURR`` are
    removed from the features.  A confusion-matrix figure is shown and the
    ROC AUC and F1 score on ``test`` are printed.

    Args:
        train: DataFrame used for fitting.
        test: DataFrame used for evaluation.

    Returns:
        tuple: ``(roc_auc, model_pred)`` where ``model_pred`` is the second
        column of ``predict_proba`` for every row of ``test``.
    """
    # Extract the labels.
    labels = train['TARGET']
    test_labels = test['TARGET']

    # Remove the ids and target.
    train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
    test = test.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Strip special characters from the column names.  The same renaming is
    # applied to both frames so that the feature names seen at fit time and
    # at prediction time agree (previously only `train` was renamed).
    def _clean(name):
        return re.sub('[^A-Za-z0-9_]+', '', name)

    train = train.rename(columns=_clean)
    test = test.rename(columns=_clean)

    seed = 1

    # Same class weights as LogRegModel: class 1 gets the large weight.
    # The mapping used here before was reversed (0 -> 0.9122, 1 -> 0.0878).
    weights = {0: 0.0878, 1: 0.9122}
    base_cls = LogisticRegression(C=0.0001, class_weight=weights)

    # The base estimator is passed positionally because its keyword was
    # renamed from `base_estimator` to `estimator` in newer scikit-learn
    # releases; the positional form works with both spellings.
    model = BaggingClassifier(base_cls,
                              random_state=seed,
                              n_jobs=-1)
    print("Fiting")
    model.fit(train, labels)

    # Score every row of the test set.
    model_pred = model.predict_proba(test)[:, 1]
    hard_pred = model.predict(test)
    f1 = f1_score(test_labels, hard_pred)
    cm = confusion_matrix(test_labels, hard_pred)

    # Draw the 2x2 confusion matrix with the counts written in each cell.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit limits keep both rows fully visible with row 0 at the top.
    ax.set_ylim(1.5, -0.5)
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    plt.show()

    print('Train/Test split results:')
    roc_auc = roc_auc_score(test_labels, model_pred)
    print("ROC", roc_auc)
    print('F1 score: %f' % f1)

    return roc_auc, model_pred

In [None]:
# Evaluate the bagging ensemble on both feature sets; each result is a
# (ROC AUC, predicted probabilities) tuple.
BagginScore1=bagging(train_data1,test_data1)
BagginScore2=bagging(train_data2,test_data2)

## 10. Plotting <a class="anchor" id="13-bullet"></a>

In [None]:
from sklearn.metrics import roc_curve
def plotingROC(test_labels,test_labels1):
    """Plot the ROC curves of the basic models on a single figure.

    Reads the module-level results ``DTScore2``, ``KNNScore2``,
    ``NBScore1`` and ``LRScore2`` (index 1 of each holds the predicted
    probabilities).  ``test_labels`` is paired with the ``*Score2``
    results and ``test_labels1`` with ``NBScore1``.
    """
    # (true labels, scores, line colour, legend entry)
    series = (
        (test_labels, DTScore2[1], 'orange', 'Decision Tree'),
        (test_labels, KNNScore2[1], 'red', 'KNN'),
        (test_labels1, NBScore1[1], 'green', 'Naive Bayes'),
        (test_labels, LRScore2[1], 'black', 'Logistic Regression'),
    )
    curves = [
        roc_curve(truth, scores, pos_label=1)[:2]
        for truth, scores, _, _ in series
    ]

    # Reference line: the same constant score for every sample.
    constant_scores = [0] * len(test_labels)
    base_fpr, base_tpr, _ = roc_curve(test_labels, constant_scores, pos_label=1)
    plt.plot(base_fpr, base_tpr, linestyle='--', color='blue')

    for (fpr, tpr), (_, _, colour, name) in zip(curves, series):
        plt.plot(fpr, tpr, linestyle='--', color=colour, label=name)

    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.show()
  

In [None]:
def plotingROC1(test_labels,test_labels1):
    """Plot the ROC curves of the ensemble models on a single figure.

    Reads the module-level results ``LGBMScore2``, ``HB1Score2`` and
    ``StackingScore2`` (index 1 of each holds the predicted
    probabilities); all of them are paired with ``test_labels``.
    ``test_labels1`` is accepted but not used by the active curves.
    """
    # (scores, line colour, legend entry).  The Gradient Boosting (red) and
    # Bagging (green) curves are currently disabled.
    series = (
        (LGBMScore2[1], 'orange', 'LightGBM'),
        (HB1Score2[1], 'black', 'Histogram Boosting'),
        (StackingScore2[1], 'pink', 'Stacking'),
    )
    curves = [
        roc_curve(test_labels, scores, pos_label=1)[:2]
        for scores, _, _ in series
    ]

    # Reference line: the same constant score for every sample.
    constant_scores = [0] * len(test_labels)
    base_fpr, base_tpr, _ = roc_curve(test_labels, constant_scores, pos_label=1)
    plt.plot(base_fpr, base_tpr, linestyle='--', color='blue')

    for (fpr, tpr), (_, colour, name) in zip(curves, series):
        plt.plot(fpr, tpr, linestyle='--', color=colour, label=name)

    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.show()
    

In [None]:
# ROC curves of the basic models; the labels of the second test set are
# passed first, those of the first test set second.
plotingROC(test_data2["TARGET"],test_data1["TARGET"])

In [None]:
# ROC curves of the ensemble models; the labels of the second test set are
# passed first, those of the first test set second.
plotingROC1(test_data2["TARGET"],test_data1["TARGET"])

In [None]:
# Print the test-set ROC AUC (index 0 of every result tuple) per model.
# Gradient Boosting and Naive Bayes report the first feature set; all the
# other models report the second one.
for template, result in (
    ("LGBM: %f", LGBMScore2),
    ("Gradient: %f", GBScore1),
    ("Bagging: %f", BagginScore2),
    ("Histogram: %f", HB1Score2),
    ("Stacking: %f", StackingScore2),
    ("Decision Trees: %f", DTScore2),
    ("KNN: %f", KNNScore2),
    ("Logisitc Regression: %f", LRScore2),
    ("NaiveBayes: %f", NBScore1),
):
    print(template % result[0])




