In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns # Seaborn visualization library
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split, RandomizedSearchCV, GridSearchCV
sns.set(style="darkgrid")

import os

%matplotlib inline

import gc

In [2]:
# Read the labelled training table and the unlabelled test table from the local Data/ folder
train, test = (pd.read_csv(csv_path) for csv_path in ('Data/train.csv', 'Data/test.csv'))

In [3]:
# Imports for Modeling

#from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, auc, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

In [4]:
# Labels to predict: the 'target' column of the training table
Target = train.loc[:, 'target']

In [5]:
# Model inputs: remove the label and the row identifier from train,
# and the row identifier from test, leaving only predictor columns
train_inp = train.drop(['target', 'ID_code'], axis=1)
test_inp = test.drop('ID_code', axis=1)

In [6]:
# Predictor column names, in the order they appear in train_inp
features = train_inp.columns.tolist()

In [7]:
# Hold out 30% of the labelled rows as a validation set and fit on the remaining 70%.
# random_state is fixed so the split is reproducible across runs.
# Note: X_test / Y_test are the validation part of train.csv, not the `test` file loaded earlier.

X_train, X_test, Y_train, Y_test = train_test_split(
    train_inp, Target, test_size=0.3, random_state=2020
)

## Model 1

In [None]:
# Model 1: baseline logistic regression with scikit-learn's default hyper-parameters
logreg = LogisticRegression()

# Fit on the training split
logreg.fit(X_train, Y_train)

# Hard 0/1 class labels for the training split (to compare against validation)
# and for the validation split
pred_train_y = logreg.predict(X_train)
y_pred = logreg.predict(X_test)

# Predicted probability of class 1 on the validation split.
# ROC AUC ranks observations by score, so it needs continuous scores rather than 0/1 labels.
y_score = logreg.predict_proba(X_test)[:, 1]

target_names = ['class 0', 'class 1']

# Per-class precision / recall / F1 on the training and validation splits
print(classification_report(Y_train, pred_train_y, target_names=target_names))
print(classification_report(Y_test, y_pred, target_names=target_names))

# Rows are the true classes, columns are the predicted classes
print(pd.DataFrame(confusion_matrix(Y_test, y_pred),
                 columns=['pred_class_0', 'pred_class_1'], index=['class_0', 'class_1']))
print(roc_auc_score(Y_test, y_score))




## Model 2

In [None]:
# Model 2: logistic regression with balanced class weights and C=0.001
# (in scikit-learn a smaller C means stronger regularisation)
logreg2 = LogisticRegression(class_weight='balanced', C=0.001)

# Train on the training split; the fitted estimator is echoed as the cell output
logreg2.fit(X_train, Y_train)

In [None]:
# Predicted probability of class 1 on the training and validation splits.
# The split frames are X_train / X_test (capital X), as returned by train_test_split.
# Note: these names previously held hard class labels from Model 1; from here on they hold probabilities.
pred_train_y = logreg2.predict_proba(X_train)[:, 1]
y_pred = logreg2.predict_proba(X_test)[:, 1]

In [None]:
def performance(Y_test, y_pred, threshold=0.5):
    """Print a confusion matrix and the ROC AUC for predicted class-1 scores.

    Parameters
    ----------
    Y_test : array-like
        True binary labels; 1 is treated as the positive class.
    y_pred : array-like
        Predicted score (e.g. probability) of class 1, one per entry of ``Y_test``.
    threshold : float, default 0.5
        Scores strictly below this value are labelled 0, all others 1.
        Only the confusion matrix depends on it; the AUC uses the raw scores.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    # Vectorised form of "0 if score < threshold else 1" applied to every score
    predicted_labels = np.where(np.asarray(y_pred) < threshold, 0, 1)

    print('Confusion Matrix:')
    print(confusion_matrix(Y_test, predicted_labels))

    # Area under the ROC curve, computed from the raw scores
    fpr, tpr, _ = roc_curve(Y_test, y_pred, pos_label=1)
    print('AUC:')
    print(auc(fpr, tpr))

In [None]:
# Validation confusion matrix and AUC for the most recently computed y_pred scores
performance(Y_test=Y_test, y_pred=y_pred)

In [None]:
# Submission frame: one row per test ID with Model 2's predicted probability of class 1
logist_pred_test = logreg2.predict_proba(test_inp)[:, 1]

# .assign() returns a new DataFrame, so the prediction column is not written into a
# slice of `test` (column assignment on such a slice triggers SettingWithCopyWarning)
submit = test[['ID_code']].assign(target=logist_pred_test)

submit.head()

In [None]:
# Save the submission frame as CSV, leaving out the DataFrame index column
submit.to_csv(path_or_buf='log_reg_test.csv', index=False)

## Model 3

In [None]:
# Model 3: L2-penalised logistic regression with balanced class weights and C=1
logreg3 = LogisticRegression(penalty='l2', class_weight='balanced', C=1)

# Train on the training split; the fitted estimator is echoed as the cell output
logreg3.fit(X_train, Y_train)

In [None]:
# Predicted probability of class 1 from Model 3 on the training and validation splits.
# The split frames are X_train / X_test (capital X), as returned by train_test_split.
pred_train_y3 = logreg3.predict_proba(X_train)[:, 1]
y_pred3 = logreg3.predict_proba(X_test)[:, 1]

In [None]:
# Validation confusion matrix and AUC for Model 3's scores
performance(Y_test=Y_test, y_pred=y_pred3)

In [None]:
# Echo the Model 3 estimator so its configuration is shown as the cell output
logreg3