# Logistic Regression
### Logistic Regression implementation 
#### Import Requirements 

In [2]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

#memory management
import gc

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

#Logistic regression
from sklearn.linear_model import LogisticRegression as LR

#to measure ROC AUC performance
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold

### Set folder paths for getting input and saving outputs

In [3]:
# Input data sets are read from the 'CodeOutputs' folder next to the working directory
dataFolder = os.sep.join([os.getcwd(), os.pardir, 'CodeOutputs'])
if not os.path.exists(dataFolder):
    print("Input Data folder not found. Please specify data folder path as dataFolder variable to proceed")
    raise NotADirectoryError

# Results go to the same 'CodeOutputs' location; create it if it does not exist
outputFolder = os.sep.join([os.getcwd(), os.pardir, 'CodeOutputs'])

if not os.path.exists(outputFolder):
    os.makedirs(outputFolder)
    print('Output Folder created')

### Read input

In [4]:
def setup_input(filename, dataFolder):
    """Load one data set and separate the features from the target.

    Reads the CSV file ``filename`` found in ``dataFolder``, removes the
    'TARGET' column to serve as the labels, and discards the
    'Unnamed: 0' and 'SK_ID_CURR' columns whenever they are present.

    Returns:
        A ``(features, labels)`` tuple: the remaining DataFrame and the
        popped 'TARGET' column.

    Raises:
        KeyError: if the file has no 'TARGET' column.
    """
    frame = pd.read_csv(os.sep.join([dataFolder, filename]))
    target = frame.pop('TARGET')

    # Neither column is a feature, so drop whichever of them exists
    for unwanted in ('Unnamed: 0', 'SK_ID_CURR'):
        if unwanted in frame:
            frame = frame.drop(columns=unwanted)

    return frame, target

In [5]:
def logistic_regression(df, labels):
    """Return the mean ROC AUC of a logistic regression over 10 stratified folds.

    For every fold a fresh ``LR(C=0.001)`` classifier is fitted on the
    training rows and scored on the held-out rows, using the second
    column of ``predict_proba`` as the score.

    Args:
        df: feature table; rows are selected by position with ``.iloc``.
        labels: target values aligned row-by-row with ``df``; must
            support ``.iloc`` (e.g. a pandas Series).

    Returns:
        The mean of the per-fold ROC AUC scores.
    """
    # Run classifier with cross-validation
    cv = StratifiedKFold(n_splits=10)
    foldScores = []

    for trainSet, testSet in cv.split(df, labels):
        clf = LR(C=0.001)
        clf.fit(df.iloc[trainSet], labels.iloc[trainSet])
        probabilities = clf.predict_proba(df.iloc[testSet])[:, 1]
        # split() yields row positions, so the held-out labels must be taken
        # by position (.iloc); plain labels[...] would look them up by index
        # label and only agrees when the index is the default 0..n-1 range.
        foldScores.append(roc_auc_score(labels.iloc[testSet], probabilities))

    return np.mean(foldScores)

In [7]:
# Data sets to cross-validate; further 'DataSetVersion<N>_<a|b|c>.csv' files
# can be appended to this list.
file_names = ['DataSetVersion6_a.csv', 'DataSetVersion6_b.csv']

for filename in file_names:
    # setup_input requires the input folder as its second argument
    input_df, labels = setup_input(filename, dataFolder)
    try:
        print('File: {},  AUC score : {:0.3f}'.format(filename, logistic_regression(input_df, labels)))
    except Exception as err:
        # Best effort: report the failure and move on to the next file.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print('Classifier could not run on the data set. Put X instead of score.')
        print('Reason: {!r}'.format(err))
    finally:
        # Release the data set before the next one is loaded
        del input_df, labels
        gc.collect()

File: DataSetVersion6_a.csv,  AUC score : 0.724
File: DataSetVersion6_b.csv,  AUC score : 0.678


In [12]:
# Training data sets; further 'DataSetVersion<N>_<a|b|c>.csv' files can be
# appended to this list.
file_names = ['DataSetVersion6_a.csv', 'DataSetVersion6_b.csv']

# The external evaluation set is the same for every training file, so it is
# read once instead of once per loop iteration.
# NOTE(review): assumes it has the same feature columns as each training
# file - confirm.
test_df, test_labels = setup_input('DataSetVersion4_b.csv', dataFolder)

for filename in file_names:
    # setup_input requires the input folder as its second argument
    input_df, labels = setup_input(filename, dataFolder)

    cv = StratifiedKFold(n_splits=10)
    foldScores = []
    clf = LR(C=0.001)
    # Only the training part of each split is used; scoring is done on the
    # external evaluation set, so testSet is intentionally unused.
    for trainSet, testSet in cv.split(input_df, labels):
        model = clf.fit(input_df.iloc[trainSet], labels.iloc[trainSet])
        probabilities = model.predict_proba(test_df)[:, 1]
        foldScores.append(roc_auc_score(test_labels, probabilities))

    print('Test Score : ', str(np.mean(foldScores)))
    # Release the training data before the next file is loaded
    del input_df, labels
    gc.collect()

Test Score :  0.7238668086700223
Test Score :  0.6848398763895931
