# SGDClassifier(LinearSVC) 
### Linear SVC implementation with SGDClassifier to speed up training
#### Import Requirements 

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
#from pandas.tools.plotting import table

#memory management
import gc

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

#SGD Classifier
from sklearn.linear_model import SGDClassifier

#to measure ROC AUC performance
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedKFold

### Set folder paths for getting input and saving outputs

In [2]:
# Input data folder: a 'CodeOutputs' directory reached through the parent of
# the current working directory. Abort early when it is missing, since every
# later cell reads its CSV files from here.
dataFolder = os.sep.join([os.getcwd(), os.pardir, 'CodeOutputs'])
if not os.path.exists(dataFolder):
    print("Input Data folder not found. Please specify data folder path as dataFolder variable to proceed")
    raise NotADirectoryError

# Output folder: create it if it does not exist.
# NOTE(review): this resolves to the same path as dataFolder, so after the
# check above the creation branch is normally not reached.
outputFolder = os.sep.join([os.getcwd(), os.pardir, 'CodeOutputs'])
if not os.path.exists(outputFolder):
    os.makedirs(outputFolder)
    print('Output Folder created')

### Read input

In [3]:
def setup_input(filename):
    """Load one data set from ``dataFolder`` and split off its target column.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``dataFolder``.

    Returns
    -------
    (DataFrame, Series)
        The feature frame, without the 'TARGET', 'SK_ID_CURR' and (when
        present) 'Unnamed: 0' columns, and the 'TARGET' column as labels.

    Raises
    ------
    KeyError
        If the file has no 'TARGET' or no 'SK_ID_CURR' column.
    """
    csv_path = f"{dataFolder}{os.sep}{filename}"
    frame = pd.read_csv(csv_path)

    # Remove the target from the features and keep it as the label vector.
    target = frame.pop('TARGET')

    # 'Unnamed: 0' only shows up when the CSV was saved together with its
    # index, so it is dropped conditionally; the applicant id is always dropped.
    unwanted = ['SK_ID_CURR']
    if 'Unnamed: 0' in frame.columns:
        unwanted.insert(0, 'Unnamed: 0')

    frame = frame.drop(columns=unwanted)
    return frame, target

In [4]:
def svclassifier(df, labels, n_splits=10, alpha=5.5, random_state=None):
    """Cross-validate a hinge-loss SGD classifier and return its ROC AUC.

    A fresh ``SGDClassifier`` is fitted on the training part of every
    stratified fold; its ``decision_function`` output for the held-out part
    is written to the matching positions of one score vector. The scores of
    all folds are then pooled into a single ROC AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    labels : pandas.Series
        Target values aligned row-by-row with ``df``.
    n_splits : int, default 10
        Number of stratified folds.
    alpha : float, default 5.5
        Regularisation strength passed to ``SGDClassifier``.
    random_state : int or None, default None
        Seed passed to ``SGDClassifier``. ``None`` keeps the previous
        behaviour (unseeded); pass an integer for repeatable scores.

    Returns
    -------
    float
        ROC AUC of ``labels`` against the pooled held-out decision scores.
    """
    cv = StratifiedKFold(n_splits=n_splits)
    # Held-out decision scores; these are raw decision_function values,
    # not probabilities.
    scores = np.zeros(df.shape[0])

    for train_idx, test_idx in cv.split(df, labels):
        clf = SGDClassifier(alpha=alpha, class_weight='balanced',
                            loss='hinge', max_iter=1000, n_jobs=-1,
                            random_state=random_state)
        clf.fit(df.iloc[train_idx], labels.iloc[train_idx])
        scores[test_idx] = clf.decision_function(df.iloc[test_idx])

    return roc_auc_score(labels, scores)

In [5]:
# Evaluate the classifier on data set versions 1-3 (variants a, b and c each).
file_names = ['DataSetVersion1_a.csv', 'DataSetVersion1_b.csv', 'DataSetVersion1_c.csv', 'DataSetVersion2_a.csv',
              'DataSetVersion2_b.csv', 'DataSetVersion2_c.csv', 'DataSetVersion3_a.csv', 'DataSetVersion3_b.csv',
              'DataSetVersion3_c.csv']

for filename in file_names:
    input_df, labels = setup_input(filename)
    try:
        auc = svclassifier(input_df, labels)
    except Exception as err:
        # Keep going with the next file, but report which file failed and
        # why. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit still stop the run.
        print('File: {},  Classifier could not run on the data set ({}: {}). '
              'Put X instead of score.'.format(filename, type(err).__name__, err))
    else:
        print('File: {},  AUC score : {:0.3f}'.format(filename, auc))
    finally:
        # Free the data set before the next one is loaded.
        del input_df, labels
        gc.collect()

Classifier could not run on the data set. Put X instead of score.
Classifier could not run on the data set. Put X instead of score.
Classifier could not run on the data set. Put X instead of score.
File: DataSetVersion2_a.csv,  AUC score : 0.536
File: DataSetVersion2_b.csv,  AUC score : 0.542
File: DataSetVersion2_c.csv,  AUC score : 0.528
File: DataSetVersion3_a.csv,  AUC score : 0.533
File: DataSetVersion3_b.csv,  AUC score : 0.529
File: DataSetVersion3_c.csv,  AUC score : 0.525


In [None]:
# Evaluate the classifier on data set version 4 (variants a, b and c).
file_names = ['DataSetVersion4_a.csv', 'DataSetVersion4_b.csv', 'DataSetVersion4_c.csv']

for filename in file_names:
    input_df, labels = setup_input(filename)
    try:
        auc = svclassifier(input_df, labels)
    except Exception as err:
        # Keep going with the next file, but report which file failed and
        # why. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit still stop the run.
        print('File: {},  Classifier could not run on the data set ({}: {}). '
              'Put X instead of score.'.format(filename, type(err).__name__, err))
    else:
        print('File: {},  AUC score : {:0.3f}'.format(filename, auc))
    finally:
        # Free the data set before the next one is loaded.
        del input_df, labels
        gc.collect()

File: DataSetVersion4_a.csv,  AUC score : 0.527
