In [5]:
import pandas as pd
import os
import numpy as np

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

In [27]:
def _dropOutOfRangeRows(df):
    """Return `df` without the rows whose categorical codes are out of range.

    A row is discarded when SEX is outside 1..2, EDUCATION is outside 1..4
    or MARRIAGE is outside 1..3 (bounds inclusive).
    """
    outOfRange = (
        (df.SEX < 1) | (df.SEX > 2)
        | (df.EDUCATION < 1) | (df.EDUCATION > 4)
        | (df.MARRIAGE < 1) | (df.MARRIAGE > 3)
    )
    return df[~outOfRange]


def getData(filename='default of credit card clients.xls'):
    """Load the credit-card default spreadsheet and build the design matrix.

    Parameters
    ----------
    filename : str, optional
        Path of the Excel file to read. The default is the file name the
        notebook has always used, resolved against the working directory.

    Returns
    -------
    X : array-like, 2-D
        Feature matrix in which columns 1, 2 and 3 (sex, education, marriage)
        have been one-hot encoded and every other column is passed through.
        This is whatever ``ColumnTransformer.fit_transform`` returns.
    y : numpy.ndarray, 1-D
        Flattened values of the ``defaultPaymentNextMonth`` column.

    Notes
    -----
    Seeds numpy's global random generator with 0 as a side effect.
    """
    np.random.seed(0)

    # header=1: column names come from the second row of the sheet;
    # index_col=0: the first column becomes the index and is not a feature.
    # na_values={} adds no extra NA markers.
    df = pd.read_excel(filename, header=1, skiprows=0, index_col=0, na_values={})
    df = df.rename(index=str, columns={'default payment next month': 'defaultPaymentNextMonth'})

    # Drop the rows including data where parameters are out of range.
    # (The upper SEX bound used to be written as `SEX < 2`, which discarded
    # every row with SEX == 1.)
    df = _dropOutOfRangeRows(df)

    # Features and targets; .values returns a numpy representation
    isTarget = df.columns == 'defaultPaymentNextMonth'
    X = df.loc[:, ~isTarget].values
    y = df.loc[:, isTarget].values

    # Categorical variables to one-hot's: columns 1,2,3 [sex,education,marriage]
    onehotencoder = OneHotEncoder(categories="auto")
    X = ColumnTransformer(
        [('onehotencoder', onehotencoder, [1, 2, 3])],
        remainder="passthrough",
    ).fit_transform(X)

    return X, np.ravel(y)

In [22]:
# Split the frame into feature and target arrays, then strip the rows whose
# categorical codes are out of range.
# NOTE(review): `df` is not created in this cell; it has to exist in the
# kernel already.
isTarget = df.columns == 'defaultPaymentNextMonth'
X = df.loc[:, ~isTarget].values
y = df.loc[:, isTarget].values

# Positions of offending rows, per feature column and per bound:
# column 1 must lie in 1..2, column 2 in 1..4, column 3 in 1..3.
outlier_gender1 = np.flatnonzero(X[:, 1] < 1)
outlier_gender2 = np.flatnonzero(X[:, 1] > 2)
outlier_education1 = np.flatnonzero(X[:, 2] < 1)
outlier_education2 = np.flatnonzero(X[:, 2] > 4)
outlier_marital1 = np.flatnonzero(X[:, 3] < 1)
outlier_marital2 = np.flatnonzero(X[:, 3] > 3)

inds = np.concatenate([
    outlier_gender1, outlier_gender2,
    outlier_education1, outlier_education2,
    outlier_marital1, outlier_marital2,
])

# A row can violate several bounds; keep each position once.
outlier_rows = np.unique(inds)
X = np.delete(X, outlier_rows, axis=0)
y = np.delete(y, outlier_rows, axis=0)

y

array([[1],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [10]:
# Train/test split: half of the samples go to each side.
trainingShare = 0.5
seed = 1
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    y,
    train_size=trainingShare,
    test_size=1 - trainingShare,
    random_state=seed,
)

# Input scaling: the scaler is fitted on the training half only and the same
# fitted scaler is then applied to both halves.
sc = StandardScaler().fit(XTrain)
XTrain = sc.transform(XTrain)
XTest = sc.transform(XTest)




In [11]:
# One-hot's of the target vector.
# The encoder is created in this cell: the only other `onehotencoder` in the
# notebook is a local variable inside getData() and is not visible here.
# It is fitted on the training labels only and then reused for the test
# labels, so both matrices share the same category-to-column mapping.
onehotencoder = OneHotEncoder(categories="auto")
Y_train_onehot = onehotencoder.fit_transform(yTrain)
Y_test_onehot = onehotencoder.transform(yTest)

# Disabled sketch of a grid search over the regularisation strength, kept as
# a plain string so it is not executed.
"""from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

lambdas=np.logspace(-5,7,13)
parameters = [{'C': 1./lambdas, "solver":["lbfgs"]}]#*len(parameters)}]
scoring = ['accuracy', 'roc_auc']
logReg = LogisticRegression()
gridSearch = GridSearchCV(logReg, parameters, cv=5, scoring=scoring, refit='roc_auc')"""

## Logistic Regression Model Fitting

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


# 70/30 split with a fixed random_state so the split is repeatable.
# NOTE(review): this uses the raw X, not the standardised XTrain/XTest built
# in the scaling cell — confirm that fitting on unscaled features is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

logreg = LogisticRegression()
# y can arrive as an (n, 1) column vector; flatten it for fit() so that
# scikit-learn does not emit a DataConversionWarning. y_train itself is left
# untouched for any later cell that expects the original shape.
logreg.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# Class predictions for the held-out rows (reused by the report cell below).
y_pred = logreg.predict(X_test)

# Mean accuracy on the held-out rows, as reported by the estimator itself.
testAccuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(testAccuracy))

Accuracy of logistic regression classifier on test set: 0.80


In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the held-out labels.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.80      1.00      0.89      4284
           1       0.00      0.00      0.00      1073

    accuracy                           0.80      5357
   macro avg       0.40      0.50      0.44      5357
weighted avg       0.64      0.80      0.71      5357

