In [1]:
# Imports: data handling, plotting and the scikit-learn pieces used below
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Plot configuration
plt.rc("font", size=14)
# A single theme call is enough: the earlier style="white" call was immediately
# replaced by this one.
# NOTE(review): seaborn's theme call also sets font-related rcParams, so the font
# size chosen above may be overridden here - confirm which size is intended.
sns.set(style="whitegrid", color_codes=True)

In [2]:
# Read the training and testing CSV files from the working directory
# and report the (rows, columns) of each.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
print(train.shape, test.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Read the training CSV again (first row is the header) and keep only
# the rows that have no missing values.
data = pd.read_csv('train.csv', header=0).dropna()
print(data.shape)
print(data.columns.tolist())

In [None]:
# Preview the first five rows of the cleaned training data
data.head(5)

In [None]:
# Distinct values present in the EducationLevel field
data.loc[:, 'EducationLevel'].unique()

In [None]:
# Share of each value of the Approved field (fractions summing to 1)
data.Approved.value_counts(normalize=True)

In [None]:
# Bar chart of how many applications fall into each Approved value
sns.countplot(data=data, x='Approved', palette='hls')
plt.show()

In [None]:
# Compare the size of the two classes ('no' vs 'yes') as percentages
approved_labels = data['Approved']
count_no_sub = int((approved_labels == 'no').sum())
count_sub = int((approved_labels == 'yes').sum())
labelled_total = count_no_sub + count_sub

pct_of_no_sub = count_no_sub / labelled_total
print("percentage of rejected is", pct_of_no_sub*100)
pct_of_sub = count_sub / labelled_total
print("percentage of approved", pct_of_sub*100)

In [None]:
# Mean of every numeric column, split by the Approved value.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging the
# text columns raises a TypeError instead of silently skipping them.
data.groupby('Approved').mean(numeric_only=True)

In [None]:
# Check whether gender has any relationship with the continuous fields.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging the
# text columns raises a TypeError instead of silently skipping them.
data.groupby('Male').mean(numeric_only=True)

In [None]:
#Creating dummy variables for categorical field for training data
cat_vars = ['Male', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity',
            'PriorDefault', 'Employed', 'DriversLicense', 'Citizen']

# One-hot encode all categorical fields in a single call. `columns=` forces each
# listed field to be encoded whatever its dtype, and the default prefix is the
# field name, so the generated columns are named "<field>_<value>".
dummies = pd.get_dummies(data[cat_vars], columns=cat_vars)

# Remove dummy columns left by a previous execution before joining, so that
# re-running this cell does not fail on overlapping column names.
data = data.drop(columns=dummies.columns, errors='ignore').join(dummies)

# Modelling frame: every column except the raw categorical fields
data_final = data.drop(columns=cat_vars)
data_final.columns.values

In [None]:
#Creating dummy variables for categorical field for testing data
cat_vars = ['Male', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity',
            'PriorDefault', 'Employed', 'DriversLicense', 'Citizen']

# One-hot encode all categorical fields in a single call. `columns=` forces each
# listed field to be encoded whatever its dtype, and the default prefix is the
# field name, so the generated columns are named "<field>_<value>".
test_dummies = pd.get_dummies(test[cat_vars], columns=cat_vars)

# `test` keeps both the raw fields and the dummy columns. Stale dummy columns
# from a previous execution are dropped first so the cell can be re-run.
test = test.drop(columns=test_dummies.columns, errors='ignore').join(test_dummies)

# Scoring frame: every column except the raw categorical fields
data_final_test = test.drop(columns=cat_vars)
data_final_test.columns.values

In [None]:
# Separate the target column ('Approved') from the predictor columns.
# Both results are DataFrames; y has the single column 'Approved'.
is_target = data_final.columns == 'Approved'
X = data_final.loc[:, ~is_target]
y = data_final.loc[:, is_target]

In [None]:
#since the classes are not balanced. Creating traning data with equal volumes from each class
#Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE

# Named `smote` rather than `os` so the sampler does not shadow the
# standard-library module of that name.
smote = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

# Only the training split is oversampled. `fit_resample` is the current name of
# the method; the older `fit_sample` alias no longer exists in imbalanced-learn.
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['Approved'])

# we can Check the numbers of our data
n_oversampled = len(os_data_X)
n_rejected = len(os_data_y[os_data_y['Approved'] == 'no'])
n_approved = len(os_data_y[os_data_y['Approved'] == 'yes'])
print("length of oversampled data is ", n_oversampled)
print("Number of rejected in oversampled data", n_rejected)
print("Number of approved", n_approved)
print("Proportion of rejected data in oversampled data is ", n_rejected / n_oversampled)
print("Proportion of approved data in oversampled data is ", n_approved / n_oversampled)

In [None]:
# Lists of column NAMES (not data): y holds the target name, X every other
# column name of data_final. Both names are rebound from frames to lists here.
data_final_vars = list(data_final.columns.values)
y = ['Approved']
X = list(filter(lambda column_name: column_name not in y, data_final_vars))

In [None]:
#Recursive Feature Elimination
#The goal of RFE is to select features by recursively considering smaller and smaller sets of features
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

# Keep 20 features. The count must be passed by keyword: current scikit-learn
# no longer accepts it as a positional argument.
rfe = RFE(logreg, n_features_to_select=20)
# The target is flattened to 1-D because os_data_y is a one-column DataFrame.
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)   # boolean mask of the selected columns
print(rfe.ranking_)   # rank per column; selected columns have rank 1

In [None]:
# Names of the columns that RFE kept (support mask is True)
os_data_X.loc[:, rfe.support_].columns

In [None]:
# Make sure these dummy columns exist in the test frame, filled with 0.
# A column is only created when it is absent, so a dummy column that the test
# data really produced is never overwritten with zeros.
for dummy_name in ('Married_nol', 'Citizen_p', 'Ethnicity_z', 'BankCustomer_no'):
    if dummy_name not in test.columns:
        test[dummy_name] = 0

In [None]:
# Hard-coded list of the 20 dummy columns used for modelling
cols = [
    'Married_nol', 'Married_u', 'Married_y',
    'BankCustomer_g', 'BankCustomer_no', 'BankCustomer_p',
    'EducationLevel_aa', 'EducationLevel_c', 'EducationLevel_ff',
    'EducationLevel_i', 'EducationLevel_k', 'EducationLevel_m',
    'Ethnicity_z',
    'PriorDefault_f', 'PriorDefault_t',
    'Employed_f', 'Employed_t',
    'DriversLicense_f',
    'Citizen_g', 'Citizen_p',
]
# Same columns taken from the oversampled training data and from the test data
X = os_data_X.loc[:, cols]
x_test = test.loc[:, cols]
# Target as a Series of the oversampled labels
y = os_data_y.Approved

In [None]:
# Encode the labels: 'no' -> 0, 'yes' -> 1, anything else left unchanged.
# The result is a NumPy array.
y = np.where(y == 'no', 0, np.where(y == 'yes', 1, y))

In [None]:
# Wrap the encoded target in a one-column frame named 'Approved'
temp = pd.DataFrame(data=y, columns=['Approved'])

In [None]:
# Chi-squared test of independence between the Citizen_p dummy and the target
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Contingency table of Citizen_p against Approved, then the test itself
observed = pd.crosstab(X.Citizen_p, temp.Approved)
stat, p, dof, expected = chi2_contingency(observed)
print('\ndof=%d' % dof)

# Decision 1: compare the statistic with the critical value at prob
prob = 0.95
critical = chi2.ppf(prob, dof)
print('\nprobability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
print('\nDependent (reject H0)' if abs(stat) >= critical else '\nIndependent (fail to reject H0)')

# Decision 2: compare the p-value with the significance level 1 - prob
alpha = 1.0 - prob
print('\nsignificance=%.3f, p=%.3f' % (alpha, p))
print('\nDependent (reject H0)' if p <= alpha else '\nIndependent (fail to reject H0)')

In [None]:
#Implementing the model

import os
# NOTE(review): these proxy settings are hard-coded for one specific network and
# are written into the process environment every time the cell runs - confirm
# they are still needed before sharing the notebook.
for proxy_key, proxy_value in (
    ("http_proxy", "http://proxy.ebiz.verizon.com:80/"),
    ("https_proxy", "http://proxy.ebiz.verizon.com:80/"),
    ("no_proxy", "localhost,127.0.0.1,localaddress,.localdomain.com,verizon.com,*.verizon.com,169.254.169.254,10.0.2.2"),
):
    os.environ[proxy_key] = proxy_value

import statsmodels.api as sm
# Fit a logit model on the selected columns; both sides are cast to float
endog = pd.DataFrame(y, columns=['Approved']).astype(float)
exog = X.astype(float)
logit_model = sm.Logit(endog, exog)
result = logit_model.fit(method='bfgs')
print(result.summary2())

In [None]:
# Reduced list of 11 dummy columns; X and x_test are rebuilt from it
cols = [
    'Married_u', 'Married_y',
    'BankCustomer_g', 'BankCustomer_p',
    'EducationLevel_ff', 'EducationLevel_i', 'EducationLevel_k',
    'PriorDefault_f', 'PriorDefault_t',
    'Employed_f', 'Employed_t',
]
X = os_data_X.loc[:, cols]
x_test = test.loc[:, cols]

# Rebuild the target from the oversampled labels and encode 'no' -> 0, 'yes' -> 1
y = os_data_y.Approved
for label, code in (('no', 0), ('yes', 1)):
    y = np.where(y == label, code, y)

# Refit the logit model on the reduced column set
endog = pd.DataFrame(y, columns=['Approved']).astype(float)
logit_model = sm.Logit(endog, X.astype(float))
result = logit_model.fit(method='bfgs')
print(result.summary2())

In [None]:
# Imports for the classifier cells that follow
import re
import string

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# joblib used to be re-exported as sklearn.externals.joblib; that path has been
# removed from scikit-learn, and the resulting ImportError would abort this
# whole cell. Import the standalone package first and fall back to the old path.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

porter = PorterStemmer()

In [None]:
#support vector classifier
# Split the selected feature columns (X) and the 0/1 target (y) inside this cell.
# Otherwise the model is trained on whatever X_train/y_train an earlier cell left
# behind, which need not have the same columns as x_test or the same label type.
X_train, X_test, y_train, y_test = train_test_split(X, y.astype('int'), test_size=0.3, random_state=0)

classifier = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                 shrinking=True, probability=False, tol=0.001, cache_size=200,
                 class_weight=None, verbose=False, max_iter=-1,
                 decision_function_shape='ovo', random_state=200)
classifier.fit(X_train, y_train)

# Accuracy = fraction of matching labels. The targets are flattened so the
# comparison is element-wise between two 1-D arrays.
train_predictions = classifier.predict(X_train)
print("Train Accuracy :", np.mean(np.ravel(y_train) == train_predictions))
predictionsLR = classifier.predict(X_test)
print("Test Accuracy :", np.mean(np.ravel(y_test) == predictionsLR))

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, predictionsLR)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp), one value per class
precision = precision_score(y_test, predictionsLR, average=None)
print('Precision:', precision)
# recall: tp / (tp + fn), one value per class
recall = recall_score(y_test, predictionsLR, average=None)
print('Recall: ', recall)
# f1: 2 tp / (2 tp + fp + fn), one value per class
f1 = f1_score(y_test, predictionsLR, average=None)
print('F1 score: ', f1)

In [None]:
# Predict the test rows with the support vector classifier and store them
# in a one-column frame named 'Approved'
svc_test_predictions = classifier.predict(x_test)
temp = pd.DataFrame(data=svc_test_predictions, columns=['Approved'])

In [None]:
# Fit an L2-penalised logistic regression (C=10) on a 70/30 split of X and y
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

split_parts = train_test_split(X, y.astype('int'), test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

logreg = LogisticRegression(penalty='l2', C=10)
# fit returns the estimator, which is displayed as the cell output
logreg.fit(X_train, y_train)

In [None]:
#Predicting the test set results and calculating the accuracy
y_pred = logreg.predict(X_test)
# Report held-out and training accuracy. The second message previously said
# "test set" although it scores the training split.
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(logreg.score(X_train, y_train)))

In [None]:
# Predict the test rows with the logistic regression model and store them
# in a one-column frame named 'Approved'
logreg_test_predictions = logreg.predict(x_test)
temp = pd.DataFrame(data=logreg_test_predictions, columns=['Approved'])

In [None]:
# Write the current prediction frame (index included) to the working directory
temp.to_csv(path_or_buf="test_output.csv")

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix

# The result gets its own name. Assigning it to `confusion_matrix` would replace
# the imported function with an array for every later cell.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Per-class precision, recall, F-measure and support for the held-out split
from sklearn.metrics import classification_report

report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
#ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score with the predicted probability of class 1. The same scores feed both the
# curve and the area, so the number in the legend describes the plotted curve
# (previously the area was computed from hard 0/1 predictions).
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()