# Logistic Regression

In [40]:
import warnings
warnings.filterwarnings('ignore')

In [50]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)

In [42]:
clf.predict(X[:2, :])

array([0, 0])

In [43]:
clf.predict_proba(X[:2, :])

array([[8.78030305e-01, 1.21958900e-01, 1.07949250e-05],
       [7.97058292e-01, 2.02911413e-01, 3.02949242e-05]])

In [44]:
clf.score(X, y)

0.96

# Confusion Matrix

In [45]:
from sklearn.metrics import confusion_matrix

In [46]:
confusion_matrix(y, clf.predict(X))

array([[50,  0,  0],
       [ 0, 45,  5],
       [ 0,  1, 49]], dtype=int64)

In [22]:
confusion_matrix(y, clf.predict(X), normalize='true')

array([[1.  , 0.  , 0.  ],
       [0.  , 0.94, 0.06],
       [0.  , 0.02, 0.98]])

In [None]:
i

# 1. Solve classification problem using 'classification.csv' dataset

##### target variable is 'default'. Apply feature selection, feature scaling, cross validation etc. (anything you think is needed)

In [88]:
import pandas as pd

In [89]:
data=pd.read_csv('classification.csv')

In [90]:
data.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,college degree,17,12,176,9.3,11.359392,5.008608,1
1,27,no high school,10,6,31,17.3,1.362202,4.000798,0
2,40,no high school,15,14,55,5.5,0.856075,2.168925,0
3,41,no high school,15,14,120,2.9,2.65872,0.82128,0
4,24,high school,2,0,28,17.3,1.787436,3.056564,1


In [91]:
data.corr()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,default
age,1.0,0.536497,0.597591,0.47871,0.016398,0.295207,0.340217,-0.137657
employ,0.536497,1.0,0.322334,0.619681,-0.031182,0.403694,0.406091,-0.282978
address,0.597591,0.322334,1.0,0.316245,0.011323,0.208435,0.226514,-0.164451
income,0.47871,0.619681,0.316245,1.0,-0.026777,0.570199,0.610659,-0.07097
debtinc,0.016398,-0.031182,0.011323,-0.026777,1.0,0.501767,0.58487,0.389575
creddebt,0.295207,0.403694,0.208435,0.570199,0.501767,1.0,0.633104,0.24474
othdebt,0.340217,0.406091,0.226514,0.610659,0.58487,0.633104,1.0,0.145713
default,-0.137657,-0.282978,-0.164451,-0.07097,0.389575,0.24474,0.145713,1.0


In [14]:
len(data)

700

In [92]:
print(list(data.columns))

['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt', 'othdebt', 'default']


In [93]:
#data1=pd.get_dummies(data[['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt', 'othdebt']])
data1=pd.get_dummies(data.drop('default',axis=1))

In [94]:
data1.head()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,ed_college degree,ed_high school,ed_no high school,ed_postgraduate,ed_undergraduate
0,41,17,12,176,9.3,11.359392,5.008608,1,0,0,0,0
1,27,10,6,31,17.3,1.362202,4.000798,0,0,1,0,0
2,40,15,14,55,5.5,0.856075,2.168925,0,0,1,0,0
3,41,15,14,120,2.9,2.65872,0.82128,0,0,1,0,0
4,24,2,0,28,17.3,1.787436,3.056564,0,1,0,0,0


In [95]:
from sklearn.preprocessing import MinMaxScaler
norm=MinMaxScaler().fit(data1)
data2=pd.DataFrame(columns=data1.columns, data=norm.transform(data1))

In [96]:
data2.head()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,ed_college degree,ed_high school,ed_no high school,ed_postgraduate,ed_undergraduate
0,0.583333,0.548387,0.352941,0.375,0.217604,0.55221,0.183897,1.0,0.0,0.0,0.0,0.0
1,0.194444,0.322581,0.176471,0.039352,0.413203,0.065719,0.146554,0.0,0.0,1.0,0.0,0.0
2,0.555556,0.483871,0.411765,0.094907,0.124694,0.04109,0.078677,0.0,0.0,1.0,0.0,0.0
3,0.583333,0.483871,0.411765,0.24537,0.061125,0.128811,0.028742,0.0,0.0,1.0,0.0,0.0
4,0.111111,0.064516,0.0,0.032407,0.413203,0.086412,0.111567,0.0,1.0,0.0,0.0,0.0


In [97]:
target=data['default']
data2

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,ed_college degree,ed_high school,ed_no high school,ed_postgraduate,ed_undergraduate
0,0.583333,0.548387,0.352941,0.375000,0.217604,0.552210,0.183897,1.0,0.0,0.0,0.0,0.0
1,0.194444,0.322581,0.176471,0.039352,0.413203,0.065719,0.146554,0.0,0.0,1.0,0.0,0.0
2,0.555556,0.483871,0.411765,0.094907,0.124694,0.041090,0.078677,0.0,0.0,1.0,0.0,0.0
3,0.583333,0.483871,0.411765,0.245370,0.061125,0.128811,0.028742,0.0,0.0,1.0,0.0,0.0
4,0.111111,0.064516,0.000000,0.032407,0.413203,0.086412,0.111567,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.444444,0.193548,0.441176,0.030093,0.102689,0.012183,0.034621,0.0,1.0,0.0,0.0,0.0
696,0.250000,0.193548,0.117647,0.016204,0.271394,0.017411,0.074104,0.0,1.0,0.0,0.0,0.0
697,0.361111,0.483871,0.088235,0.041667,0.176039,0.023337,0.070222,0.0,0.0,1.0,0.0,0.0
698,0.694444,0.612903,0.647059,0.145833,0.195599,0.111482,0.152653,0.0,0.0,1.0,0.0,0.0


In [98]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

In [105]:
xTrain,xTest,yTrain,yTest=train_test_split(data2,target,random_state=0,test_size=0.2)
clf = LogisticRegression(random_state=0).fit(xTrain, yTrain)
clf.score(data2, target)

0.7942857142857143

In [100]:
accuracy_score(yTrain,clf.predict(xTrain))

0.8035714285714286

In [101]:
accuracy_score(yTest,clf.predict(xTest))

0.7571428571428571

In [111]:
?accuracy_score

In [110]:
?clf.score

## Using KFold

In [116]:
kf = KFold(n_splits=10) 
score_list=[]

for train_index, test_index in kf.split(data2):
    #print('TRAIN:', train_index, 'TEST:', test_index)
    X_train, X_test = data2.iloc[train_index], data2.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]
    clf.fit(X_train, y_train)
    score_list.append(clf.score(data2, target))
print("score list mean" ,np.mean(test_list))
print(score_list)

score list mean 0.7885714285714286
[0.7971428571428572, 0.7957142857142857, 0.79, 0.7942857142857143, 0.7957142857142857, 0.7928571428571428, 0.7914285714285715, 0.7942857142857143, 0.7971428571428572, 0.7971428571428572]


In [121]:
kf = KFold(n_splits=10) 
test_list=[]
train_list=[]
for train_index, test_index in kf.split(data2):
    #print('TRAIN:', train_index, 'TEST:', test_index)
    X_train, X_test = data2.iloc[train_index], data2.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]
    clf.fit(X_train, y_train)
    predictions1 = clf.predict(X_test)
    predictions2 = clf.predict(X_train)
    test_list.append(accuracy_score(y_test,predictions1))
    train_list.append(accuracy_score(y_train,predictions2))
print("accuracy_score test" ,np.mean(test_list))
print("accuracy_score train" ,np.mean(train_list))

accuracy_score test 0.7885714285714286
accuracy_score train 0.7952380952380953


In [87]:
confusion_matrix(target, clf.predict(data2))

array([[493,  24],
       [118,  65]], dtype=int64)

# 2. Print accuracy, confusion matrix, precision, recall, sensitivity and specifity on train and test (and maybe validation) datasets.

##### do not use any libraries for metrics, implement yourself

In [129]:
xTrain.head()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,ed_college degree,ed_high school,ed_no high school,ed_postgraduate,ed_undergraduate
45,0.027778,0.0,0.029412,0.00463,0.156479,0.006737,0.033062,0.0,1.0,0.0,0.0,0.0
285,0.416667,0.290323,0.470588,0.037037,0.176039,0.031052,0.058716,0.0,1.0,0.0,0.0,0.0
62,0.472222,0.129032,0.0,0.020833,0.293399,0.041761,0.071756,0.0,0.0,1.0,0.0,0.0
386,0.166667,0.225806,0.088235,0.018519,0.114914,0.009586,0.032152,0.0,0.0,1.0,0.0,0.0
668,0.416667,0.225806,0.411765,0.034722,0.110024,0.012293,0.04117,0.0,0.0,1.0,0.0,0.0


In [143]:
pred_train=clf.predict(xTrain)

In [144]:
actual_train=np.array(yTrain)

In [153]:
def measure(actual, pred):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(pred)): 
        if actual[i]==pred[i]==0:
           TP += 1
        if pred[i]==0 and actual[i]!=pred[i]:
           FP += 1
        if actual[i]==pred[i]==1:
           TN += 1
        if pred[i]==1 and actual[i]!=pred[i]:
           FN += 1

    print('TP:',TP,'FP:',FP,    'TN:',TN,   'FN:', FN)

In [173]:
def score(TP,FP,TN,FN):
    sensitivity=TP/(TP+FN)
    specifity=TN/(TN+FP)
    precision=TP/(TP+FP)
    negative_precision=TN/(FN+TN)
    accuracy=(TN+TP)/(TN+TP+FN+FP)
    print('sensitivity',sensitivity)
    print('specifity',specifity)
    print('precision',precision)
    print('negative_precision',negative_precision)
    print('accuracy',accuracy)
    
    

In [205]:
def matrix(TP,FN,FP,TN):
    a=np.array([TP,FN,FP,TN])
    print(a.reshape(2,2))

### train datasets

In [206]:
measure(actual_train,pred_train)

TP: 396 FP: 92 TN: 52 FN: 20


In [207]:
score(396,92,52,20)

sensitivity 0.9519230769230769
specifity 0.3611111111111111
precision 0.8114754098360656
negative_precision 0.7222222222222222
accuracy 0.8


In [208]:
matrix(396,92,52,20)

[[396  92]
 [ 52  20]]


### test datasets

In [209]:
actual_test=np.array(yTrain)
pred_test=clf.predict(xTest)

In [210]:
measure(actual_test,pred_test)

TP: 92 FP: 31 TN: 4 FN: 13


In [211]:
score(92,31,4,13)

sensitivity 0.8761904761904762
specifity 0.11428571428571428
precision 0.7479674796747967
negative_precision 0.23529411764705882
accuracy 0.6857142857142857


In [212]:
matrix(92,31,4,13)

[[92 31]
 [ 4 13]]
