# Logistic Regression

In [81]:
import warnings
warnings.filterwarnings('ignore')

In [45]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)

In [48]:
clf.predict(X[:2, :])

array([0, 0])

In [59]:
for x in clf.predict_proba(X[:2, :]):
    print([round(y,2) for y in x])

[0.98, 0.02, 0.0]
[0.97, 0.03, 0.0]


In [50]:
clf.score(X, y)

0.9733333333333334

# Confusion Matrix

In [19]:
from sklearn.metrics import confusion_matrix

In [24]:
confusion_matrix(y, clf.predict(X))

array([[50,  0,  0],
       [ 0, 47,  3],
       [ 0,  1, 49]], dtype=int64)

In [22]:
confusion_matrix(y, clf.predict(X), normalize='true')

array([[1.  , 0.  , 0.  ],
       [0.  , 0.94, 0.06],
       [0.  , 0.02, 0.98]])

# 1. Solve classification problem using 'classification.csv' dataset

#### target variable is 'default'. Apply feature selection, feature scaling, cross validation etc. (anything you think is needed)

In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression


In [113]:
df = pd.read_csv('classification.csv')
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,college degree,17,12,176,9.3,11.359392,5.008608,1
1,27,no high school,10,6,31,17.3,1.362202,4.000798,0
2,40,no high school,15,14,55,5.5,0.856075,2.168925,0
3,41,no high school,15,14,120,2.9,2.65872,0.82128,0
4,24,high school,2,0,28,17.3,1.787436,3.056564,1


In [114]:
len(df)

700

In [331]:
df['ed'].unique()

array(['college degree', 'no high school', 'high school', 'undergraduate',
       'postgraduate'], dtype=object)

In [132]:
X=pd.get_dummies(df[df.columns.difference(['default'])])
y=df['default']
y.value_counts()

0    517
1    183
Name: default, dtype: int64

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(560, 12) (560,)
(140, 12) (140,)


In [582]:
logreg = LogisticRegression(random_state=0)
clf=logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

In [583]:
for x in logreg.predict_proba(X_test[:5]):
    print([round(y,2) for y in x])

[1.0, 0.0]
[0.91, 0.09]
[0.95, 0.05]
[0.66, 0.34]
[0.87, 0.13]


In [584]:
y_pred[:5]

array([0, 0, 0, 0, 0])

In [585]:
clf.score(X_test,y_test)

0.8142857142857143

In [586]:
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix

array([[96,  7],
       [19, 18]])

In [587]:
cnf_matrix_train = confusion_matrix(y_train, y_pred_train)
cnf_matrix_train

array([[387,  27],
       [ 74,  72]])

In [588]:
confusion_matrix(y_test, y_pred, normalize='true')

array([[0.93203883, 0.06796117],
       [0.51351351, 0.48648649]])

#### Cross Validation

In [589]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
cv_accuracy=[]
cv_precision=[]
cv_recall=[]

for train_index, test_index in kf.split(X_train):
    print('TRAIN:', train_index, 'TEST:', test_index)
    xTrain, xTest = X_train.iloc[train_index], X_train.iloc[test_index]    
    yTrain, yTest = y_train.iloc[train_index], y_train.iloc[test_index]
    
    logreg = LogisticRegression(random_state=0).fit(xTrain,yTrain)
    y_pred_cv=logreg.predict(xTest)

    cv_accuracy.append(logreg.score(xTest,yTest))
    cv_precision.append(metrics.precision_score(yTest, y_pred_cv))
    cv_recall.append(metrics.recall_score(yTest, y_pred_cv))

TRAIN: [ 56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253
 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
 290 291 292 293 294 295 296 297 298 299 300

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 280 281 282 283 284 285 286 287 288 289
 290 291 292 293 294 295 296 297 298 299 300

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244

In [590]:
print("Avg accuracy:", np.mean(cv_accuracy))
print("Avg precision:", np.mean(cv_precision))
print("Avg recall:", np.mean(cv_recall))

Avg accuracy: 0.8035714285714285
Avg precision: 0.6914277389277389
Avg recall: 0.4910409803773648


In [591]:
clf=logreg.fit(X_train,y_train)
print("Accuracy:", clf.score(X_test,y_test))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.8142857142857143
Precision: 0.72
Recall: 0.4864864864864865


In [592]:
df.groupby('default').corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,employ,address,income,debtinc,creddebt,othdebt
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,age,1.0,0.532608,0.587204,0.518088,0.043784,0.33962,0.365117
0,employ,0.532608,1.0,0.284661,0.665022,0.019285,0.458617,0.459768
0,address,0.587204,0.284661,1.0,0.306909,0.047191,0.228394,0.232651
0,income,0.518088,0.665022,0.306909,1.0,-0.038482,0.535743,0.602849
0,debtinc,0.043784,0.019285,0.047191,-0.038482,1.0,0.491428,0.608517
0,creddebt,0.33962,0.458617,0.228394,0.535743,0.491428,1.0,0.565681
0,othdebt,0.365117,0.459768,0.232651,0.602849,0.608517,0.565681,1.0
1,age,1.0,0.512361,0.608171,0.388921,0.138336,0.393424,0.383516
1,employ,0.512361,1.0,0.319178,0.562821,0.268381,0.741251,0.556945
1,address,0.608171,0.319178,1.0,0.3345,0.175797,0.38392,0.340049


In [593]:
df.corr()['default']

age        -0.137657
employ     -0.282978
address    -0.164451
income     -0.070970
debtinc     0.389575
creddebt    0.244740
othdebt     0.145713
default     1.000000
Name: default, dtype: float64

In [594]:
#there is a strong relationship between income, creddebt and otherdebt, I want to try to remove income, since it is the least correlated feature with the target variable

In [595]:
X=pd.get_dummies(df[df.columns.difference(['default', 'income', 'age'])])
y=df['default']
y.value_counts()

0    517
1    183
Name: default, dtype: int64

In [596]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(560, 10) (560,)
(140, 10) (140,)


In [597]:
logreg = LogisticRegression(random_state=0)
clf=logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

In [598]:
confusion_matrix(y_test, y_pred)

array([[96,  4],
       [24, 16]])

In [599]:
confusion_matrix(y_test, y_pred, normalize='true')

array([[0.96, 0.04],
       [0.6 , 0.4 ]])

In [600]:
print("Accuracy:", clf.score(X_test,y_test))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.8
Precision: 0.8
Recall: 0.4


In [601]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
cv_accuracy=[]
cv_precision=[]
cv_recall=[]

for train_index, test_index in kf.split(X_train):
    print('TRAIN:', train_index, 'TEST:', test_index)
    xTrain, xTest = X_train.iloc[train_index], X_train.iloc[test_index]    
    yTrain, yTest = y_train.iloc[train_index], y_train.iloc[test_index]
    
    logreg = LogisticRegression(random_state=0).fit(xTrain,yTrain)
    y_pred_cv=logreg.predict(xTest)

    cv_accuracy.append(logreg.score(xTest,yTest))
    cv_precision.append(metrics.precision_score(yTest, y_pred_cv))
    cv_recall.append(metrics.recall_score(yTest, y_pred_cv))

TRAIN: [ 56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253
 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
 290 291 292 293 294 295 296 297 298 299 300

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244

In [602]:
print("Avg accuracy:", np.mean(cv_accuracy))
print("Avg precision:", np.mean(cv_precision))
print("Avg recall:", np.mean(cv_recall))

Avg accuracy: 0.8142857142857144
Avg precision: 0.6850471097529921
Avg recall: 0.5188911088911088


# 2. Print accuracy, confusion matrix, precision, recall, sensitivity and specifity on train and test (and maybe validation) datasets.

##### do not use any libraries for metrics, implement yourself

In [603]:
print(y_test.shape,y_pred.shape)

(140,) (140,)


In [604]:
type(y_test)
y_test=np.asarray(y_test)
y_pred=np.asarray(y_pred)

In [619]:
tp = 0
fp = 0
tn = 0
fn = 0

for i in range(len(y_pred)): 
    if y_test[i]==y_pred[i]==1:
        tp += 1
    elif y_pred[i]==1 and y_test[i]!=y_pred[i]:
        fp += 1
    elif y_test[i]==y_pred[i]==0:
        tn += 1
    elif y_pred[i]==0 and y_test[i]!=y_pred[i]:
        fn += 1
print('True Positive:', tp, '\n' 'False Positive:', fp, '\n' 'True Negative:', tn, '\n' 'False Negative:', fn)

test_conf_matrix=np.array([[tp,fn],[fp,tn]])
print('\n' 'Confusion matrix on test \n', test_conf_matrix)

True Positive: 16 
False Positive: 4 
True Negative: 96 
False Negative: 24

Confusion matrix on test 
 [[16 24]
 [ 4 96]]


In [606]:
Precision = tp/(tp+fp)
Recall = tp/(tp+fn)
Sensitivity = tp/(tp+fn)
Specificity = tn/(tn+fp)
Accuracy = (tp+tn)/(tp+fp+tn+fn)

print('Accuracy',Accuracy, '\n' 'Precision:', Precision, '\n' 'Recall:', Recall, '\n' 'Sensitivity:', Sensitivity, '\n' 'Specificity:', Specificity)

Accuracy 0.8 
Precision: 0.8 
Recall: 0.4 
Sensitivity: 0.4 
Specificity: 0.96


In [624]:
clf=logreg.fit(X_train,y_train)
y_pred_train=clf.predict(X_train)
y_train=np.asarray(y_train)
y_pred_train=np.asarray(y_pred_train)

In [625]:
tp_train = 0
fp_train = 0
tn_train = 0
fn_train = 0

for i in range(len(y_pred_train)): 
    if y_train[i]==y_pred_train[i]==1:
        tp_train += 1
    elif y_pred_train[i]==1 and y_train[i]!=y_pred_train[i]:
        fp_train += 1
    elif y_train[i]==y_pred_train[i]==0:
        tn_train += 1
    elif y_pred_train[i]==0 and y_train[i]!=y_pred_train[i]:
        fn_train += 1
print('True Positive:', tp_train, '\n' 'False Positive:', fp_train, '\n' 'True Negative:', tn_train, '\n' 'False Negative:', fn_train)

train_conf_matrix=np.array([[tp_train,fn_train],[fp_train,tn_train]])
print('\n' 'Confusion matrix on train \n', train_conf_matrix)

True Positive: 74 
False Positive: 32 
True Negative: 385 
False Negative: 69

Confusion matrix on train 
 [[ 74  69]
 [ 32 385]]


In [626]:
Precision_train = tp_train/(tp_train+fp_train)
Recall_train = tp_train/(tp_train+fn_train)
Sensitivity_train = tp_train/(tp_train+fn_train)
Specificity_train = tn_train/(tn_train+fp_train)
Accuracy_train = (tp_train+tn_train)/(tp_train+fp_train+tn_train+fn_train)

print('Accuracy on train',Accuracy_train, '\n' 'Precision on train:', Precision_train, '\n' 'Recall on train:', Recall_train, '\n' 'Sensitivity on train:', Sensitivity_train, '\n' 'Specificity on train:', Specificity_train)

Accuracy on train 0.8196428571428571 
Precision on train: 0.6981132075471698 
Recall on train: 0.5174825174825175 
Sensitivity on train: 0.5174825174825175 
Specificity on train: 0.9232613908872902
