In [55]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
import pandas as pd
import numpy as np

def count_tp(labels):
    true_count = 0
    false_count = 0
    for label in labels:
        if label == 1:
            true_count += 1
        if label == 0:
            false_count += 1

    return true_count, false_count

df = pd.read_csv('classification.csv')
x = df.iloc[:, 5:7].to_numpy()
#y = df.iloc[:, -1].to_numpy()
y = df.iloc[:, 8:9]
y=np.ravel(y)

scaler = MinMaxScaler()
x = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=0)

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('acc:', accuracy_score(y_test, y_pred))
print('f1:', f1_score(y_test, y_pred))
print('recall:', recall_score(y_test, y_pred))
print('precision:', precision_score(y_test, y_pred))
print('confusion: ', confusion_matrix(y_test, y_pred))

acc: 0.7642857142857142
f1: 0.3529411764705882
recall: 0.23076923076923078
precision: 0.75
confusion:  [[98  3]
 [30  9]]


In [30]:
### 1) start with one-hot encoding
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,college degree,17,12,176,9.3,11.359392,5.008608,1
1,27,no high school,10,6,31,17.3,1.362202,4.000798,0
2,40,no high school,15,14,55,5.5,0.856075,2.168925,0
3,41,no high school,15,14,120,2.9,2.65872,0.82128,0
4,24,high school,2,0,28,17.3,1.787436,3.056564,1


In [31]:
df=df.replace(' ', '_', regex=True)

In [32]:
y = pd.get_dummies(df.ed, prefix='ed')
print(y.head())
type(y)
y.dtypes

   ed_college_degree  ed_high_school  ed_no_high_school  ed_postgraduate  \
0                  1               0                  0                0   
1                  0               0                  1                0   
2                  0               0                  1                0   
3                  0               0                  1                0   
4                  0               1                  0                0   

   ed_undergraduate  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0  


ed_college_degree    uint8
ed_high_school       uint8
ed_no_high_school    uint8
ed_postgraduate      uint8
ed_undergraduate     uint8
dtype: object

In [35]:
df2=df.drop("ed", axis=1)
df2=pd.concat([df2, y], axis=1)
df2.head()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,default,ed_college_degree,ed_high_school,ed_no_high_school,ed_postgraduate,ed_undergraduate
0,41,17,12,176,9.3,11.359392,5.008608,1,1,0,0,0,0
1,27,10,6,31,17.3,1.362202,4.000798,0,0,0,1,0,0
2,40,15,14,55,5.5,0.856075,2.168925,0,0,0,1,0,0
3,41,15,14,120,2.9,2.65872,0.82128,0,0,0,1,0,0
4,24,2,0,28,17.3,1.787436,3.056564,1,0,1,0,0,0


In [36]:
#### 1) to find out which component to include
corr_matr=df2.corr()
print(corr_matr)

                        age    employ   address    income   debtinc  creddebt  \
age                1.000000  0.536497  0.597591  0.478710  0.016398  0.295207   
employ             0.536497  1.000000  0.322334  0.619681 -0.031182  0.403694   
address            0.597591  0.322334  1.000000  0.316245  0.011323  0.208435   
income             0.478710  0.619681  0.316245  1.000000 -0.026777  0.570199   
debtinc            0.016398 -0.031182  0.011323 -0.026777  1.000000  0.501767   
creddebt           0.295207  0.403694  0.208435  0.570199  0.501767  1.000000   
othdebt            0.340217  0.406091  0.226514  0.610659  0.584870  0.633104   
default           -0.137657 -0.282978 -0.164451 -0.070970  0.389575  0.244740   
ed_college_degree  0.051031 -0.029161  0.058899  0.149451  0.032199  0.110548   
ed_high_school    -0.059646 -0.073388  0.001322  0.014392 -0.034308 -0.038828   
ed_no_high_school  0.015432  0.146061 -0.047697 -0.178107  0.005693 -0.056229   
ed_postgraduate    0.082148 

In [47]:
x_2 = df2.iloc[:, [2,5,6]].to_numpy()
y_2 = df2.iloc[:, 8:9]
y_2=np.ravel(y_2)
print(y_2)
y_2.shape
x_2.shape

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1
 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 1 0 

(700, 3)

In [48]:
scaler = MinMaxScaler()
x_2 = scaler.fit_transform(x_2)

x_train2, x_test2, y_train2, y_test2 = train_test_split(x_2, y_2, test_size=.2, random_state=0)

In [49]:
model2 = LogisticRegression()
model2.fit(x_train2, y_train2)
y_pred2 = model2.predict(x_test2)

In [50]:
print('acc:', accuracy_score(y_test2, y_pred2))
print('f1:', f1_score(y_test2, y_pred2))
print('recall:', recall_score(y_test2, y_pred2))
print('precision:', precision_score(y_test2, y_pred2))
print('confusion: ', confusion_matrix(y_test2, y_pred2))

acc: 0.8857142857142857
f1: 0.0
recall: 0.0
precision: 0.0
confusion:  [[124   0]
 [ 16   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
x_3 = df2.iloc[:, [2,5,6,11]].to_numpy()

x_3.shape

scaler = MinMaxScaler()
x_3 = scaler.fit_transform(x_3)

x_train3, x_test3, y_train3, y_test3 = train_test_split(x_3, y_2, test_size=.2, random_state=0)

model3 = LogisticRegression()
model3.fit(x_train3, y_train3)
model3.get_params()
y_pred3 = model3.predict(x_test3)

print('acc:', accuracy_score(y_test3, y_pred3))
print('f1:', f1_score(y_test3, y_pred3))
print('recall:', recall_score(y_test3, y_pred3))
print('precision:', precision_score(y_test3, y_pred3))
print('confusion: ', confusion_matrix(y_test3, y_pred3))


acc: 0.8857142857142857
f1: 0.0
recall: 0.0
precision: 0.0
confusion:  [[124   0]
 [ 16   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
y.shape

(700,)

In [57]:
### 2) use cross validation in the first model
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0).fit(x, y)
clf.predict(x[:2, :])
clf.predict_proba(x[:2, :]).shape
clf.score(x, y)

0.7714285714285715

In [63]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0).fit(x_2, y_2)
predicted_2=clf.predict(x_2)
clf.predict_proba(x_2).shape
clf.score(x_2, y_2)

0.8785714285714286

In [64]:
print('acc:', accuracy_score(y_2, predicted_2))


acc: 0.8785714285714286
