In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

pima = pd.read_csv('datasets/diabetes.csv')

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

# X is a matrix, access the features we want in feature_cols
X = pima[feature_cols]

# y is a vector, hence we use dot to access 'label'
y = pima['Outcome']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

y_test.value_counts()

0    130
1     62
Name: Outcome, dtype: int64

In [103]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
best_fit = logreg.fit(X_train, y_train)

In [104]:
print("score: " + str(best_fit.score(X_train, y_train)))

score: 0.6961805555555556


In [105]:
y_pred = best_fit.predict(X_test)

In [106]:
np.mean(y_test == y_pred)

0.6770833333333334

In [107]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[114,  16],
       [ 46,  16]])

In [108]:
192/len(y)

0.25

In [109]:
def check(l1, l2):
    count = 0
    for i in range(1, len(l1)):
        if l1[i] == l2[i]:
            count += 1
            
    return count
check(list(y_pred), list(y_test))

130

In [110]:
y_pred_prob = logreg.predict_proba(X_test)

In [111]:
logreg.classes_

array([0, 1])

In [112]:
def thres_y_pred(l1, thres):
    temp = list()
    for elm in l1:
        if elm[1] < thres:
            temp.append(0)
        else:
            temp.append(1)
    return temp
        
y_pred = thres_y_pred(y_pred_prob, 0.5)

In [113]:
y_train.value_counts()
206/(370+206)

0.3576388888888889

In [114]:
y_pred_op = thres_y_pred(y_pred_prob, 0.3576388888888889)

In [115]:
from collections import Counter
Counter(y_pred_op)

Counter({1: 76, 0: 116})

In [116]:
confusion_matrix_1 = confusion_matrix(y_test, y_pred)
confusion_matrix_op = confusion_matrix(y_test, y_pred_op)

def get_metrics(confusion):
    acc = (confusion[0][0] + confusion[1][1])/(confusion[0][0] + confusion[1][1] + confusion[0][1] + confusion[1][0])
    
    prec = (confusion[1][1])/(confusion[1][1] + confusion[1][0])
    
    recall = (confusion[1][1])/(confusion[1][1] + confusion[0][1])
    
    clas = (confusion[0][1] + confusion[1][0])/(confusion[0][1] + confusion[1][1] + confusion[0][1] + confusion[1][0])
    
    spec = (confusion[0][0])/(confusion[0][0] + confusion[0][1])
    
    f1 = 2*((prec*recall)/(prec+recall))
    
    print("acc: " + str(acc))
    print("prec: " + str(prec))
    print("recall: " + str(recall))
    print("clas: " + str(clas))
    print("spec: " + str(spec))
    print("f1: " + str(f1))
    

get_metrics(confusion_matrix_1)

acc: 0.6770833333333334
prec: 0.25806451612903225
recall: 0.5
clas: 0.6595744680851063
spec: 0.8769230769230769
f1: 0.3404255319148936


In [117]:
get_metrics(confusion_matrix_op)

acc: 0.65625
prec: 0.5806451612903226
recall: 0.47368421052631576
clas: 0.4647887323943662
spec: 0.6923076923076923
f1: 0.5217391304347826


In [123]:
print(confusion_matrix_op)

[[90 40]
 [26 36]]


In [125]:
from sklearn.model_selection import cross_val_score 
import pandas as pd
from sklearn.linear_model import LogisticRegression

pima = pd.read_csv('datasets/diabetes.csv')

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

# X is a matrix,access the features we want in feature_cols
X = pima[feature_cols]

# y is a vector, hence we use dot to access 'label'
y = pima['Outcome']

print(y.value_counts()[0])
print(y.value_counts()[1])

logreg = LogisticRegression(class_weight={1: 500/268})
#logreg = LogisticRegression(class_weight={1: y.value_counts()[0]/y.value_counts()[1]})
# logreg = LogisticRegression()

all_accuracies = cross_val_score(estimator=logreg, X=X, y=y, cv=5, scoring='accuracy')
print(all_accuracies)
print(all_accuracies.mean())

all_f1 = cross_val_score(estimator=logreg, X=X, y=y, cv=5, scoring='f1')
print(all_f1)
print(all_f1.mean())

500
268
[0.64935065 0.65584416 0.64935065 0.70588235 0.65359477]
0.6628045157456922
[0.578125   0.55462185 0.54237288 0.64       0.576     ]
0.5782239460190857
