# **Logistic Regression** — Michael Julian Peter

***1. Import library***

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import neighbors
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression, SGDRegressor, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

***2. Import dan load dataset breast cancer***

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast-cancer bunch and assemble a single frame: every feature
# column followed by one trailing 'target' column.
data = load_breast_cancer()
df = pd.DataFrame(
    np.column_stack((data['data'], data['target'])),
    columns=[*data['feature_names'], 'target'],
)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


***3. Membagi data menjadi X dan Y***

In [None]:
# Feature matrix and label vector, taken directly from the loaded bunch.
X, Y = data.data, data.target

***4. Membagi dataset menjadi 5 bagian untuk training dan test***

In [None]:
# Five folds, taken in order without shuffling.
kfold = KFold(
    shuffle=False,
    random_state=None,
    n_splits=5,
)

***5. Training dataset***

In [None]:
# Logistic regression fitted with the 'sag' solver. SAG is only guaranteed to
# converge quickly when the features share a similar scale, so standardisation
# is chained in front of the classifier. Keeping the scaler inside the pipeline
# means it is re-fitted on the training part of every split and never sees the
# held-out rows.
log_rgr = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty='l2',
        C=100,
        solver='sag',
        max_iter=1000,            # extra headroom: weak regularisation (C=100) slows sag down
        class_weight='balanced',
        random_state=42,          # sag is stochastic; fix the seed for repeatable results
    ),
)

# Hold out the last of the five folds as the test set; the remaining four form
# the training set. (A loop over every split would only keep the last one anyway.)
train_index, test_index = list(kfold.split(X, Y))[-1]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = Y[train_index], Y[test_index]

In [None]:
# Silence only the optimiser's convergence messages. A blanket
# filterwarnings('ignore') would also hide deprecation and data-quality
# warnings for the rest of the session.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Cross-validate on the training portion only, reusing the same 5-fold splitter.
results = cross_val_score(log_rgr, X_train, y_train, cv=kfold)
print("Accuracy: ", results, '\n')
print("Mean accuracy: %.3f (+/- %.3f)" % (results.mean(), results.std()))

Accuracy:  [0.89130435 0.95604396 0.91208791 0.9010989  0.94505495] 



In [None]:
# Train on the training fold. scikit-learn's fit() returns the estimator
# itself, so log_rgr_fit is simply another name for the fitted log_rgr.
log_rgr.fit(X_train, y_train)
log_rgr_fit = log_rgr

# Predict the held-out fold and peek at positions 1 through 5.
predictions = log_rgr_fit.predict(X_test)
predictions[1:6]

array([1, 1, 1, 0, 0])

In [None]:
# Map numeric predictions to letter codes: values below 0.5 become "M",
# everything else becomes "B" (presumably malignant / benign — confirm against
# data.target_names).
predictions_nominal = np.where(predictions < 0.5, "M", "B").tolist()
predictions_nominal[1:6]

['B', 'B', 'B', 'M', 'M']

In [None]:
# Confusion matrix for the hold-out predictions: rows are true labels, columns
# are predicted labels. Pinning labels=[0, 1] keeps the matrix 2x2 even when
# one class is missing from this fold, so the unpacking below cannot fail.
# Label 1 plays the role of the "positive" class in the counts printed here.
cfm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Row-major order of a binary confusion matrix: tn, fp, fn, tp.
true_negative, false_positive, false_negative, true_positive = cfm.ravel()

print('Confusion Matrix: \n', cfm, '\n')

print('True Negative:', true_negative)
print('False Positive:', false_positive)
print('False Negative:', false_negative)
print('True Positive:', true_positive)
# The denominator comes from the matrix itself rather than from
# predictions_nominal, which belongs to another cell and may be stale.
print('Correct Predictions',
      round((true_negative + true_positive) / cfm.sum() * 100, 1), '%')

Confusion Matrix: 
 [[22  4]
 [ 7 80]] 

True Negative: 22
False Positive: 4
False Negative: 7
True Positive: 80
Correct Predictions 90.3 %


# **SGD Classifier**

In [None]:
# Silence only convergence messages instead of every warning.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Logistic regression trained by stochastic gradient descent.
# - loss='log_loss' is the current name of the logistic loss; the old alias
#   'log' has been removed from scikit-learn (requires scikit-learn >= 1.1).
# - SGD is sensitive to feature scale, so the inputs are standardised inside
#   the pipeline (the scaler is fitted on training rows only).
# - random_state pins the sample shuffling so repeated runs agree.
clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(loss='log_loss', max_iter=50000, random_state=42),
)
clf.fit(X_train, y_train)
y_predSGD = clf.predict(X_test)

In [None]:
# Silence only convergence messages instead of every warning.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Cross-validate on the training portion with the shared 5-fold splitter, the
# same protocol used for the logistic-regression model. Scoring on the small
# held-out fold instead would train on test data and give very noisy
# per-fold accuracies.
results = cross_val_score(clf, X_train, y_train, cv=kfold)
print("Accuracy: ", results, '\n')

Accuracy:  [0.95652174 0.65217391 0.56521739 0.90909091 0.90909091] 



In [None]:
# Show the SGD predictions at positions 1 through 5 (the slice skips index 0).
y_predSGD[1:6]

array([1, 1, 1, 0, 0])

In [None]:
# Map the SGD predictions to letter codes: values below 0.5 become "M",
# everything else becomes "B". Note that this reuses (overwrites) the
# predictions_nominal name.
predictions_nominal = np.where(y_predSGD < 0.5, "M", "B").tolist()
predictions_nominal[1:6]

['B', 'B', 'B', 'M', 'M']

In [None]:
# Confusion matrix for the SGD hold-out predictions: rows are true labels,
# columns are predicted labels. Pinning labels=[0, 1] keeps the matrix 2x2
# even when one class is missing from this fold, so the unpacking below cannot
# fail. Label 1 plays the role of the "positive" class in the counts printed.
cfm = confusion_matrix(y_test, y_predSGD, labels=[0, 1])

# Row-major order of a binary confusion matrix: tn, fp, fn, tp.
true_negative, false_positive, false_negative, true_positive = cfm.ravel()

print('Confusion Matrix: \n', cfm, '\n')

print('True Negative:', true_negative)
print('False Positive:', false_positive)
print('False Negative:', false_negative)
print('True Positive:', true_positive)
# The denominator comes from the matrix itself rather than from
# predictions_nominal, which belongs to another cell and may be stale.
print('Correct Predictions',
      round((true_negative + true_positive) / cfm.sum() * 100, 1), '%')

Confusion Matrix: 
 [[20  6]
 [ 1 86]] 

True Negative: 20
False Positive: 6
False Negative: 1
True Positive: 86
Correct Predictions 93.8 %
