In [662]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [663]:
# Load scikit-learn's breast-cancer dataset. The returned object exposes the
# `.data` and `.target` attributes that the next cell reads.
data = load_breast_cancer()
# Bare expression: the notebook echoes the whole loaded object as the cell output.
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [664]:
# Feature matrix and label vector taken straight from the loaded dataset object.
X, y = data.data, data.target

In [665]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [666]:
# The scaler is fitted on the training split only; the test split is transformed
# with those already-fitted statistics. The tuple is evaluated left to right, so
# fit_transform() runs before transform().
scaler = StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

In [667]:
# Logistic regression with default hyperparameters, trained on the scaled
# training split.
model = LogisticRegression()
# Last expression of the cell, so the notebook also displays the fitted estimator.
model.fit(X_train, y_train)

In [668]:
# Predicted class labels for the held-out (scaled) test split.
y_pred = model.predict(X_test)

In [669]:
accuracy_score = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)

In [670]:
print(accuracy_score)
print(classification_report)
print(confusion_matrix)

0.9736842105263158
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[41  2]
 [ 1 70]]


# 타이타닉 데이터

In [671]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [672]:
# Load the 'titanic' example dataset through seaborn's dataset loader.
titanic = sns.load_dataset('titanic')
# Plain-text dump of the frame.
# NOTE(review): a bare `titanic.head()` as the last expression would use the
# notebook's rich table display instead of print()'s wrapped text.
print(titanic)

     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

       who  adult_male deck  embark_town alive  alo

In [673]:
# Keep the target column plus the seven predictors used below, then discard
# every row that has a missing value in any of the kept columns.
titanic = (
    titanic[
        [
            'survived',
            'pclass',
            'sex',
            'age',
            'sibsp',
            'parch',
            'fare',
            'embarked',
        ]
    ]
    .dropna()
)

In [674]:
# Encode the categorical predictors numerically:
#   * sex      -> 0 for 'male', 1 for 'female'
#   * embarked -> one indicator column per value (embarked_<value>)
# NOTE(review): this cell consumes the string-valued columns, so it is meant to
# run once per load of the data.
titanic = pd.get_dummies(
    titanic.assign(sex=titanic['sex'].map({'male': 0, 'female': 1})),
    columns=['embarked'],
)

In [675]:
# Show the encoded frame to check the new numeric `sex` and `embarked_*` columns.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,0,3,0,22.0,1,0,7.2500,False,False,True
1,1,1,1,38.0,1,0,71.2833,True,False,False
2,1,3,1,26.0,0,0,7.9250,False,False,True
3,1,1,1,35.0,1,0,53.1000,False,False,True
4,0,3,0,35.0,0,0,8.0500,False,False,True
...,...,...,...,...,...,...,...,...,...,...
885,0,3,1,39.0,0,5,29.1250,False,True,False
886,0,2,0,27.0,0,0,13.0000,False,False,True
887,1,1,1,19.0,0,0,30.0000,False,False,True
889,1,1,0,26.0,0,0,30.0000,True,False,False


In [676]:
# Target is the `survived` column; every remaining column is a predictor.
y = titanic['survived']
X = titanic.drop(columns='survived')

In [677]:
# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [678]:
# Fit the scaler on the training rows only and reuse it for the test rows.
# Left-to-right tuple evaluation keeps fit_transform() ahead of transform().
scaler = StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

In [679]:
# Default-parameter logistic regression trained on the scaled Titanic features.
model = LogisticRegression()
# Last expression of the cell, so the fitted estimator is also displayed.
model.fit(X_train, y_train)

In [680]:
# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)

# Results get their own names: rebinding `accuracy_score`,
# `classification_report` or `confusion_matrix` to their return values would
# overwrite the imported functions, so a second run of this cell would fail with
# "... object is not callable".
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

# Overall accuracy, per-class precision/recall/F1, then the confusion matrix.
print(test_accuracy)
print(test_report)
print(test_confusion)

0.7972027972027972
              precision    recall  f1-score   support

           0       0.77      0.91      0.83        80
           1       0.85      0.65      0.74        63

    accuracy                           0.80       143
   macro avg       0.81      0.78      0.79       143
weighted avg       0.81      0.80      0.79       143

[[73  7]
 [22 41]]


# 당뇨병 데이터셋

In [681]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [682]:
# Read the diabetes CSV; the relative path is resolved against the notebook's
# current working directory.
diabetes = pd.read_csv('diabetes_prediction_dataset.csv')
# Bare expression: display the loaded frame as the cell output.
diabetes

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [683]:
# List the distinct `gender` values before the column is one-hot encoded.
diabetes['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [684]:
# Drop rows containing any missing value, then replace the two string-valued
# columns with one indicator column per category.
diabetes = pd.get_dummies(
    diabetes.dropna(),
    columns=['smoking_history', 'gender'],
)

In [685]:
# Show the encoded frame to check the new `smoking_history_*` / `gender_*` columns.
diabetes

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,gender_Female,gender_Male,gender_Other
0,80.0,0,1,25.19,6.6,140,0,False,False,False,False,True,False,True,False,False
1,54.0,0,0,27.32,6.6,80,0,True,False,False,False,False,False,True,False,False
2,28.0,0,0,27.32,5.7,158,0,False,False,False,False,True,False,False,True,False
3,36.0,0,0,23.45,5.0,155,0,False,True,False,False,False,False,True,False,False
4,76.0,1,1,20.14,4.8,155,0,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,True,False,False,False,False,False,True,False,False
99996,2.0,0,0,17.37,6.5,100,0,True,False,False,False,False,False,True,False,False
99997,66.0,0,0,27.83,5.7,155,0,False,False,False,True,False,False,False,True,False
99998,24.0,0,0,35.42,4.0,100,0,False,False,False,False,True,False,True,False,False


In [686]:
# Target is the `diabetes` column; every remaining column is a predictor.
y = diabetes['diabetes']
X = diabetes.drop(columns='diabetes')

In [687]:
# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [688]:
# Fit the scaler on the training rows only and reuse it for the test rows.
# Left-to-right tuple evaluation keeps fit_transform() ahead of transform().
scaler = StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

In [689]:
# Inspect the scaled training matrix (a NumPy array after fit_transform).
X_train

array([[ 1.37871976e+00, -2.86309234e-01, -2.03558690e-01, ...,
        -1.18685419e+00,  1.18734314e+00, -1.41435500e-02],
       [ 1.68986587e+00, -2.86309234e-01, -2.03558690e-01, ...,
        -1.18685419e+00,  1.18734314e+00, -1.41435500e-02],
       [-1.77010751e-01, -2.86309234e-01, -2.03558690e-01, ...,
         8.42563482e-01, -8.42216517e-01, -1.41435500e-02],
       ...,
       [ 7.87021842e-04, -2.86309234e-01, -2.03558690e-01, ...,
        -1.18685419e+00,  1.18734314e+00, -1.41435500e-02],
       [-2.21460194e-01, -2.86309234e-01, -2.03558690e-01, ...,
         8.42563482e-01, -8.42216517e-01, -1.41435500e-02],
       [-8.43752400e-01, -2.86309234e-01, -2.03558690e-01, ...,
         8.42563482e-01, -8.42216517e-01, -1.41435500e-02]])

In [690]:
# Default-parameter logistic regression on the scaled diabetes features.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Results get their own names: rebinding `accuracy_score`,
# `classification_report` or `confusion_matrix` to their return values would
# overwrite the imported functions, so a second run of this cell would fail with
# "... object is not callable".
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

# Overall accuracy, per-class precision/recall/F1, then the confusion matrix.
print(test_accuracy)
print(test_report)
print(test_confusion)

0.959
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.86      0.62      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000

[[18126   166]
 [  654  1054]]
