In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 로지스틱 회귀

In [3]:
# Load the scikit-learn breast cancer dataset and pull out features and labels.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
# Key access is interchangeable with attribute access (cancer.data / cancer.target).
X, Y = cancer["data"], cancer["target"]

In [14]:
# Split the data into training and test parts, stratifying on the labels.
from sklearn.model_selection import train_test_split
xtr, xte, ytr, yte = train_test_split(X, Y, stratify=Y, random_state=100)
# Show the shape of each of the four resulting arrays.
tuple(part.shape for part in (xtr, xte, ytr, yte))

((426, 30), (143, 30), (426,), (143,))

In [12]:
# Proportion of each class in the full label vector
pd.Series(cancer["target"]).value_counts(normalize=True)

1    0.627417
0    0.372583
dtype: float64

In [15]:
# Proportion of each class in the test labels, for comparison with the full dataset
pd.Series(data=yte).value_counts(normalize=True)

1    0.629371
0    0.370629
dtype: float64

In [18]:
# Create the model: fit a logistic regression on the training split,
# then show its score on the training and test splits.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=10000)
model.fit(xtr, ytr)
(model.score(xtr, ytr), model.score(xte, yte))

(0.9741784037558685, 0.9440559440559441)

# 타이타닉 생존자 예측

In [22]:
# Print the names of the datasets seaborn can load, then load "titanic".
print(sns.get_dataset_names())
ti = sns.load_dataset(name="titanic")
ti

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [23]:
# Missing-value handling: missing values cause an error during training,
# so first count how many each column has.
ti.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [26]:
# Fill missing ages with the mean age, rounded to one decimal place.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on ti["age"]: that in-place call operates on an
# intermediate Series, which recent pandas versions warn about and which
# leaves ti unchanged when Copy-on-Write is enabled.
ti["age"] = ti["age"].fillna(round(ti["age"].mean(), 1))

In [52]:
# Show the rows whose embarked value is missing
ti.loc[ti["embarked"].isna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
61,1,1,female,38.0,0,0,80.0,,First,woman,False,B,,yes,True
829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


In [57]:
# Frequency of each embarked value
ti.loc[:, "embarked"].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [58]:
# Fill the missing embarked values with "S", the most frequent value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on ti["embarked"]: that in-place call operates on an
# intermediate Series, which recent pandas versions warn about and which
# leaves ti unchanged when Copy-on-Write is enabled.
ti["embarked"] = ti["embarked"].fillna("S")

In [59]:
# Choose the dependent variable (Y) and the independent variables (X).
Y = ti.loc[:, "survived"]  # values are 1 / 0
# Alternative feature set that leaves out sex:
# X = ti.loc[ :, 'pclass':"fare"].drop(columns="sex")
# Take the columns from pclass through embarked and one-hot encode
# the non-numeric ones (sex, embarked).
X = ti.loc[:, "pclass":"embarked"].pipe(pd.get_dummies)
X

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,1,0,0,1
887,1,19.0,0,0,30.0000,1,0,0,0,1
888,3,29.7,1,2,23.4500,1,0,0,0,1
889,1,26.0,0,0,30.0000,0,1,1,0,0


In [60]:
# Split into training and test data, stratifying on the target.
from sklearn.model_selection import train_test_split
xtr, xte, ytr, yte = train_test_split(X, Y, stratify=Y, random_state=0)

In [61]:
# Model: fit a logistic regression on the training split,
# then show its score on the training and test splits.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=10000)
model.fit(xtr, ytr)
(model.score(xtr, ytr), model.score(xte, yte))

(0.8038922155688623, 0.7937219730941704)

In [62]:
# With the sex column excluded, the (train, test) scores are (0.7125748502994012, 0.6860986547085202)