## 분류 모델 사용

## 데이터 수집

In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl # 한글 폰트 설정 (NanumGothic) 
mpl.rcParams['font.family'] = 'NanumGothic'
mpl.rcParams['axes.unicode_minus'] = False  # 마이너스 기호 깨짐 방지
import seaborn as sns
import pandas as pd
import numpy as np 
import scipy.stats as stats
from sklearn import datasets 
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

In [2]:
data_df = pd.read_csv('../../datasets/titanic_disaster_train.csv')
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 데이터 분석

In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
data_df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

## 데이터 전처리 

### 결측치 처리

In [5]:
data_df.dropna(inplace=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, 1 to 889
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  183 non-null    int64  
 1   Survived     183 non-null    int64  
 2   Pclass       183 non-null    int64  
 3   Name         183 non-null    object 
 4   Sex          183 non-null    object 
 5   Age          183 non-null    float64
 6   SibSp        183 non-null    int64  
 7   Parch        183 non-null    int64  
 8   Ticket       183 non-null    object 
 9   Fare         183 non-null    float64
 10  Cabin        183 non-null    object 
 11  Embarked     183 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 18.6+ KB


In [6]:
data_df['Survived'].value_counts()

Survived
1    123
0     60
Name: count, dtype: int64

### 이상치 처리

### 범주형 처리
- OneHotEncoding 처리

In [7]:
data_df['Pclass'].value_counts()

Pclass
1    158
2     15
3     10
Name: count, dtype: int64

In [8]:
data_df['Pclass'].shape, data_df[['Pclass']].shape

((183,), (183, 1))

In [9]:
from sklearn.preprocessing import OneHotEncoder

onehot_pclass = OneHotEncoder()
onehot_pclass.fit(X=data_df[['Pclass']]) # 행열 이여야됨. 열만 들어가면 안됨.

In [10]:
onehot_pclass.categories_

[array([1, 2, 3])]

In [11]:
encoded_pclass = onehot_pclass.transform(X=data_df[['Pclass']]).toarray()

In [12]:
encoded_pclass.shape,type(encoded_pclass)

((183, 3), numpy.ndarray)

In [13]:
# 원본과 merge 위해 데이터 프레임으로 변경
pclass_name_list = onehot_pclass.get_feature_names_out(input_features=['Pclass'])
pclass_df = pd.DataFrame(data=encoded_pclass, columns=pclass_name_list, index=data_df.index)
pclass_df.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
1,1.0,0.0,0.0
3,1.0,0.0,0.0
6,1.0,0.0,0.0
10,0.0,0.0,1.0
11,1.0,0.0,0.0


In [14]:
titanic_concated_df = pd.concat([data_df, pclass_df], axis=1)
titanic_concated_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0,0.0,0.0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1.0,0.0,0.0
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,0.0,0.0,1.0
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S,1.0,0.0,0.0


### 스케일링
- 지수화 와 유사
- 딥러닝에서는 정규화
- 데이터 형태에 따른 스케일링 방법이 다름.

## 데이터 분할

In [25]:
# PassengerId 유니크해서 통계적 의미 없으니 제외?
numeric_df = titanic_concated_df.select_dtypes(exclude=['object'])
features = numeric_df.drop(columns=['Survived', 'PassengerId', 'Pclass'])
label = data_df['Survived']
features.columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

In [26]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, 1 to 889
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       183 non-null    float64
 1   SibSp     183 non-null    int64  
 2   Parch     183 non-null    int64  
 3   Fare      183 non-null    float64
 4   Pclass_1  183 non-null    float64
 5   Pclass_2  183 non-null    float64
 6   Pclass_3  183 non-null    float64
dtypes: float64(5), int64(2)
memory usage: 11.4 KB


## 모델 학습 

In [27]:
from sklearn.linear_model import LogisticRegression

logisticregression = LogisticRegression()
logisticregression.fit(X=features, y=label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
logisticregression.coef_, logisticregression.intercept_

(array([[-0.03926592,  0.17913136, -0.32399644,  0.00491719,  0.25432947,
          0.5618283 , -0.54866189]]),
 array([1.64696407]))

## 모델 평가 

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
predicts = logisticregression.predict(X=features)
predicts

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1])

In [31]:
label

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: Survived, Length: 183, dtype: int64

In [32]:
accuracy_score(y_true=label,y_pred=predicts)

0.7049180327868853

In [33]:
predicts_two = logisticregression.predict(X=features[:2])
predicts_two

array([1, 1])

In [34]:
# 선택에 대한 확률이 결과로 나옴.
# 가장 높은 확률의 카테고리를 선택함.
logisticregression.predict_proba(X=features[:2])

array([[0.28113596, 0.71886404],
       [0.27543481, 0.72456519]])

## 모델 배포