## 분류 모델 사용

## 데이터 수집

In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl # 한글 폰트 설정 (NanumGothic) 
mpl.rcParams['font.family'] = 'NanumGothic'
mpl.rcParams['axes.unicode_minus'] = False  # 마이너스 기호 깨짐 방지
import seaborn as sns
import pandas as pd
import numpy as np 
import scipy.stats as stats
from sklearn import datasets 
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import lightgbm as lgb

In [2]:
data_df = pd.read_csv('../../datasets/titanic_disaster_train.csv')
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 데이터 분석

In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
data_df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

## 데이터 전처리 

### 결측치 처리

### 이상치 처리

### 범주형 처리
- OneHotEncoding 처리

In [5]:
def split_categorical_numerical(df, unique_threshold=0.05):
    """
    데이터프레임을 범주형과 수치형으로 분리
    
    Parameters:
    - df: 입력 데이터프레임
    - unique_threshold: unique 값 비율 임계값 (기본값 0.05 = 5%)
    
    Returns:
    - categorical_df: 범주형 컬럼만 있는 데이터프레임
    - numerical_df: 수치형 컬럼만 있는 데이터프레임
    """
    
    categorical_cols = []
    numerical_cols = []
    
    n_rows = len(df)
    
    for col in df.columns:
        # unique 값의 비율 계산
        unique_ratio = len(df[col].unique()) / n_rows
        
        # object 타입이거나, unique 값 비율이 임계값보다 작으면 범주형으로 분류
        # df[col].dtype == 'object' or
        if unique_ratio < unique_threshold:
            print(f'{col}:{unique_ratio}')
            categorical_cols.append(col)
        else:
            numerical_cols.append(col)
            
    return df[categorical_cols], df[numerical_cols]

In [6]:
# 사용 예시
cat_df, num_df = split_categorical_numerical(data_df)

Survived:0.002244668911335578
Pclass:0.003367003367003367
Sex:0.002244668911335578
SibSp:0.007856341189674524
Parch:0.007856341189674524
Embarked:0.004489337822671156


In [7]:
cat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    int64 
 1   Pclass    891 non-null    int64 
 2   Sex       891 non-null    object
 3   SibSp     891 non-null    int64 
 4   Parch     891 non-null    int64 
 5   Embarked  889 non-null    object
dtypes: int64(4), object(2)
memory usage: 41.9+ KB


In [8]:
num_df.info() # unique 에 가까운건 빼고 가자 null 많아도 빠짐

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Name         891 non-null    object 
 2   Age          714 non-null    float64
 3   Ticket       891 non-null    object 
 4   Fare         891 non-null    float64
 5   Cabin        204 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 41.9+ KB


In [9]:
cat_df.dropna(inplace=True)
cat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  889 non-null    int64 
 1   Pclass    889 non-null    int64 
 2   Sex       889 non-null    object
 3   SibSp     889 non-null    int64 
 4   Parch     889 non-null    int64 
 5   Embarked  889 non-null    object
dtypes: int64(4), object(2)
memory usage: 48.6+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df.dropna(inplace=True)


In [10]:
def run_onehotencode(data_df, model=OneHotEncoder()):

    columns = data_df.columns
    for col in columns :
        model.fit(X=data_df[[col]])
        encoded_pclass = model.transform(X=data_df[[col]]).toarray()
        pclass_name_list = model.get_feature_names_out(input_features=[col])
        pclass_df = pd.DataFrame(data=encoded_pclass, columns=pclass_name_list, index=data_df.index)
        # 원본 자르는거 추가 

        data_df = pd.concat([data_df.drop(columns=[col]), pclass_df], axis=1)

    return data_df

In [11]:
onehot_df = run_onehotencode(cat_df.drop(columns=['Survived']))

In [12]:
# cat_df의 인덱스를 기준으로 num_df 필터링
num_df_filtered = num_df.loc[cat_df.index]

# 필터링된 데이터프레임 합치기
preprocessing_df = pd.concat([onehot_df, num_df_filtered[['Age','Ticket','Fare']]], axis=1)
preprocessing_df.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,...,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S,Age,Ticket,Fare
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,A/5 21171,7.25
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,38.0,PC 17599,71.2833
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,26.0,STON/O2. 3101282,7.925
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.0,113803,53.1
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.0,373450,8.05


In [13]:
preprocessing_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass_1    889 non-null    float64
 1   Pclass_2    889 non-null    float64
 2   Pclass_3    889 non-null    float64
 3   Sex_female  889 non-null    float64
 4   Sex_male    889 non-null    float64
 5   SibSp_0     889 non-null    float64
 6   SibSp_1     889 non-null    float64
 7   SibSp_2     889 non-null    float64
 8   SibSp_3     889 non-null    float64
 9   SibSp_4     889 non-null    float64
 10  SibSp_5     889 non-null    float64
 11  SibSp_8     889 non-null    float64
 12  Parch_0     889 non-null    float64
 13  Parch_1     889 non-null    float64
 14  Parch_2     889 non-null    float64
 15  Parch_3     889 non-null    float64
 16  Parch_4     889 non-null    float64
 17  Parch_5     889 non-null    float64
 18  Parch_6     889 non-null    float64
 19  Embarked_C  889 non-null    float6

### 스케일링
- 지수화 와 유사
- 딥러닝에서는 정규화
- 데이터 형태에 따른 스케일링 방법이 다름.

## 데이터 분할

## 모델 학습 

## 모델 평가 

## 모델 배포