## 작업형 2유형 최종정리
- 작업형1 : 3문제 (30점), 데이터 전처리
- `작업형2 : 1문제 (40점), 분류/회귀 예측 모델링`
- 작업형3 : 2문제 (30점), 가설 검정

## 주요 라이브러리
- palmerpenguins : iris 데이터셋의 대안으로, 데이터 탐색 및 시각화에 활용할 수 있는 팔머펭귄(Palmer Penguins) 데이터셋을 제공하는 라이브러리
- scikit-learn : 머신러닝을 위한 라이브러리
- lightgbm : Microsoft에서 개발한 오픈 소스 머신러닝 라이브러리로, 대용량 데이터셋에서도 빠른 속도와 높은 성능을 제공하는 것이 특징

## 주의
- 각 코드에 대한 설명은 별도로 하지 않습니다.

## 데이터 파일 불러오기

In [5]:
import pandas as pd
from palmerpenguins import load_penguins

# Load the Palmer penguins dataset and append a 1-based ID column built
# from the row index.
df = load_penguins().assign(ID=lambda frame: frame.index + 1)
print(df)

       species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0       Adelie  Torgersen            39.1           18.7              181.0   
1       Adelie  Torgersen            39.5           17.4              186.0   
2       Adelie  Torgersen            40.3           18.0              195.0   
3       Adelie  Torgersen             NaN            NaN                NaN   
4       Adelie  Torgersen            36.7           19.3              193.0   
..         ...        ...             ...            ...                ...   
339  Chinstrap      Dream            55.8           19.8              207.0   
340  Chinstrap      Dream            43.5           18.1              202.0   
341  Chinstrap      Dream            49.6           18.2              193.0   
342  Chinstrap      Dream            50.8           19.0              210.0   
343  Chinstrap      Dream            50.2           18.7              198.0   

     body_mass_g     sex  year   ID  
0         375

## 데이터 확인

In [6]:
# Preview the first rows, then the column dtypes / non-null counts, then
# the number of missing values per column.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum())

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  year  ID  
0       3750.0    male  2007   1  
1       3800.0  female  2007   2  
2       3250.0  female  2007   3  
3          NaN     NaN  2007   4  
4       3450.0  female  2007   5  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 

In [7]:
# Discard every row that has at least one missing value, then confirm
# that no missing values remain in any column.
df = df.dropna(axis=0, how='any')
print(df.isna().sum())

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
ID                   0
dtype: int64


## 데이터셋 분리

In [8]:
from sklearn.model_selection import train_test_split

# Separate the regression target (body mass) from the feature columns.
y = df['body_mass_g']
X = df.drop('body_mass_g', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(266, 8) (67, 8) (266,) (67,)


## ID 컬럼만 분리

In [9]:
# Remove the ID column from both feature frames (in place) and keep the
# removed values so they can be attached to the predictions later.
X_train_id, X_test_id = (frame.pop('ID') for frame in (X_train, X_test))

## 데이터 타입별로 분리

In [10]:
import numpy as np
# Views of the training features grouped by dtype: object (string-like)
# columns and numeric columns.
object_df = X_train.select_dtypes(include='object')
number_df = X_train.select_dtypes(include='number')

In [11]:
# Show how often each category occurs in every object-typed training column.
for column, values in object_df.items():
    print(values.value_counts())

Adelie       115
Gentoo       101
Chinstrap     50
Name: species, dtype: int64
Biscoe       135
Dream         98
Torgersen     33
Name: island, dtype: int64
male      137
female    129
Name: sex, dtype: int64


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# training columns.
numeric_summary = number_df.describe()
print(numeric_summary)

       bill_length_mm  bill_depth_mm  flipper_length_mm         year
count      266.000000     266.000000         266.000000   266.000000
mean        44.079323      17.072556         201.409774  2008.041353
std          5.420164       1.969091          14.269156     0.816216
min         32.100000      13.100000         172.000000  2007.000000
25%         39.600000      15.400000         190.000000  2007.000000
50%         44.900000      17.200000         197.000000  2008.000000
75%         48.500000      18.600000         214.000000  2009.000000
max         59.600000      21.500000         231.000000  2009.000000


## 모델생성

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from lightgbm import LGBMRegressor

# Preprocess by dtype: rescale the numeric columns with MinMaxScaler and
# one-hot encode the object columns. handle_unknown='ignore' keeps
# predict() from raising when a category that was absent from the training
# split appears in new data (the default, 'error', would raise).
transformer = ColumnTransformer([
    ('scaler', MinMaxScaler(), number_df.columns),
    ('encoder', OneHotEncoder(handle_unknown='ignore'), object_df.columns)
], remainder='passthrough')

# Chain preprocessing and the LightGBM regressor so the transformations
# learned during fit are reapplied automatically at prediction time.
pipeline = Pipeline([
    ('preprocessor', transformer),
    ('model', LGBMRegressor(random_state=42, max_depth=4, learning_rate=0.05))
])

pipeline.fit(X_train, y_train)

## 모델평가

In [40]:
from sklearn.metrics import mean_squared_error

def get_scores(model, X_train, X_test, y_train, y_test):
    """Return a string reporting the model's RMSE on the train and test data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Features used to compute the training score.
        X_test: Features used to compute the test score.
        y_train: True target values matching ``X_train``.
        y_test: True target values matching ``X_test``.

    Returns:
        A string of the form ``'train : <rmse>, test : <rmse>'``.
    """
    def rmse(features, target):
        # Root of the mean squared error between truth and prediction.
        return np.sqrt(mean_squared_error(target, model.predict(features)))

    train_score = rmse(X_train, y_train)
    test_score = rmse(X_test, y_test)
    return f'train : {train_score}, test : {test_score}'

print(get_scores(pipeline, X_train, X_test, y_train, y_test))

train : 230.18998130743833, test : 264.5996044635628


## 결과 제출

In [41]:
# Predict on the held-out features and pair each prediction with the ID
# that was set aside earlier; the frame keeps the index of X_test_id.
final_predict = pipeline.predict(X_test)
result = X_test_id.to_frame(name='ID').assign(preds=final_predict)
print(result)

      ID        preds
30    31  3238.743940
320  321  3496.256713
79    80  4124.280846
202  203  4538.336406
63    64  3876.368450
..   ...          ...
291  292  4118.073444
4      5  3484.030063
83    84  3871.533449
322  323  3460.867057
66    67  3444.358496

[67 rows x 2 columns]
