In [135]:
import pandas as pd
import numpy as np
import sklearn
import warnings

warnings.filterwarnings('ignore')

In [136]:
# import Data
X_train = pd.read_csv('penguin_X_train.csv')
X_test = pd.read_csv('penguin_X_test.csv')
y_train = pd.read_csv('penguin_y_train.csv')

X_train = X_train.drop(['Unnamed: 0'], axis = 1)
X_test = X_test.drop(['Unnamed: 0'], axis = 1)
y_train = y_train.drop(['Unnamed: 0'], axis = 1)

In [137]:
y_train

Unnamed: 0,body_mass_g
0,4250.0
1,4650.0
2,4250.0
3,4500.0
4,5700.0
...,...
235,3800.0
236,5950.0
237,3200.0
238,3575.0


In [138]:
# Data Explore
print(X_train.info()) # 4가지 항목에 NA값 존재, NA 값 걍 날려주자

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            240 non-null    object 
 1   island             240 non-null    object 
 2   sex                232 non-null    object 
 3   bill_length_mm     238 non-null    float64
 4   bill_depth_mm      238 non-null    float64
 5   flipper_length_mm  238 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.4+ KB
None


In [139]:
print(X_train.head(5))

     species     island     sex  bill_length_mm  bill_depth_mm  \
0     Adelie  Torgersen     NaN            42.0           20.2   
1     Gentoo     Biscoe  FEMALE            43.5           15.2   
2     Adelie  Torgersen    MALE            42.8           18.5   
3  Chinstrap      Dream    MALE            53.5           19.9   
4     Gentoo     Biscoe    MALE            50.2           14.3   

   flipper_length_mm  
0              190.0  
1              213.0  
2              195.0  
3              205.0  
4              218.0  


In [140]:
train = pd.concat([X_train, y_train], axis = 1)

In [141]:
train

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,,42.0,20.2,190.0,4250.0
1,Gentoo,Biscoe,FEMALE,43.5,15.2,213.0,4650.0
2,Adelie,Torgersen,MALE,42.8,18.5,195.0,4250.0
3,Chinstrap,Dream,MALE,53.5,19.9,205.0,4500.0
4,Gentoo,Biscoe,MALE,50.2,14.3,218.0,5700.0
...,...,...,...,...,...,...,...
235,Chinstrap,Dream,FEMALE,46.6,17.8,193.0,3800.0
236,Gentoo,Biscoe,MALE,49.8,15.9,229.0,5950.0
237,Adelie,Torgersen,FEMALE,34.6,17.2,189.0,3200.0
238,Chinstrap,Dream,FEMALE,45.9,17.1,190.0,3575.0


In [142]:
train = train.dropna(axis = 0)

In [143]:
y_train = train['body_mass_g']
X_train = train.drop(['body_mass_g'], axis = 1)

In [144]:
X_train

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
1,Gentoo,Biscoe,FEMALE,43.5,15.2,213.0
2,Adelie,Torgersen,MALE,42.8,18.5,195.0
3,Chinstrap,Dream,MALE,53.5,19.9,205.0
4,Gentoo,Biscoe,MALE,50.2,14.3,218.0
5,Adelie,Dream,FEMALE,36.5,18.0,182.0
...,...,...,...,...,...,...
235,Chinstrap,Dream,FEMALE,46.6,17.8,193.0
236,Gentoo,Biscoe,MALE,49.8,15.9,229.0
237,Adelie,Torgersen,FEMALE,34.6,17.2,189.0
238,Chinstrap,Dream,FEMALE,45.9,17.1,190.0


In [145]:
from sklearn.model_selection import train_test_split

In [146]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.3)

In [147]:
COL_CAT = ['species', 'island', 'sex']
COL_NUM = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']

In [148]:
X_tr

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
169,Adelie,Dream,FEMALE,42.2,18.5,180.0
98,Chinstrap,Dream,MALE,52.0,19.0,197.0
214,Adelie,Dream,FEMALE,36.8,18.5,193.0
179,Chinstrap,Dream,FEMALE,50.9,17.9,196.0
21,Chinstrap,Dream,MALE,52.0,18.1,201.0
...,...,...,...,...,...,...
134,Adelie,Biscoe,MALE,41.1,18.2,192.0
94,Gentoo,Biscoe,FEMALE,45.1,14.4,210.0
62,Gentoo,Biscoe,MALE,51.5,16.3,230.0
93,Gentoo,Biscoe,FEMALE,48.4,14.4,203.0


In [149]:
X_tr_dum = pd.get_dummies(X_tr[COL_CAT])
X_val_dum = pd.get_dummies(X_val[COL_CAT])
X_test_dum = pd.get_dummies(X_test[COL_CAT])

In [151]:
X_tr_merge = pd.concat([X_tr, X_tr_dum], axis = 1)

In [154]:
X_val_merge = pd.concat([X_val, X_val_dum], axis = 1)
X_test_merge = pd.concat([X_test, X_test_dum], axis = 1)

In [155]:
X_tr_fin = X_tr_merge.drop(COL_CAT, axis = 1)

In [157]:
X_val_fin = X_val_merge.drop(COL_CAT, axis = 1)
X_test_fin = X_test_merge.drop(COL_CAT, axis = 1)

In [164]:
from sklearn.preprocessing import MinMaxScaler

In [165]:
scaler = MinMaxScaler()
scaler.fit(X_tr_fin[COL_NUM])
X_tr_fin[COL_NUM] = scaler.fit_transform(X_tr_fin[COL_NUM])
X_val_fin[COL_NUM] = scaler.fit_transform(X_val_fin[COL_NUM])
X_test_fin[COL_NUM] = scaler.fit_transform(X_test_fin[COL_NUM])

In [169]:
# Modeling
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [170]:
model_xgb = XGBRegressor()
model_xgb.fit(X_tr_fin, y_tr)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)

In [171]:
model_rf = RandomForestRegressor()
model_rf.fit(X_tr_fin, y_tr)

RandomForestRegressor()

In [175]:
y_val_rf_pred = model_rf.predict(X_val_fin)

In [176]:
y_val_xgb_pred = model_xgb.predict(X_val_fin)

In [177]:
from sklearn.metrics import mean_squared_error, r2_score

rf_rmse = r2_score(y_val_rf_pred, y_val)
xgb_rmse = r2_score(y_val_xgb_pred, y_val)

In [178]:
print(rf_rmse, xgb_rmse)

0.7554412133586973 0.8057674395797043


In [180]:
# final predict
pred = model_xgb.predict(X_test_fin)