<a href="https://colab.research.google.com/github/HwangHanJae/Dacon_tutorial/blob/main/penguin_weight/penguin_weight_predcict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[데이콘] 펭귄 몸무게 예측하기


펭귄 몸무게 데이터
- id : 샘플 아이디
- Species: 펭귄의 종을 나타내는 문자열
- Island : 샘플들이 수집된 Palmer Station 근처 섬 이름
- Clutch Completion : 관찰된 펭귄 둥지의 알이 2개인 경우 Full Clutch이며 Yes로 표기
- Culmen Length (mm) : 펭귄 옆모습 기준 부리의 가로 길이
- Culmen Depth (mm) : 펭귄 옆모습 기준 부리의 세로 길이
- Flipper Length (mm) : 펭귄의 팔(날개) 길이
- Sex : 펭귄의 성별
- Delta 15 N (o/oo)  : 토양에 따라 변화하는 안정 동위원소 15N:14N의 비율
- Delta 13 C (o/oo) : 먹이에 따라 변화하는 안정 동위원소 13C:12C의 비율
- Body Mass (g): 펭귄의 몸무게를 나타내는 숫자 (g)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [4]:
train_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/dataset_penguin_weight/train.csv"
test_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/dataset_penguin_weight/test.csv"
submission_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/dataset_penguin_weight/sample_submission.csv"

df = pd.read_csv(train_path)
df.head()

Unnamed: 0,id,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
0,0,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,50.0,15.3,220,MALE,8.30515,-25.19017,5550
1,1,Chinstrap penguin (Pygoscelis antarctica),Dream,No,49.5,19.0,200,MALE,9.63074,-24.34684,3800
2,2,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,45.1,14.4,210,FEMALE,8.51951,-27.01854,4400
3,3,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,44.5,14.7,214,FEMALE,8.20106,-26.16524,4850
4,4,Gentoo penguin (Pygoscelis papua),Biscoe,No,49.6,16.0,225,MALE,8.38324,-26.84272,5700


In [5]:
#결측값 확인
df.isnull().sum()

id                     0
Species                0
Island                 0
Clutch Completion      0
Culmen Length (mm)     0
Culmen Depth (mm)      0
Flipper Length (mm)    0
Sex                    3
Delta 15 N (o/oo)      3
Delta 13 C (o/oo)      3
Body Mass (g)          0
dtype: int64

결측값으로는 Sex, Delta 15 N, Delta 13 C 가 3개씩 존재함

In [7]:
#데이터 크기 확인
df.shape

(114, 11)

In [8]:
#데이터의 정보 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   114 non-null    int64  
 1   Species              114 non-null    object 
 2   Island               114 non-null    object 
 3   Clutch Completion    114 non-null    object 
 4   Culmen Length (mm)   114 non-null    float64
 5   Culmen Depth (mm)    114 non-null    float64
 6   Flipper Length (mm)  114 non-null    int64  
 7   Sex                  111 non-null    object 
 8   Delta 15 N (o/oo)    111 non-null    float64
 9   Delta 13 C (o/oo)    111 non-null    float64
 10  Body Mass (g)        114 non-null    int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 9.9+ KB


4개의 float64, 3개의 int64, 4개의 object 타입이 존재함

In [24]:
#데이터 describe
df.describe()

Unnamed: 0,id,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
count,114.0,114.0,114.0,114.0,111.0,111.0,114.0
mean,56.5,44.613158,17.014912,203.052632,8.737634,-25.723051,4327.850877
std,33.052988,5.321829,1.941363,14.653425,0.567698,0.859786,781.766484
min,0.0,33.5,13.2,174.0,7.6322,-27.01854,2700.0
25%,28.25,40.325,15.225,190.0,8.272585,-26.434025,3675.0
50%,56.5,45.2,17.25,199.0,8.63259,-25.95541,4250.0
75%,84.75,49.075,18.6,216.0,9.264635,-25.005945,4850.0
max,113.0,55.1,21.1,231.0,10.02544,-24.10255,6300.0


Sex 칼럼의 경우 FEMALE, MALE 펭귄이 균등하게 분포되어 있기 때문에 결측치를 처리할때

Culmen Length (mm) <= 42인 데이터를 FEMALE로 분류 나머지는 MALE로 분류하겠음

In [42]:
#결측치 처리 함수 생성
def fill_na(df):
  df.loc[df["Culmen Length (mm)"] <= 42, "Sex"] = "FEMALE"
  df.loc[df["Culmen Length (mm)"] > 42, "Sex"] = "MALE"
  df["Delta 15 N (o/oo)"] = df["Delta 15 N (o/oo)"].fillna(df["Delta 15 N (o/oo)"].mean())
  df["Delta 13 C (o/oo)"] = df["Delta 13 C (o/oo)"].fillna(df["Delta 13 C (o/oo)"].mean())
  return df


In [43]:
df.isnull().sum().sum()

0

In [44]:
#LabelEncoding
def encoding(df):
  from sklearn.preprocessing import LabelEncoder

  columns = ["Species","Island","Clutch Completion","Sex"]
  for column in columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])

  return df

In [45]:
#불필요한 데이터 Drop
def drop_columns(df):
  drop_col = ["id"]
  df.drop(drop_col, axis=1, inplace = True)
  
  return df

In [46]:
def transform_data(df):
  fill_na(df)
  encoding(df)
  drop_columns(df)
  return df

In [47]:
#원본 데이터를 다시 불러오기

train_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/dataset_penguin_weight/train.csv"
test_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/dataset_penguin_weight/test.csv"
submission_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/dataset_penguin_weight/sample_submission.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sub = pd.read_csv(submission_path)

In [48]:
train = transform_data(train)
test = transform_data(test)

In [49]:
train.head()

Unnamed: 0,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
0,2,0,1,50.0,15.3,220,1,8.30515,-25.19017,5550
1,1,1,0,49.5,19.0,200,1,9.63074,-24.34684,3800
2,2,0,1,45.1,14.4,210,1,8.51951,-27.01854,4400
3,2,0,1,44.5,14.7,214,1,8.20106,-26.16524,4850
4,2,0,0,49.6,16.0,225,1,8.38324,-26.84272,5700


In [50]:
test.head()

Unnamed: 0,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,1,1,1,52.0,20.7,210.0,1,9.43146,-24.6844
1,2,0,1,55.9,17.0,228.0,1,8.3118,-26.35425
2,0,1,1,38.9,18.8,190.0,0,8.36936,-26.11199
3,1,1,1,45.2,16.6,191.0,1,9.62357,-24.78984
4,0,0,0,37.9,18.6,172.0,0,8.38404,-25.19837


In [52]:
#타겟데이터 분리
X = train.drop("Body Mass (g)", axis=1)
y = train["Body Mass (g)"]

print(X.shape, y.shape)

(114, 9) (114,)


In [58]:
#평가방식 코드화
def rmse(y_true, y_pred):
  from sklearn.metrics import mean_squared_error
  result = np.sqrt(mean_squared_error(y_true,y_pred))
  return result

In [69]:
#교차검증 실시
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
scorer = make_scorer(rmse, greater_is_better=False)
def cross_val(model, cv=5):
  score = cross_validate(model,X,y, cv=cv, n_jobs=-1,scoring = scorer)
  print(model.__class__.__name__)
  print("교차검증 점수 : ", score["test_score"])
  print("평균 점수 : ", np.mean(score["test_score"]))



In [71]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

rf = RandomForestRegressor(random_state=42)
tree = DecisionTreeRegressor()

models = [rf, tree]
for model in models:
  cross_val(model)


RandomForestRegressor
교차검증 점수 :  [-376.55786191 -394.96364376 -419.55722326 -282.38551932 -394.85075575]
평균 점수 :  -373.6630008005298
DecisionTreeRegressor
교차검증 점수 :  [-635.86213299 -438.12843064 -534.03203779 -487.562705   -522.124158  ]
평균 점수 :  -523.541892885205


In [73]:
#학습데이터(80%) / 테스트데이터(20%) 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(91, 9) (91,)
(23, 9) (23,)


In [87]:
#GridSearchCV
from sklearn.model_selection import GridSearchCV

def grid_search(clf, parameters, cv=5):
    grid = GridSearchCV(estimator=clf, param_grid=parameters, scoring=scorer, n_jobs=-1, cv=cv)
    grid.fit(X, y)
    print(clf.__class__.__name__)
    print("GridSearchCV 최적 하이퍼 파라미터 : ", grid.best_params_)
    print("GridSearchCV 최적 평가지표 : {}",grid.best_score_)

    best_clf = grid.best_estimator_

    pred = best_clf.predict(X_test)
    metric = rmse(y_test, pred)
    print("테스트 세트에서의 {} 평가지표 : {}".format(clf.__class__.__name__, metric))

    return best_clf

In [88]:
rf = RandomForestRegressor(random_state=42)
parameters = {"max_depth" : [2,3,5,10],
              "min_samples_split" : [2,3,5],
              "min_samples_leaf" : [1,5,8],
              "n_estimators": [100, 500, 1000]}

best_model = grid_search(rf, parameters)

RandomForestRegressor
GridSearchCV 최적 하이퍼 파라미터 :  {'max_depth': 5, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 100}
GridSearchCV 최적 평가지표 : {} -343.9803547723918
테스트 세트에서의 RandomForestRegressor 평가지표 : 299.3664030060334


In [89]:
sub.head()

Unnamed: 0,id,Body Mass (g)
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [91]:
predict=best_model.predict(test)

In [97]:
sub["Body Mass (g)"] = predict

sub.to_csv("submission_rf_.csv", index=False)
my_sub = pd.read_csv("/content/submission_rf_.csv")
my_sub.head()

Unnamed: 0,id,Body Mass (g)
0,0,4398.552921
1,1,5511.689746
2,2,3865.393172
3,3,3502.530032
4,4,3688.249249
