# 전복 나이 예측 경진대회

## 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor 
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

  from pandas import MultiIndex, Int64Index


In [15]:
df = pd.read_csv('C:/Users/Jaesu/github_desktop/Dacon-Basic/전복-나이-예측/Data/train.csv')
test_ = pd.read_csv('C:/Users/Jaesu/github_desktop/Dacon-Basic/전복-나이-예측/Data/test.csv')

In [16]:
def NMAE(true, pred):
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [17]:
def drop_id(df: pd.DataFrame) -> None:
    try:
        df.drop('id', axis=1, inplace=True)
    except:
        pass

In [18]:
## 공백을 _(언더바)로 바꾼다
def replace_column_blank2underbar(df: pd.DataFrame) -> None:
    df.rename(columns = dict(zip(df.columns, [column.replace(' ', '_') for column in df.columns])), inplace=True) 

In [19]:
def split_train_target(df: pd.DataFrame, target: str, inplace=True) -> tuple:
    Target = df[target]
    train = df.drop(target, axis=1)
    if inplace:
        del df
    
    return train, Target

In [550]:
tmp = df.copy()
test = test_.copy()

In [551]:
drop_id(tmp)
drop_id(test)

In [552]:
replace_column_blank2underbar(tmp)
replace_column_blank2underbar(test)

In [553]:
tmp['Gender'] = tmp['Gender'].map({'I':0, 'F':1, 'M':2})
test['Gender'] = test['Gender'].map({'I':0, 'F':1, 'M':2})

In [554]:
tmp['Age'] = tmp['Target'].apply(lambda age: 'young' if age <= 15 else 'old') ## 오버샘플링을 위한 컬럼 

In [555]:
tmp['Shell_Water'] = tmp['Whole_Weight'] - tmp['Shucked_Weight'] - tmp['Shell_Weight'] ## 전복이 머금은 물의 무게
test['Shell_Water'] = test['Whole_Weight'] - test['Shucked_Weight'] - test['Shell_Weight']

In [556]:
tmp['Weight_Ratio'] =  tmp['Shucked_Weight'] / tmp['Whole_Weight'] ## 순수한 전복살 무게 / 전체 전복 무게
test['Weight_Ratio'] =  test['Shucked_Weight'] / test['Whole_Weight'] 

In [557]:
π = 3.141592

In [558]:
tmp['Radius'] = ((tmp['Lenght'] / 2*π) + (tmp['Diameter'] / 2)) / 2 ## 전복의 평균 반지름
test['Radius'] = ((test['Lenght'] / 2*π) + (test['Diameter'] / 2)) / 2

In [559]:
## 전복모양을 편의상 원이라고 가정 
tmp['Density'] = tmp['Shucked_Weight'] / (3 * (tmp['Height'] * (tmp['Radius'] ** 2))) ## 순살 무게 / 면적(반지름^2 * 키 * 1/3)
test['Density'] = test['Shucked_Weight'] / (3 * (test['Height'] * (test['Radius'] ** 2))) ## 순살 무게 / 면적(반지름^2 * 키 * 1/3)

`-` 이상치 제거(전복 무게가 전복 내장 무게보다 작다, 물의 무게가 음수이다)

In [560]:
tmp = tmp.drop(index=[47, 382, 435, 847, 1078, 465], axis=0)

`-` 15살이 넘는 전복 데이터를 over sampling

In [562]:
tmp = tmp.append(tmp[tmp['Age'] == 'old']).reset_index().drop('index', axis=1)

In [563]:
tmp.drop('Age', axis=1, inplace=True)

`-` 다중공선성 문제로 피쳐 드랍

In [564]:
def custom_drop(df):
    df.drop(['Lenght', 'Radius', 'Shucked_Weight'], axis=1, inplace=True)

In [565]:
custom_drop(tmp)
custom_drop(test)

In [566]:
tmp, target = split_train_target(tmp, 'Target')

In [479]:
skfold = StratifiedKFold(n_splits=5,  shuffle=True, random_state=22)

In [574]:
xgb = XGBRegressor(random_state=22, learning_rate=0.03, n_estimators=600, max_depth=6)

In [585]:
lgbm = LGBMRegressor(random_state=22, learning_rate=0.05, n_estimators=600, max_depth=5)

In [589]:
cb = CatBoostRegressor(random_state=222, learning_rate=0.03, n_estimators=600, depth=6, loss_function='MAE', verbose=0)

In [576]:
xgb_valid = []
xgb_predict = np.zeros(test.shape[0])

for fold, (train_idx, valid_idx) in enumerate(skfold.split(tmp, tmp['Gender'])):
    print(f'{fold + 1} Fold Training......')
    x_train, y_train = tmp.iloc[train_idx], target.iloc[train_idx]
    x_valid, y_valid = tmp.iloc[valid_idx], target.iloc[valid_idx]
    xgb.fit(x_train, y_train)

    y_predict = xgb.predict(x_valid) ## valid셋 예측
    nmae = NMAE(y_predict, y_valid) ## valid nmae
    
    xgb_predict += xgb.predict(test) / skfold.n_splits ## test셋 예측
    xgb_valid.append(nmae) ## valid nmae append
    print(f'NMAE : {nmae:.4f}', end='\n\n')
    
print(f'Mean NMAE : {np.mean(xgb_valid):.4f}')

1 Fold Training......
NMAE : 0.1507

2 Fold Training......
NMAE : 0.1464

3 Fold Training......
NMAE : 0.1385

4 Fold Training......
NMAE : 0.1219

5 Fold Training......
NMAE : 0.1245

Mean NMAE : 0.1364


In [586]:
lgbm_valid = []
lgbm_predict = np.zeros(test.shape[0])

for fold, (train_idx, valid_idx) in enumerate(skfold.split(tmp, tmp['Gender'])):
    print(f'{fold + 1} Fold Training......')
    x_train, y_train = tmp.iloc[train_idx], target.iloc[train_idx]
    x_valid, y_valid = tmp.iloc[valid_idx], target.iloc[valid_idx]
    lgbm.fit(x_train, y_train)

    y_predict = lgbm.predict(x_valid) ## valid셋 예측
    nmae = NMAE(y_predict, y_valid) ## valid nmae
    
    lgbm_predict += lgbm.predict(test) / skfold.n_splits ## test셋 예측
    lgbm_valid.append(nmae) ## valid nmae append
    print(f'NMAE : {nmae:.4f}', end='\n\n')
    
print(f'Mean NMAE : {np.mean(lgbm_valid):.4f}')

1 Fold Training......
NMAE : 0.1624

2 Fold Training......
NMAE : 0.1590

3 Fold Training......
NMAE : 0.1484

4 Fold Training......
NMAE : 0.1341

5 Fold Training......
NMAE : 0.1388

Mean NMAE : 0.1485


In [590]:
cb_valid = []
cb_predict = np.zeros(test.shape[0])

for fold, (train_idx, valid_idx) in enumerate(skfold.split(tmp, tmp['Gender'])):
    print(f'{fold + 1} Fold Training......')
    x_train, y_train = tmp.iloc[train_idx], target.iloc[train_idx]
    x_valid, y_valid = tmp.iloc[valid_idx], target.iloc[valid_idx]
    cb.fit(x_train, y_train)

    y_predict = cb.predict(x_valid) ## valid셋 예측
    nmae = NMAE(y_predict, y_valid) ## valid nmae
    
    cb_predict += cb.predict(test) / skfold.n_splits ## test셋 예측
    cb_valid.append(nmae) ## valid nmae append
    print(f'NMAE : {nmae:.4f}', end='\n\n')
    
print(f'Mean NMAE : {np.mean(cb_valid):.4f}')

1 Fold Training......
NMAE : 0.1552

2 Fold Training......
NMAE : 0.1637

3 Fold Training......
NMAE : 0.1596

4 Fold Training......
NMAE : 0.1455

5 Fold Training......
NMAE : 0.1460

Mean NMAE : 0.1540


In [587]:
submission = pd.read_csv('C:/Users/Jaesu/github_desktop/Dacon-Basic/전복-나이-예측/Data/sample_submission.csv')

In [591]:
pred = (xgb_predict + lgbm_predict + cb_predict) / 3

In [592]:
submission['Target'] = np.round(pred)
submission.head()

Unnamed: 0,id,Target
0,1,12.0
1,2,13.0
2,3,5.0
3,4,11.0
4,5,10.0


In [596]:
submission.to_csv('C:/Users/Jaesu/github_desktop/Dacon-Basic/전복-나이-예측/Data/submission_round.csv', index=False)