# Import Library

In [20]:
# for data manipulating
import pandas as pd
import numpy as np

# for preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder

# for features selection 
from sklearn.feature_selection import RFE

# for visualization
import seaborn as sns
import shap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# for model
import xgboost as xgb
from sklearn.model_selection import train_test_split
import pickle

# for evaluation
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

import warnings

# Menonaktifkan peringatan
warnings.simplefilter(action='ignore')


setattr(pd, "Int64Index", pd.Index)
setattr(pd, "Float64Index", pd.Index)

# Import Dataset

In [2]:
df = pd.read_csv(r"\Belajar\Jala Test\Dataset\abw_model.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,cycle_id,sampled_at,average_weight,distance_date,average_daily_gain,avg_pond_length,avg_pond_width,avg_pond_depth,long_cycle(day),total_feed(kg),avg_tray_number,avg_feed_remain%,count_fasting,total_mortal,avg_weight_mortal,avg_weight_sample
0,0,3458.0,2020-04-10,4.37,0,0.0,80.09,60.07,1.2,112.0,9987.0,0.0,0.0,0.0,0.0,0.0,13.59
1,1,3458.0,2020-04-17,5.23,7,0.122857,80.09,60.07,1.2,112.0,9987.0,0.0,0.0,0.0,0.0,0.0,13.59
2,2,3458.0,2020-04-24,6.69,7,0.208571,80.09,60.07,1.2,112.0,9987.0,0.0,0.0,0.0,0.0,0.0,13.59


# Data Understanding

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15032 entries, 0 to 15031
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          15032 non-null  int64  
 1   cycle_id            15032 non-null  float64
 2   sampled_at          15032 non-null  object 
 3   average_weight      15032 non-null  float64
 4   distance_date       15032 non-null  int64  
 5   average_daily_gain  15032 non-null  float64
 6   avg_pond_length     15032 non-null  float64
 7   avg_pond_width      15032 non-null  float64
 8   avg_pond_depth      15032 non-null  float64
 9   long_cycle(day)     15032 non-null  float64
 10  total_feed(kg)      15032 non-null  float64
 11  avg_tray_number     15032 non-null  float64
 12  avg_feed_remain%    15032 non-null  float64
 13  count_fasting       15032 non-null  float64
 14  total_mortal        15032 non-null  float64
 15  avg_weight_mortal   15032 non-null  float64
 16  avg_

In [4]:
df.isnull().sum()

Unnamed: 0            0
cycle_id              0
sampled_at            0
average_weight        0
distance_date         0
average_daily_gain    0
avg_pond_length       0
avg_pond_width        0
avg_pond_depth        0
long_cycle(day)       0
total_feed(kg)        0
avg_tray_number       0
avg_feed_remain%      0
count_fasting         0
total_mortal          0
avg_weight_mortal     0
avg_weight_sample     0
dtype: int64

In [5]:
df.duplicated().sum()

0

# Model

In [6]:
# filtered data
df_model = df.drop(['Unnamed: 0','cycle_id','sampled_at','avg_weight_sample'],axis=1)

# variable
x = df_model.drop(['average_weight'],axis=1)
y = df_model[['average_weight']]

# split data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [7]:
model_xgb = xgb.XGBRegressor(
    base_score=0.5, 
    booster='gbtree',    
    n_estimators=1000,
    early_stopping_rounds=50,
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.01
    )

In [8]:
model_xgb.fit(x_train, y_train)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=50,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [9]:
y_pred = model_xgb.predict(x_test)

df_pred = y_test.copy()
df_pred['pred'] = y_pred
df_pred['ape'] = (abs(df_pred[y.columns[0]] - df_pred.pred)/df_pred[y.columns[0]]*100)
df_pred.loc[df_pred['ape'] == np.inf, 'ape'] = np.nan

df_pred

Unnamed: 0,average_weight,pred,ape
4499,13.16,15.323476,16.439786
12217,16.45,18.095560,10.003405
11961,18.57,9.503084,48.825610
12560,2.10,4.675822,122.658180
13229,3.03,4.524728,49.330951
...,...,...,...
5655,5.03,12.002608,138.620443
10647,4.05,5.612367,38.576955
2313,4.94,8.120281,64.378162
10318,6.06,10.190615,68.161959


In [10]:
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
mdape = round(df_pred.ape.median(),2)

print(f"RMSE: {rmse}")
print(f"MDAPE: {mdape} %")

RMSE: 5.3419
MDAPE: 36.75 %


**__Catatan__** : <p>
Berdasarkan hasil evaluasi, didapatkan hasil rmse dan mdape, umumnya selisih error mdape yang dihasilkan model sebesar 31% dan rmse 4.97 gram dari data aktual

- Cross Validation 

In [11]:
def rmse(y_test, y_pred):
    return round(np.sqrt(mean_squared_error(y_test, y_pred)),4)

rmse_scorer = make_scorer(rmse)

rmse_cv = cross_val_score(model_xgb, x, y, cv=5, scoring=rmse_scorer)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [12]:
rmse_cv

array([6.0443, 5.1651, 5.744 , 6.6745, 4.9291])

**__Catatan__** : <p>
Berdasarkan hasil evaluasi, didapatkan hasil rmse cross validation dengan selisih 4 s/d 5 gram 

# Features Selection

In [13]:
df_model.corr('spearman')['average_weight'].sort_values(ascending=False)

average_weight        1.000000
average_daily_gain    0.548701
distance_date         0.315030
long_cycle(day)       0.289510
total_feed(kg)        0.243083
avg_pond_width        0.121966
avg_pond_length       0.106076
avg_pond_depth        0.087608
avg_feed_remain%     -0.004019
avg_tray_number      -0.004098
avg_weight_mortal    -0.018305
count_fasting        -0.042073
total_mortal         -0.054877
Name: average_weight, dtype: float64

**__Catatan__** : <p>
Berdasarkan hasil korelasi, akan fokus pada fitur dengan tingkat korelasi > 0,10

In [14]:
# drop features
x_new = x.drop(['avg_tray_number', 'avg_feed_remain%', 'avg_weight_mortal', 'avg_pond_length', 'total_mortal', 'count_fasting'],axis=1)

# split data 
x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.2, random_state=7)

In [15]:
model_xgb.fit(x_train, y_train)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=50,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [16]:
y_pred = model_xgb.predict(x_test)

df_pred = y_test.copy()
df_pred['pred'] = y_pred
df_pred['ape'] = (abs(df_pred[y.columns[0]] - df_pred.pred)/df_pred[y.columns[0]]*100)

df_pred

Unnamed: 0,average_weight,pred,ape
4499,13.16,15.222843,15.675100
12217,16.45,17.901667,8.824721
11961,18.57,9.702844,47.749899
12560,2.10,4.262287,102.966054
13229,3.03,3.601591,18.864377
...,...,...,...
5655,5.03,11.795500,134.502978
10647,4.05,5.467902,35.009930
2313,4.94,8.517834,72.425784
10318,6.06,9.976996,64.636905


In [17]:
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
mdape = round(df_pred.ape.median(),2)

print(f"RMSE: {rmse}")
print(f"MDAPE: {mdape} %")

RMSE: 5.3477
MDAPE: 36.92 %


In [18]:
rmse_cv = cross_val_score(model_xgb, x_new, y, cv=5, scoring=rmse_scorer)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [19]:
rmse_cv

array([6.0212, 5.1318, 5.7235, 6.7098, 4.8557])

In [22]:
df_model.columns

Index(['average_weight', 'distance_date', 'average_daily_gain',
       'avg_pond_length', 'avg_pond_width', 'avg_pond_depth',
       'long_cycle(day)', 'total_feed(kg)', 'avg_tray_number',
       'avg_feed_remain%', 'count_fasting', 'total_mortal',
       'avg_weight_mortal'],
      dtype='object')

**__Catatan__** : <p>
Berdasarkan hasil evaluasi, model sebelum feature selection masih lebih baik 

# PIckle Model 

In [None]:
with open('model_xgb.pkl', 'wb') as file:
    pickle.dump(model_xgb, file)