# Import Library

In [1]:
# for data manipulating
import pandas as pd
import numpy as np

# for preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder

# for features selection 
from sklearn.feature_selection import RFE

# for visualization
import seaborn as sns
import shap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# for model
import xgboost as xgb
from sklearn.model_selection import train_test_split

# for evaluation
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

import warnings

# Menonaktifkan peringatan
warnings.simplefilter(action='ignore')

setattr(pd, "Int64Index", pd.Index)
setattr(pd, "Float64Index", pd.Index)

pd.set_option('display.max_columns',None)

# Import Dataset

In [2]:
df = pd.read_csv(r"\Belajar\Jala Test\Dataset\bio_model.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,cycle_id,harvested_at,weight,size,total_harvest(kg),total_seed,survival_rate(%),average_weight(gr),est_remaining_population,biomass(kg),avg_pond_length,avg_pond_width,avg_pond_depth,long_cycle(day),total_feed(kg),avg_tray_number,avg_feed_remain%,count_fasting,total_mortal,avg_weight_mortal,avg_weight_sample
0,0,3458,2020-05-13,1643.13,94.0,154454.22,566669,27.26,10.638298,412214.78,4385.263617,80.09,60.07,1.2,112,9987.0,0.0,0.0,0.0,0.0,0.0,13.59
1,1,3458,2020-05-26,1503.8,71.0,106769.8,566669,46.1,14.084507,305444.98,4302.041972,80.09,60.07,1.2,112,9987.0,0.0,0.0,0.0,0.0,0.0,13.59
2,2,3458,2020-06-13,1206.0,49.0,59094.0,566669,56.53,20.408163,246350.98,5027.57102,80.09,60.07,1.2,112,9987.0,0.0,0.0,0.0,0.0,0.0,13.59


# Data Understanding

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8014 entries, 0 to 8013
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                8014 non-null   int64  
 1   cycle_id                  8014 non-null   int64  
 2   harvested_at              8014 non-null   object 
 3   weight                    8014 non-null   float64
 4   size                      8014 non-null   float64
 5   total_harvest(kg)         8014 non-null   float64
 6   total_seed                8014 non-null   int64  
 7   survival_rate(%)          8014 non-null   float64
 8   average_weight(gr)        8014 non-null   float64
 9   est_remaining_population  8014 non-null   float64
 10  biomass(kg)               8014 non-null   float64
 11  avg_pond_length           8014 non-null   float64
 12  avg_pond_width            8014 non-null   float64
 13  avg_pond_depth            8014 non-null   float64
 14  long_cyc

In [4]:
df.isnull().sum()

Unnamed: 0                  0
cycle_id                    0
harvested_at                0
weight                      0
size                        0
total_harvest(kg)           0
total_seed                  0
survival_rate(%)            0
average_weight(gr)          0
est_remaining_population    0
biomass(kg)                 0
avg_pond_length             0
avg_pond_width              0
avg_pond_depth              0
long_cycle(day)             0
total_feed(kg)              0
avg_tray_number             0
avg_feed_remain%            0
count_fasting               0
total_mortal                0
avg_weight_mortal           0
avg_weight_sample           0
dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
# Anomali, selanjutnya akan di drop
df[df['biomass(kg)'] < 0]

Unnamed: 0.1,Unnamed: 0,cycle_id,harvested_at,weight,size,total_harvest(kg),total_seed,survival_rate(%),average_weight(gr),est_remaining_population,biomass(kg),avg_pond_length,avg_pond_width,avg_pond_depth,long_cycle(day),total_feed(kg),avg_tray_number,avg_feed_remain%,count_fasting,total_mortal,avg_weight_mortal,avg_weight_sample
14,14,4038,2020-07-15,5311.92,55.0,292155.60,350000,115.44,18.181818,-54053.56,-982.792000,60.00,45.00,1.5,94,7790.60,0.0,0.0,0.0,0.0,0.0,10.585366
15,15,4038,2020-07-15,32.04,32.0,1025.28,350000,115.74,31.250000,-55078.84,-1721.213750,60.00,45.00,1.5,94,7790.60,0.0,0.0,0.0,0.0,0.0,10.585366
20,20,4039,2020-07-15,2391.26,67.0,160214.42,210000,104.67,14.925373,-9808.62,-146.397313,37.06,27.79,1.9,94,3770.50,0.0,0.0,0.0,0.0,0.0,10.881250
67,67,4254,2020-06-01,603.00,500.0,301500.00,300000,100.50,2.000000,-1500.00,-3.000000,0.00,0.00,0.0,30,597.00,0.0,0.0,1.0,0.0,0.0,0.000000
70,70,4349,2020-07-03,560.00,200.0,112000.00,80000,140.00,5.000000,-32000.00,-160.000000,20.00,20.00,1.5,54,816.50,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7990,7990,29393,2024-03-30,340.00,80.0,27200.00,50000,112.56,12.500000,-6280.00,-78.500000,26.00,25.00,1.2,50,578.90,0.0,0.0,0.0,120.0,18.0,8.300000
7996,7996,29451,2024-03-29,300.00,150.0,45000.00,150000,104.17,6.666667,-6250.00,-41.666667,20.00,20.00,1.5,53,1052.00,0.0,0.0,9.0,0.0,0.0,5.463333
8001,8001,29518,2024-03-30,330.00,140.0,46200.00,40000,115.50,7.142857,-6200.00,-44.285714,22.00,17.00,1.0,48,358.30,0.0,0.0,3.0,0.0,0.0,7.140000
8007,8007,29619,2024-04-01,700.00,150.0,105000.00,70000,196.07,6.666667,-67250.00,-448.333333,24.00,9.00,1.2,49,1043.40,1.0,0.0,1.0,0.0,0.0,5.635000


# Model

In [7]:
# filtered data
df_model = df[df['biomass(kg)'] > 0].drop(['Unnamed: 0','cycle_id','harvested_at'],axis=1)

# variable
x = df_model.drop(['biomass(kg)'],axis=1)
y = df_model[['biomass(kg)']]

# split data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [8]:
model_xgb = xgb.XGBRegressor(
    base_score=0.5, 
    booster='gbtree',    
    n_estimators=1000,
    early_stopping_rounds=30,
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.01
    )

In [9]:
model_xgb.fit(x_train, y_train)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=30,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [10]:
y_pred = model_xgb.predict(x_test)

df_pred = y_test.copy()
df_pred['pred'] = y_pred
df_pred['ape'] = (abs(df_pred[y.columns[0]] - df_pred.pred)/df_pred[y.columns[0]]*100)
df_pred.loc[df_pred['ape'] == np.inf, 'ape'] = np.nan

df_pred

Unnamed: 0,biomass(kg),pred,ape
3072,5204.554776,5145.443359,1.135763
539,726.955556,849.145935,16.808508
6628,231.204819,38.145054,83.501618
938,163.726000,341.622101,108.654765
7645,91.978261,68.460052,25.569312
...,...,...,...
5448,2534.370652,2516.575439,0.702155
4564,1982.846154,1862.843384,6.052046
1462,2010.171702,2168.948730,7.898680
3811,2592.708029,2478.921631,4.388709


In [11]:
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
mdape = round(df_pred.ape.median(),2)

print(f"RMSE: {rmse}")
print(f"MDAPE: {mdape} %")

RMSE: 464.3204
MDAPE: 6.29 %


**__Catatan__** : <p>
Berdasarkan hasil evaluasi, didapatkan hasil rmse dan mdape, umumnya selisih error mdape yang dihasilkan model sebesar 25% dan rmse 0.2069 gram dari data aktual

- Cross Validation 

In [12]:
def rmse(y_test, y_pred):
    return round(np.sqrt(mean_squared_error(y_test, y_pred)),4)

rmse_scorer = make_scorer(rmse)

rmse_cv = cross_val_score(model_xgb, x, y, cv=5, scoring=rmse_scorer)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [13]:
rmse_cv

array([ 307.4364,  260.1905,  540.6917,  417.0902, 4958.781 ])

**__Catatan__** : <p>
Berdasarkan hasil evaluasi, didapatkan hasil rmse cross validation dengan selisih 255 s/d 5591 kg 

# Features Selection

In [14]:
df_model.corr('spearman')['biomass(kg)'].sort_values(ascending=False)

biomass(kg)                 1.000000
est_remaining_population    0.876462
total_seed                  0.684203
total_feed(kg)              0.652387
avg_pond_width              0.564073
avg_pond_length             0.514073
average_weight(gr)          0.446780
long_cycle(day)             0.379502
avg_weight_sample           0.370587
weight                      0.287835
avg_pond_depth              0.282852
total_harvest(kg)           0.162767
avg_tray_number             0.138040
avg_feed_remain%            0.110749
avg_weight_mortal           0.097580
total_mortal                0.083387
count_fasting              -0.035297
size                       -0.446804
survival_rate(%)           -0.462548
Name: biomass(kg), dtype: float64

**__Catatan__** : <p>
Berdasarkan hasil korelasi, akan fokus pada fitur dengan tingkat korelasi > 0,20

In [15]:
# drop features
x_new = x.drop(['total_harvest(kg)', 'avg_tray_number', 'avg_feed_remain%', 'avg_weight_mortal', 'total_mortal', 'count_fasting', 'size', 'survival_rate(%)'],axis=1)

# split data 
x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.2, random_state=7)

In [16]:
model_xgb.fit(x_train, y_train)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=30,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [17]:
y_pred = model_xgb.predict(x_test)

df_pred = y_test.copy()
df_pred['pred'] = y_pred
df_pred['ape'] = (abs(df_pred[y.columns[0]] - df_pred.pred)/df_pred[y.columns[0]]*100)

df_pred

Unnamed: 0,biomass(kg),pred,ape
3072,5204.554776,5137.844238,1.281772
539,726.955556,793.952393,9.216084
6628,231.204819,99.658333,56.896083
938,163.726000,342.628845,109.269661
7645,91.978261,91.708000,0.293831
...,...,...,...
5448,2534.370652,2515.693604,0.736950
4564,1982.846154,1891.348877,4.614442
1462,2010.171702,2121.439453,5.535236
3811,2592.708029,2466.743896,4.858400


In [18]:
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
mdape = round(df_pred.ape.median(),2)

print(f"RMSE: {rmse}")
print(f"MDAPE: {mdape} %")

RMSE: 468.1884
MDAPE: 5.6 %


In [19]:
rmse_cv = cross_val_score(model_xgb, x_new, y, cv=5, scoring=rmse_scorer)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [20]:
rmse_cv

array([ 286.7328,  259.5425,  553.5822,  381.304 , 4967.3868])

**__Catatan__** : <p>
Berdasarkan hasil evaluasi, model setelah feature selection lebih baik

# Pickle Model 

In [None]:
with open('model_xgb.pkl', 'wb') as file:
    pickle.dump(model_xgb, file)