# Import Library

In [2]:
# for data manipulating
import pandas as pd
import numpy as np

# for preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder

# for features selection 
from sklearn.feature_selection import RFE

# for visualization
import seaborn as sns
import shap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# for model
import xgboost as xgb
from sklearn.model_selection import train_test_split

# for evaluation
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

import warnings

# Menonaktifkan peringatan
warnings.simplefilter(action='ignore')

setattr(pd, "Int64Index", pd.Index)
setattr(pd, "Float64Index", pd.Index)

# Import Dataset

In [3]:
df = pd.read_csv(r"\Belajar\Jala Test\Dataset\sr_model.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,cycle_id,total_harvest(kg),total_seed,survival_rate(%),avg_pond_length,avg_pond_width,avg_pond_depth,long_cycle(day),total_feed(kg),avg_tray_number,avg_feed_remain%,count_fasting,total_mortal,avg_weight_mortal,avg_weight_sample
0,0,3458,444548.02,566669,78.45,80.09,60.07,1.2,112,9987.0,0.0,0.0,0.0,0.0,0.0,13.59
1,1,3459,440387.88,566669,77.72,78.62,58.97,1.2,121,10913.0,0.0,0.0,0.0,0.0,0.0,14.322308
2,2,4036,154350.0,172250,89.61,39.5,18.4,1.1,39,480.1,0.0,0.0,1.0,0.0,0.0,0.0


# Data Understanding

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2344 entries, 0 to 2343
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2344 non-null   int64  
 1   cycle_id           2344 non-null   int64  
 2   total_harvest(kg)  2344 non-null   float64
 3   total_seed         2344 non-null   int64  
 4   survival_rate(%)   2344 non-null   float64
 5   avg_pond_length    2344 non-null   float64
 6   avg_pond_width     2344 non-null   float64
 7   avg_pond_depth     2344 non-null   float64
 8   long_cycle(day)    2344 non-null   int64  
 9   total_feed(kg)     2344 non-null   float64
 10  avg_tray_number    2344 non-null   float64
 11  avg_feed_remain%   2344 non-null   float64
 12  count_fasting      2344 non-null   float64
 13  total_mortal       2344 non-null   float64
 14  avg_weight_mortal  2344 non-null   float64
 15  avg_weight_sample  2344 non-null   float64
dtypes: float64(12), int64(4)

In [5]:
df.isnull().sum()

Unnamed: 0           0
cycle_id             0
total_harvest(kg)    0
total_seed           0
survival_rate(%)     0
avg_pond_length      0
avg_pond_width       0
avg_pond_depth       0
long_cycle(day)      0
total_feed(kg)       0
avg_tray_number      0
avg_feed_remain%     0
count_fasting        0
total_mortal         0
avg_weight_mortal    0
avg_weight_sample    0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
# Anomali, selanjutnya akan di drop
df[df['survival_rate(%)'] > 100]

Unnamed: 0.1,Unnamed: 0,cycle_id,total_harvest(kg),total_seed,survival_rate(%),avg_pond_length,avg_pond_width,avg_pond_depth,long_cycle(day),total_feed(kg),avg_tray_number,avg_feed_remain%,count_fasting,total_mortal,avg_weight_mortal,avg_weight_sample
3,3,4038,405078.84,350000,115.74,60.00,45.00,1.5,94,7790.60,0.0,0.0,0.0,0.0,0.0,10.585366
4,4,4039,219808.62,210000,104.67,37.06,27.79,1.9,94,3770.50,0.0,0.0,0.0,0.0,0.0,10.881250
16,16,4254,301500.00,300000,100.50,0.00,0.00,0.0,30,597.00,0.0,0.0,1.0,0.0,0.0,0.000000
18,18,4349,112000.00,80000,140.00,20.00,20.00,1.5,54,816.50,0.0,0.0,0.0,0.0,0.0,0.000000
26,26,4552,12800.00,12000,106.67,4.00,6.00,1.2,37,32.62,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2329,2329,29393,56280.00,50000,112.56,26.00,25.00,1.2,50,578.90,0.0,0.0,0.0,120.0,18.0,8.300000
2332,2332,29451,156250.00,150000,104.17,20.00,20.00,1.5,53,1052.00,0.0,0.0,9.0,0.0,0.0,5.463333
2335,2335,29518,46200.00,40000,115.50,22.00,17.00,1.0,48,358.30,0.0,0.0,3.0,0.0,0.0,7.140000
2339,2339,29619,137250.00,70000,196.07,24.00,9.00,1.2,49,1043.40,1.0,0.0,1.0,0.0,0.0,5.635000


# Model

In [8]:
# filtered data
df_model = df[df['survival_rate(%)'] < 100].drop(['Unnamed: 0','cycle_id'],axis=1)

# variable
x = df_model.drop(['survival_rate(%)'],axis=1)
y = df_model[['survival_rate(%)']]

# split data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [9]:
model_xgb = xgb.XGBRegressor(
    base_score=0.5, 
    booster='gbtree',    
    n_estimators=1000,
    early_stopping_rounds=50,
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.01
    )

In [10]:
model_xgb.fit(x_train, y_train)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=50,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [11]:
y_pred = model_xgb.predict(x_test)

df_pred = y_test.copy()
df_pred['pred'] = y_pred
df_pred['ape'] = (abs(df_pred[y.columns[0]] - df_pred.pred)/df_pred[y.columns[0]]*100)

df_pred

Unnamed: 0,survival_rate(%),pred,ape
313,49.68,49.904808,0.452512
839,94.09,88.680016,5.749797
384,99.46,88.549873,10.969361
2295,58.04,59.349060,2.255445
1824,57.30,60.746620,6.015044
...,...,...,...
1801,33.97,36.472164,7.365806
380,75.70,80.338318,6.127236
532,61.10,63.058968,3.206166
371,91.04,79.763229,12.386611


In [12]:
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
mdape = round(df_pred.ape.median(),2)

print(f"RMSE: {rmse}")
print(f"MDAPE: {mdape} %")

RMSE: 5.4025
MDAPE: 5.65 %


**__Catatan__** : <p>
Berdasarkan hasil evaluasi, didapatkan hasil rmse dan mdape, umumnya selisih error yang dihasilkan model sebesar 5% dari data aktual

- Cross Validation 

In [13]:
def rmse(y_test, y_pred):
    return round(np.sqrt(mean_squared_error(y_test, y_pred)),4)

rmse_scorer = make_scorer(rmse)

rmse_cv = cross_val_score(model_xgb, x, y, cv=5, scoring=rmse_scorer)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [14]:
rmse_cv

array([10.942 ,  5.2012,  4.5583,  4.843 ,  6.3109])

**__Catatan__** : <p>
Berdasarkan hasil evaluasi, didapatkan hasil rmse yang cukup baik dengan cross validation dengan selisih 4 s/d 12 % 

# Features Selection

In [15]:
df_model.corr('spearman')['survival_rate(%)'].sort_values(ascending=False)

survival_rate(%)     1.000000
total_harvest(kg)    0.603732
total_feed(kg)       0.347889
long_cycle(day)      0.232922
avg_weight_sample    0.168640
total_seed           0.155158
avg_pond_depth       0.099701
avg_pond_width       0.051408
avg_tray_number     -0.028454
avg_feed_remain%    -0.041668
avg_weight_mortal   -0.053748
avg_pond_length     -0.074390
total_mortal        -0.120099
count_fasting       -0.176216
Name: survival_rate(%), dtype: float64

**__Catatan__** : <p>
Berdasarkan hasil korelasi, akan fokus pada fitur dengan tingkat korelasi > 0,10

In [16]:
# drop features
x_new = x.drop(['avg_pond_depth', 'avg_pond_width', 'avg_tray_number', 'avg_feed_remain%','avg_weight_mortal','avg_pond_length'],axis=1)

# split data 
x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.2, random_state=7)

In [17]:
x_new.columns.tolist()

['total_harvest(kg)',
 'total_seed',
 'long_cycle(day)',
 'total_feed(kg)',
 'count_fasting',
 'total_mortal',
 'avg_weight_sample']

In [18]:
model_xgb.fit(x_train, y_train)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=50,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [19]:
y_pred = model_xgb.predict(x_test)

df_pred = y_test.copy()
df_pred['pred'] = y_pred
df_pred['ape'] = (abs(df_pred[y.columns[0]] - df_pred.pred)/df_pred[y.columns[0]]*100)

df_pred

Unnamed: 0,survival_rate(%),pred,ape
313,49.68,49.739861,0.120492
839,94.09,89.048149,5.358541
384,99.46,88.223701,11.297304
2295,58.04,59.592106,2.674200
1824,57.30,62.059013,8.305433
...,...,...,...
1801,33.97,35.208714,3.646493
380,75.70,80.031303,5.721669
532,61.10,63.156460,3.365728
371,91.04,79.245377,12.955430


In [20]:
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
mdape = round(df_pred.ape.median(),2)

print(f"RMSE: {rmse}")
print(f"MDAPE: {mdape} %")

RMSE: 5.2083
MDAPE: 5.36 %


In [21]:
rmse_cv = cross_val_score(model_xgb, x_new, y, cv=5, scoring=rmse_scorer)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [22]:
rmse_cv

array([10.0543,  5.066 ,  4.4903,  5.4139,  5.9981])

**__Catatan__** : <p>
Berdasarkan hasil evaluasi, model setelah feature selection lebih baik 

# Pickel Model

In [None]:
with open('model_xgb.pkl', 'wb') as file:
    pickle.dump(model_xgb, file)