# Import Library

In [1]:
# for data manipulating
import pandas as pd
import numpy as np

# for preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder

# for features selection 
from sklearn.feature_selection import RFE

# for visualization
import seaborn as sns
import shap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# for model
import xgboost as xgb
from sklearn.model_selection import train_test_split

# for evaluation
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

import warnings

# Menonaktifkan peringatan
warnings.simplefilter(action='ignore')

setattr(pd, "Int64Index", pd.Index)
setattr(pd, "Float64Index", pd.Index)

pd.set_option('display.max_columns',None)
pd.options.display.float_format = '{:.2f}'.format

# Import Dataset

In [2]:
df = pd.read_csv(r"\Belajar\Jala Test\Dataset\price.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,cycle_id,size,weight,selling_price
0,0,4038.0,32.0,32.04,1121400.0
1,1,4038.0,55.0,5311.92,370793264.0
2,2,4038.0,68.0,425.48,27908935.0


# Data Understanding

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6294 entries, 0 to 6293
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     6294 non-null   int64  
 1   cycle_id       6294 non-null   float64
 2   size           6294 non-null   float64
 3   weight         6294 non-null   float64
 4   selling_price  6294 non-null   float64
dtypes: float64(4), int64(1)
memory usage: 246.0 KB


In [4]:
df.isnull().sum()

Unnamed: 0       0
cycle_id         0
size             0
weight           0
selling_price    0
dtype: int64

In [5]:
df.duplicated().sum()

0

# Model

In [6]:
# filtered data
df_model = df.drop(['Unnamed: 0','cycle_id'],axis=1)

# variable
x = df_model.drop(['selling_price'],axis=1)
y = df_model[['selling_price']]

# split data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [7]:
model_xgb = xgb.XGBRegressor(
    base_score=0.5, 
    booster='gbtree',    
    n_estimators=1100,
    early_stopping_rounds=50,
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.01
    )

In [8]:
model_xgb.fit(x_train, y_train)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=50,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1100, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [9]:
y_pred = model_xgb.predict(x_test)

df_pred = y_test.copy()
df_pred['pred'] = y_pred
df_pred['ape'] = (abs(df_pred[y.columns[0]] - df_pred.pred)/df_pred[y.columns[0]]*100)
df_pred.loc[df_pred['ape'] == np.inf, 'ape'] = np.nan

df_pred

Unnamed: 0,selling_price,pred,ape
1537,1858400.00,1199220.38,35.47
1971,53069152.50,62597800.00,17.96
135,73602271.68,54733132.00,25.64
4705,53998000.00,39293952.00,27.23
3143,38301032.00,26775334.00,30.09
...,...,...,...
6041,19165542.72,16251366.00,15.21
4858,672000000.00,116787088.00,82.62
3326,6071520.00,6653282.00,9.58
2172,246000.00,925030.31,276.03


In [10]:
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
mdape = round(df_pred.ape.median(),2)

print(f"RMSE: {rmse}")
print(f"MDAPE: {mdape} %")

RMSE: 323446831.0327
MDAPE: 24.63 %


**__Catatan__** : <p>
Berdasarkan hasil evaluasi, didapatkan hasil rmse dan mdape, umumnya selisih error mdape yang dihasilkan model sebesar 24% dan rmse  323.360.639 juta dari data aktual

- Cross Validation 

In [11]:
def rmse(y_test, y_pred):
    return round(np.sqrt(mean_squared_error(y_test, y_pred)),4)

rmse_scorer = make_scorer(rmse)

rmse_cv = cross_val_score(model_xgb, x, y, cv=5, scoring=rmse_scorer)

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [12]:
rmse_cv.tolist()

[68095500.931, 57118916.146, 25857083.0253, 327507273.5448, 319513652.6347]

**__Catatan__** : <p>
Berdasarkan hasil evaluasi, hal ini dikarenakan data yang kurang robust, atau harga yang bersifat cukup fluktuatif

# Pickle Model 

In [None]:
with open('model_xgb.pkl', 'wb') as file:
    pickle.dump(model_xgb, file)