In [1]:
import xgboost as xgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from numba import jit
import math
import mlflow

import xgboost as xgb

from libs.extract import extract_url
from libs.metrics import smape_fast
from libs.data_engineering import prepareDataXGBoost
import time

  '(.+)_([a-z][a-z]\.)?((?:wikipedia\.org)|(?:commons\.wikimedia\.org)|(?:www\.mediawiki\.org))_([a-z_-]+?)$')


In [2]:
size_data = 1000
lag = 8
encoding = "oneHotEncoding"#oneHotEncoding, label
model = 'XGBoost'

In [3]:
train_1 = pd.read_csv('../Data/train_1.csv')
train_1_reduce = train_1[0:size_data]

In [4]:
tps1= time.clock()
data = prepareDataXGBoost(train_1_reduce,lag, encoding)
tps2= time.clock()
print("Temps d'exécution du calcul du lag:" + str(tps2-tps1) + " secondes")

  """Entry point for launching an IPython kernel.
  tps1 = time.clock()


Temps d'exécution de la fonction extract:0.017981999999999942 secondes
Temps d'exécution de la réorganisation des colonnes0.008966000000000918 secondes
Temps d'exécution du calcul du lag:92.309861 secondes
Temps d'exécution du calcul du shift:0.25137400000001264 secondes
Temps d'exécution du changement d'index:0.9389159999999919 secondes
Temps d'exécution de l'encoding:0.16391699999999787 secondes
Temps d'exécution du calcul du lag:93.70568 secondes


In [5]:
data.columns

Index(['diff1', 'diff2', 'diff3', 'diff4', 'diff5', 'diff6', 'diff7', 'diff8',
       'Visitors', 'Visitors_shift_7', 'Visitors_shift_90',
       'agent_all-access_spider', 'site_wikipedia.org', 'country_zh'],
      dtype='object')

In [6]:
metrics_df = pd.DataFrame(columns=['RMSE','SMAPE','MAPE'])

X = data.drop('Visitors', axis = 1).values
y = data['Visitors'].values

tscv = TimeSeriesSplit()
print(tscv)

for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    paramSearch = {'max_depth':[2,4,6], 'eta':[1]}
    for max_depth in paramSearch['max_depth']:
        print(max_depth)
        for eta in paramSearch['eta']:
            print(eta)
            with mlflow.start_run():
                num_round = 2
                param = {'max_depth': max_depth, 'eta': eta, 'objective=reg' :'tweedie'}
                bst = xgb.train(param, dtrain, num_round)
                preds = bst.predict(dtest)

                rmse = np.sqrt(mean_squared_error(y_test, preds))
                smape = smape_fast(y_test, preds)
                mape = np.mean(np.abs(preds - y_test)/np.abs(y_test))
                print("Test RMSE: %.3f" % rmse)
                print("Test SMAPE: %.3f" % smape)
                print("Test MAPE: %.3f" % mape)
                new_metrics = pd.DataFrame([(rmse,smape,mape)], columns = ['RMSE','SMAPE','MAPE'])
                metrics_df= metrics_df.append(new_metrics, ignore_index=True)
                
                mean_RMSE = metrics_df['RMSE'].mean()
                sum_RMSE = metrics_df['RMSE'].sum()
                mean_SMAPE = metrics_df['SMAPE'].mean()
                mean_MAPE = metrics_df['MAPE'].mean()
                
                mlflow.log_param("lag",lag)
                mlflow.log_param("encoding",encoding)
                mlflow.log_param("model",model)
                mlflow.log_param("max_depth",param['max_depth'])
                mlflow.log_param("eta",param['eta'])
                mlflow.log_param("objective",param['objective=reg'])
                mlflow.log_metric('mean_RMSE', mean_RMSE)
                mlflow.log_metric('sum_RMSE', sum_RMSE)
                mlflow.log_metric('mean_SMAPE', mean_SMAPE)
                mlflow.end_run()

TimeSeriesSplit(max_train_size=None, n_splits=5)
TRAIN: [    0     1     2 ... 91667 91668 91669] TEST: [ 91670  91671  91672 ... 183333 183334 183335]
2
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 142.262
Test SMAPE: 77.118
Test MAPE: inf
4
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 166.061
Test SMAPE: 69.297
Test MAPE: inf
6
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 156.197
Test SMAPE: 70.909
Test MAPE: inf
TRAIN: [     0      1      2 ... 183333 183334 183335] TEST: [183336 183337 183338 ... 274999 275000 275001]
2
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 126.281
Test SMAPE: 71.179
Test MAPE: inf
4
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 142.438
Test SMAPE: 66.100
Test MAPE: inf
6
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 129.544
Test SMAPE: 57.674
Test MAPE: inf
TRAIN: [     0      1      2 ... 274999 275000 275001] TEST: [275002 275003 275004 ... 366665 366666 366667]
2
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 734.452
Test SMAPE: 63.373
Test MAPE: inf
4
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 732.652
Test SMAPE: 56.136
Test MAPE: inf
6
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 750.287
Test SMAPE: 56.655
Test MAPE: inf
TRAIN: [     0      1      2 ... 366665 366666 366667] TEST: [366668 366669 366670 ... 458331 458332 458333]
2
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 226.559
Test SMAPE: 63.057
Test MAPE: inf
4
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 285.100
Test SMAPE: 62.511
Test MAPE: inf
6
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 257.337
Test SMAPE: 56.452
Test MAPE: inf
TRAIN: [     0      1      2 ... 458331 458332 458333] TEST: [458334 458335 458336 ... 549997 549998 549999]
2
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 88.728
Test SMAPE: 65.968
Test MAPE: inf
4
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Test RMSE: 67.493
Test SMAPE: 64.241
Test MAPE: inf
6
1
Parameters: { objective=reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Test RMSE: 59.448
Test SMAPE: 57.588
Test MAPE: inf


