In [1]:
# essentials
import pandas as pd
import numpy as np

# suppress warning messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme and keep a snapshot of the resulting plotting context.
sns.set_theme()
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): called like this (result unused, no `with` block) the next line
# only builds the 'poster' parameter set; it does not change any global setting.
# sns.set_context('poster') would be the call that applies it -- confirm intent.
sns.plotting_context('poster')

# Global font sizes for axes titles and axis labels.
# plt.style.use('classic')
plt.rcParams.update({'axes.titlesize': 18, 'axes.labelsize': 14})

# to render HTML through IPython.display
from IPython.display import HTML



### Data

In [2]:
def blogData_train_read(path="./data/blogData_train.csv") :
    u'''Read the blog feedback training set and drop duplicated rows.

    The CSV is read without a header row, so the columns are labelled with
    their integer positions. Exact duplicate rows are removed and the index
    is renumbered from 0.

    Parameters
    ----------
    path : str, optional
        Location of the training CSV. Defaults to the file used by this
        notebook.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated training data with a fresh ``RangeIndex``.
    '''
    # NOTE: column names from "./data/blogData_label.csv" are not applied
    # (that step was disabled in the original code), so callers have to
    # select columns by position.
    return (
        pd.read_csv(path, header=None)
        .drop_duplicates()
        .reset_index(drop=True)
    )


In [3]:
import os

---

In [4]:
def ROUND(v, ndigits=4) :
    u'''Round ``v`` to ``ndigits`` decimal places (4 by default) with the built-in ``round``.'''
    return round(v, ndigits)

---

In [5]:

from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error


---

---

In [6]:
def evaluate_model(test_dir='./data/test/') :
    u'''Train an XGBoost regressor on the training set and score it per test file.

    Steps:
      1. Load the training data with ``blogData_train_read()``; the first 280
         columns are the features and the last column is the target.
      2. Fit a ``StandardScaler`` on the training features and train an
         ``XGBRegressor`` on the scaled features.
      3. For each regular file in ``test_dir`` (read as a header-less CSV, with
         duplicate rows dropped), scale the features with the same scaler,
         predict, and compute the RMSE against the last column.

    Parameters
    ----------
    test_dir : str, optional
        Folder containing the test CSV files. Each file name must contain a
        10-character date label starting with ``2012`` (e.g. ``2012.02.05``),
        which is used as the case identifier.

    Returns
    -------
    pandas.DataFrame
        One row per test file, in sorted file-path order, with the columns
        ``case``, ``count``, ``y mean``, ``y std``, ``y min``, ``y max`` and
        ``RMSE``. Numeric summaries are rounded with ``ROUND``.

    Raises
    ------
    ValueError
        If a file name in ``test_dir`` does not contain the ``2012`` marker.
    '''
    n_features = 280      # leading columns used as model inputs
    date_marker = '2012'  # start of the date label embedded in test file names
    date_length = 10      # length of a label such as '2012.02.05'
    columns = ['case', 'count', 'y mean', 'y std', 'y min', 'y max', 'RMSE']

    # --- training data
    data_raw = blogData_train_read()
    X_train = data_raw.iloc[:, 0:n_features]
    y_train = data_raw.iloc[:, -1].copy()

    # The scaler is fitted on the training features only and reused unchanged
    # for every test file below.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    # --- model
    model_name = 'XGBRegressor L1 y L2'
    model_params = {
        'eval_metric' : 'rmse', # root mean square error
        'gamma': 1, # (min_split_loss) minimum loss reduction
        'learning_rate': 0.2, # (eta) step size shrinkage
        'max_depth': 8, # maximum depth of tree
        # 'max_delta_step' : 0.5, # for classification extremely imbalanced
        'n_estimators': 1000,
        # parallel jobs; -2 is meant as "all processors minus one"
        # (joblib convention -- TODO confirm xgboost treats it the same way)
        'n_jobs': -2,
        'objective' : 'reg:squarederror', # regression with squared loss
        'random_state': 127,
        'reg_alpha': 1000, # (alpha) L1 regularization
        'reg_lambda': 1000, # (lambda) L2 regularization
        'subsample' : 0.1, # row subsampling ratio, used to limit overfitting
    }
    estimator = xgb.XGBRegressor()
    estimator.set_params(**model_params)
    print('Entrenando modelo', model_name, '...')
    estimator.fit(X_train, y_train)

    # --- test files
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the result table deterministic.
    candidates = (os.path.join(test_dir, name) for name in os.listdir(test_dir))
    filelist = sorted(path for path in candidates if os.path.isfile(path))

    # --- scoring
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append() no longer exists in pandas >= 2.0 and appending row
    # by row re-copies the whole frame on every iteration.
    records = []
    for path in filelist :
        # Look for the date label in the file name only, so that a test_dir
        # containing the marker cannot shift the extracted label.
        basename = os.path.basename(path)
        pos = basename.find(date_marker)
        if pos < 0 :
            raise ValueError(
                "Cannot derive a case label: %r does not contain %r"
                % (path, date_marker)
            )
        case = basename[pos: pos + date_length]

        case_data = pd.read_csv(path, header=None).drop_duplicates()
        X = scaler.transform(case_data.iloc[:, 0:n_features])
        y = case_data.iloc[:, -1]
        p = estimator.predict(X)
        rmse = ROUND(np.sqrt(mean_squared_error(y, p)))

        records.append({
            'case': case,
            'count': y.shape[0],
            'y mean': ROUND(y.mean()),
            'y std': ROUND(y.std()),
            'y min': ROUND(y.min()),
            'y max': ROUND(y.max()),
            'RMSE': rmse,
        })

    # Passing `columns` keeps the expected layout even when no file was found.
    return pd.DataFrame(records, columns=columns)
# ---
# Train the model and score it on every file in the test folder
# (`results` gets one row per test file).
results = evaluate_model()

Entrenando modelo XGBRegressor L1 y L2 ...


In [20]:
# Peek at up to five randomly chosen per-file results. The fixed seed keeps the
# displayed rows stable across re-runs, and the size is capped so a results
# table with fewer than five rows does not raise.
display(results.sample(min(5, len(results)), random_state=127))

Unnamed: 0,case,count,y mean,y std,y min,y max,RMSE
4,2012.02.05,91,5.4066,32.218,0.0,294.0,14.7515
40,2012.03.12,84,4.9524,17.0781,0.0,93.0,14.7889
15,2012.02.16,137,6.8832,40.0274,0.0,452.0,33.8244
1,2012.02.02,130,6.0462,31.6323,0.0,325.0,18.6974
31,2012.03.03,128,2.3516,15.6319,0.0,171.0,17.6127


In [21]:
# Summary statistics of the per-file RMSE, transposed so they read as a single row.
display(results[['RMSE']].describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RMSE,60.0,21.01826,9.549356,5.3397,13.5861,18.89485,26.362575,44.9422
