In [1]:
# imprescindible
import pandas as pd
import numpy as np

# to avoid some warnings messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# set seaborn and matplotlib default theme
sns.set_theme()
_sns_plotting_contex_ = sns.plotting_context()
sns.plotting_context('poster')

# set seaborn and matplotlib style to ...
# plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to use HTML codes within IPpython.display function
from IPython.display import HTML



### Data

In [2]:
def blogData_train_read() :
    u''' Reads and prepare data from blog feedback data train set
    
    '''

    data = pd.read_csv("./data/blogData_train.csv", header=None)
    data.drop_duplicates(inplace=True)
    data.reset_index(drop=True, inplace=True)
    
    header = pd.read_csv("./data/blogData_label.csv", header=None)
    header = list(header[0])
    
    if len(header) != data.shape[1] :
        raise Exception('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header
    
    return data


In [3]:
import os

In [4]:

def blogData_test_read() :
    u''' Reads and prepare data from blog feedback data test set
    
    '''

    filepath = './data/test/'
    filelist = [os.path.join(filepath, filename) for filename in os.listdir(filepath) if os.path.isfile(os.path.join(filepath, filename))]

    test_raw = pd.DataFrame()

    for filename in filelist :
        temp_raw = pd.read_csv(filename, header=None)
        temp_raw.drop_duplicates(inplace=True)
        test_raw = test_raw.append(temp_raw)

    return test_raw

---

In [5]:
ROUND = lambda v: round(v, 4)

---

In [6]:

from sklearn.preprocessing import StandardScaler


In [7]:

import xgboost as xgb

from sklearn.metrics import mean_squared_error


---

In [9]:
data_raw = blogData_train_read()
test_raw = blogData_test_read()

X_train = data_raw.iloc[:,0:280]
y_train = data_raw.iloc[:,-1].copy()

X_test = test_raw.iloc[:,0:280]
y_test = test_raw.iloc[:,-1].copy()


# X_train = X_train.iloc[:, 0:62].copy()
# X_test = X_test.iloc[:, 0:62].copy()

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



---

In [10]:

class Eval_Estimator :
    u'''
    '''
    
    def __init__(self, name, estimator, params=None) :
        self.name = name
        self.estimator = estimator
        self.params = params
        
        return    

In [11]:
models = []

models.append( Eval_Estimator(
    name='XGBoost L1 y L2', 
    estimator=xgb.XGBRegressor(), 
    params= {
        'eval_metric' : 'rmse', # root mean square error
        'gamma': 1, # (min_split_loss) minimum loss reduction
        'learning_rate': 0.2, # (eta) step size shrinkage
        'max_depth': 8, # maximum depth of tree
        # 'max_delta_step' : 0.5, # for classification extremely imbalanced
        'n_estimators': 1000, 
        'n_jobs': -2, # jobs in parallel -2 : all processors minus one
        'objective' : 'reg:squarederror', # regression with squared loss
        'random_state': 127, 
        'reg_alpha': 1000, # (alpha) L1 regularization
        'reg_lambda': 1000, # (lambda) L2 regularization
        'subsample' : 0.1, # prevents overfitting
    }
))



In [12]:
eval_columns = ['model', 'params', 'train RMSE', 'test RMSE']
eval_result = pd.DataFrame(columns=eval_columns)

for m in models :
    m.estimator.set_params(**m.params)
    m.estimator.fit(X_train, y_train)
    
    y_pred_train = m.estimator.predict(X_train)
    rmse_train = ROUND(np.sqrt(mean_squared_error(y_train, y_pred_train)))

    y_pred_test = m.estimator.predict(X_test)
    rmse_test = ROUND(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    
    eval_result = eval_result.append(
        pd.Series(
            data=[m.name, 
                  m.params, 
                  rmse_train, 
                  rmse_test
                 ], 
            index=eval_result.columns), 
        ignore_index=True
    )

display(eval_result)

Unnamed: 0,model,params,train RMSE,test RMSE
0,XGBoost L1 y L2,"{'eval_metric': 'rmse', 'gamma': 1, 'learning_...",23.4191,23.2356


---

In [13]:
model = models[0]
print(model.name)

XGBoost L1 y L2


---

In [14]:
class Eval_Test :
    u'''
    '''
    
    def __init__(self, case, data) :
        self.case = case
        self.data = data
        return    

def eval_blogData_test() :
    u''' Reads and prepare data from blog feedback data test set
    
    '''

    filepath = './data/test/'
    filelist = [os.path.join(filepath, filename) for filename in os.listdir(filepath) if os.path.isfile(os.path.join(filepath, filename))]

    temp = []
    for filename in filelist :
        temp_raw = pd.read_csv(filename, header=None)
        temp_raw.drop_duplicates(inplace=True)
        
        pos = filename.index('2012')
        temp.append( Eval_Test(filename[pos: pos+10], temp_raw) )

    return temp

temp = eval_blogData_test()



In [64]:
results = pd.DataFrame(columns=['case', 'count', 'y mean', 'y std', 'y min', 'y max', 'RMSE'])

for e in temp :
    # if m.sum() > 0 :
        X = e.data.iloc[:,0:280]
        y = e.data.iloc[:,-1]
        X = scaler.transform(X)
        p = model.estimator.predict(X)
        rmse = ROUND(np.sqrt(mean_squared_error(y, p)))

        results = results.append(
            pd.Series(
                data=[e.case, 
                      y.shape[0], 
                      ROUND(y.mean()), 
                      ROUND(y.std()), 
                      ROUND(y.min()), 
                      ROUND(y.max()), 
                      rmse
                     ], 
                index=results.columns), 
            ignore_index=True
        )
    

In [65]:
display(results)

Unnamed: 0,case,count,y mean,y std,y min,y max,RMSE
0,2012.02.01,113,7.7699,40.4976,0.0,402.0,23.9641
1,2012.02.02,130,6.0462,31.6323,0.0,325.0,18.6974
2,2012.02.03,115,7.3043,41.8691,0.0,428.0,31.2256
3,2012.02.04,101,2.5446,8.8584,0.0,57.0,7.8083
4,2012.02.05,91,5.4066,32.218,0.0,294.0,14.7515
5,2012.02.06,82,4.939,26.3717,0.0,234.0,15.8392
6,2012.02.07,102,4.6078,13.7842,0.0,104.0,30.8227
7,2012.02.08,122,9.6066,41.9735,0.0,298.0,32.1069
8,2012.02.09,144,6.2639,36.1968,0.0,381.0,25.9801
9,2012.02.10,135,4.9185,29.0674,0.0,327.0,16.8232


In [66]:
results.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
y mean,60.0,5.480815,2.123278,0.9175,4.064425,5.3562,6.74125,10.9417
y std,60.0,28.198525,13.081355,2.5152,16.81725,27.655,36.23945,56.0102
y min,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
y max,60.0,265.483333,146.204117,19.0,147.75,261.5,363.75,657.0
RMSE,60.0,21.01826,9.549356,5.3397,13.5861,18.89485,26.362575,44.9422
