In [1]:
# imprescindible
import pandas as pd
import numpy as np

# to avoid some warnings messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# set seaborn and matplotlib default theme
sns.set_theme()
_sns_plotting_contex_ = sns.plotting_context()
sns.plotting_context('poster')

# set seaborn and matplotlib style to ...
# plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to use HTML codes within IPpython.display function
from IPython.display import HTML



### Data

In [2]:
def blogData_train_read() :
    u''' Reads and prepare data from blog feedback data train set
    
    '''

    data = pd.read_csv("./data/blogData_train.csv", header=None)
    data.drop_duplicates(inplace=True)
    data.reset_index(drop=True, inplace=True)
    
    header = pd.read_csv("./data/blogData_label.csv", header=None)
    header = list(header[0])
    
    if len(header) != data.shape[1] :
        raise Exception('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header
    
    return data


In [3]:
import os

In [4]:

def blogData_test_read() :
    u''' Reads and prepare data from blog feedback data test set
    
    '''

    filepath = './data/test/'
    filelist = [os.path.join(filepath, filename) for filename in os.listdir(filepath) if os.path.isfile(os.path.join(filepath, filename))]

    test_raw = pd.DataFrame()

    for filename in filelist :
        temp_raw = pd.read_csv(filename, header=None)
        temp_raw.drop_duplicates(inplace=True)
        test_raw = test_raw.append(temp_raw)

    return test_raw

In [5]:
data = blogData_train_read()
data.shape

(49203, 281)

In [6]:
# data

In [7]:
def blogData_labels(data) :
    u''' Create a dictionary with some keys associates to list of features in the final work dataframe
    
    '''
    columns = list(data.columns)

    labels = dict()

    labels['sd_nc_total_before_BT'] = columns[0:5]
    labels['sd_nc_24_before_BT'] = columns[5:10]
    labels['sd_nc_between_24_48'] = columns[10:15]
    labels['sd_nc_first_24_BT'] = columns[15:20]
    labels['sd_nc_diff_24_48'] = columns[20:25]
    
    labels['sd_nl_total_before_BT'] = columns[25:30]
    labels['sd_nl_24_before_BT'] = columns[30:35]
    labels['sd_nl_between_24_48'] = columns[35:40]
    labels['sd_nl_first_24_BT'] = columns[40:45]
    labels['sd_nl_diff_24_48'] = columns[45:50]
    
    labels['nc_total_before_BT'] = columns[50:51]
    labels['nc_24_before_BT'] = columns[51:52]
    labels['nc_between_24_48'] = columns[52:53]
    labels['nc_first_24_BT'] = columns[53:54]
    labels['nc_diff_24_48'] = columns[54:55]
    
    labels['nl_total_before_BT'] = columns[55:56]
    labels['nl_24_before_BT'] = columns[56:57]
    labels['nl_between_24_48'] = columns[57:58]
    labels['nl_first_24_BT'] = columns[58:59]
    labels['nl_diff_24_48'] = columns[59:60]
    
    labels['nc'] = columns[50:55]
    labels['nl'] = columns[55:60]

    labels['timelength_post_BT'] = columns[60:61]
    labels['length_post'] = columns[61:62]
    
    labels['tl_post'] = columns[60:62]

    labels['frequent_word'] = columns[62:262]

    labels['weekday_BT'] = columns[262:269]
    labels['weekday_post'] = columns[269:276]
    
    labels['parents'] = columns[276:280]
    labels['comments'] = columns[280:281]

    return labels


In [8]:

labels = blogData_labels(data)
target = 'comments'


---

In [9]:
ROUND = lambda v: round(v, 4)

---

In [10]:

from sklearn.preprocessing import StandardScaler


---
<a name="Regression_Models"></a>
### Regression Models

In [11]:

class Eval_Estimator :
    u'''
    '''
    
    def __init__(self, name, estimator, params=None) :
        self.name = name
        self.estimator = estimator
        self.params = params
        
        return    

In [12]:

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import time

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb

from sklearn.metrics import mean_squared_error


---

In [57]:

X_train = data.drop(columns=[target])

X_train = X_train.iloc[:, 0:62].copy()
y_train = data[target].copy()

scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    data=scaler.transform(X_train), 
    columns=list(X_train.columns)
)


test_raw = blogData_test_read()
X_test = test_raw.iloc[:,0:280]

X_test = X_test.iloc[:, 0:62].copy()
y_test = test_raw.iloc[:,-1]

# using train scaler
X_test = scaler.transform(X_test)



---

In [58]:
eval_models = []

# eval_models.append(Eval_Estimator(
#     name='Gradient Boosting Regressor', 
#     estimator=GradientBoostingRegressor(), 
#     params= {
#         'learning_rate': 0.01, # shrinks the contribution of each tree
#         'max_depth' : 4,
#         'min_samples_split' : 5,
#         'n_estimators' : 500, # boosting stages
#         'subsamples' : 1, # value < 1.0 reduce variance and increase bias
#     }
# ))


eval_models.append(Eval_Estimator(
    name='XGBoost L1 y L2', 
    estimator=xgb.XGBRegressor(), 
    params= {
        'eval_metric' : 'rmse', # root mean square error
        'gamma': 1, # (min_split_loss) minimum loss reduction
        'learning_rate': 0.2, # (eta) step size shrinkage
        'max_depth': 8, # maximum depth of tree
        # 'max_delta_step' : 0.5, # for classification extremely imbalanced
        'n_estimators': 1000, 
        'n_jobs': -2, # jobs in parallel -2 : all processors minus one
        'objective' : 'reg:squarederror', # regression with squared loss
        'random_state': 127, 
        'reg_alpha': 1000, # (alpha) L1 regularization
        'reg_lambda': 1000, # (lambda) L2 regularization
        'subsample' : 0.1, # prevents overfitting
    }
))


eval_models.append(Eval_Estimator(
    name='XGBoost L1 y L2', 
    estimator=xgb.XGBRegressor(), 
    params= {
        'eval_metric' : 'rmse', # root mean square error
        'gamma': 1, # (min_split_loss) minimum loss reduction
        'learning_rate': 0.2, # (eta) step size shrinkage
        'max_depth': 8, # maximum depth of tree
        # 'max_delta_step' : 0.5, # for classification extremely imbalanced
        'n_estimators': 1000, 
        'n_jobs': -2, # jobs in parallel -2 : all processors minus one
        'objective' : 'reg:squarederror', # regression with squared loss
        'random_state': 127, 
        # 'reg_alpha': 1000, # (alpha) L1 regularization
        # 'reg_lambda': 1000, # (lambda) L2 regularization
        'subsample' : 0.1, # prevents overfitting
    }
))


In [59]:
eval_columns = ['model', 'params', 'train RMSE', 'test RMSE']
eval_result = pd.DataFrame(columns=eval_columns)

for m in eval_models :
    m.estimator.set_params(**m.params)
    m.estimator.fit(X_train, y_train)
    
    y_pred_train = m.estimator.predict(X_train)
    rmse_train = ROUND(np.sqrt(mean_squared_error(y_train, y_pred_train)))

    y_pred_test = m.estimator.predict(X_test)
    rmse_test = ROUND(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    
    eval_result = eval_result.append(
        pd.Series(
            data=[m.name, 
                  m.params, 
                  rmse_train, 
                  rmse_test
                 ], 
            index=eval_result.columns), 
        ignore_index=True
    )

display(eval_result)

Unnamed: 0,model,params,train RMSE,test RMSE
0,XGBoost L1 y L2,"{'eval_metric': 'rmse', 'gamma': 1, 'learning_...",24.0195,22.5527
1,XGBoost L1 y L2,"{'eval_metric': 'rmse', 'gamma': 1, 'learning_...",12.4041,33.7544


In [22]:
if False :
    for m in eval_models :
        print(m.name, m.estimator.get_params())
        print()

---

---