In [1]:
# essentials
import pandas as pd
import numpy as np

# silence all warning messages (this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme to seaborn and matplotlib.
sns.set_theme()
# Snapshot of the plotting-context parameters in effect after set_theme().
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): called bare (not in a `with` block) and with its return value
# discarded, this looks like a no-op; if the 'poster' scaling is meant to be
# applied globally, sns.set_context('poster') is presumably intended -- confirm.
sns.plotting_context('poster')

# Override the title and axis-label font sizes on top of the theme.
# plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to render raw HTML through IPython.display
from IPython.display import HTML



### Data

In [2]:
def blogData_train_read(train_path="./data/blogData_train.csv",
                        label_path="./data/blogData_label.csv") :
    u''' Read the blog-feedback training set and attach its column names.

    The training CSV is read without a header row, exact duplicate rows are
    dropped and the index is renumbered from 0. Column names come from the
    first column of the label CSV (also read without a header row).

    Parameters
    ----------
    train_path : str or path-like
        Location of the header-less training CSV.
    label_path : str or path-like
        Location of the CSV whose first column lists one name per feature.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated training data with named columns.

    Raises
    ------
    ValueError
        If the number of names differs from the number of data columns.
    '''

    # Chained instead of inplace=True so no intermediate frame is mutated.
    data = (
        pd.read_csv(train_path, header=None)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    header = list(pd.read_csv(label_path, header=None)[0])

    # Fail loudly rather than mislabel columns; ValueError is still an
    # Exception, so existing `except Exception` handlers keep working.
    if len(header) != data.shape[1] :
        raise ValueError('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header

    return data


In [3]:
# Load the de-duplicated training frame and display its (rows, columns) shape.
data = blogData_train_read()
data.shape

(49203, 281)

In [4]:
# data

In [5]:
def blogData_labels(data) :
    u''' Map group names to the lists of column names they cover.

    Groups are defined purely by position in ``data.columns``; each entry of
    the table below is ``(key, start, stop)`` and selects
    ``columns[start:stop]``. Some groups overlap on purpose ('nc', 'nl' and
    'tl_post' span ranges that are also exposed one column at a time).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column order the positions refer to.

    Returns
    -------
    dict
        Group key -> list of column names, in the order of the table.
    '''
    names = list(data.columns)

    spans = (
        # five-column blocks, prefix 'sd_nc_'
        ('sd_nc_total_before_BT', 0, 5),
        ('sd_nc_24_before_BT', 5, 10),
        ('sd_nc_between_24_48', 10, 15),
        ('sd_nc_first_24_BT', 15, 20),
        ('sd_nc_diff_24_48', 20, 25),
        # five-column blocks, prefix 'sd_nl_'
        ('sd_nl_total_before_BT', 25, 30),
        ('sd_nl_24_before_BT', 30, 35),
        ('sd_nl_between_24_48', 35, 40),
        ('sd_nl_first_24_BT', 40, 45),
        ('sd_nl_diff_24_48', 45, 50),
        # single columns, prefix 'nc_'
        ('nc_total_before_BT', 50, 51),
        ('nc_24_before_BT', 51, 52),
        ('nc_between_24_48', 52, 53),
        ('nc_first_24_BT', 53, 54),
        ('nc_diff_24_48', 54, 55),
        # single columns, prefix 'nl_'
        ('nl_total_before_BT', 55, 56),
        ('nl_24_before_BT', 56, 57),
        ('nl_between_24_48', 57, 58),
        ('nl_first_24_BT', 58, 59),
        ('nl_diff_24_48', 59, 60),
        # the two single-column runs above, each as one group
        ('nc', 50, 55),
        ('nl', 55, 60),
        # post-level columns, individually and combined
        ('timelength_post_BT', 60, 61),
        ('length_post', 61, 62),
        ('tl_post', 60, 62),
        ('frequent_word', 62, 262),
        ('weekday_BT', 262, 269),
        ('weekday_post', 269, 276),
        ('parents', 276, 280),
        ('comments', 280, 281),
    )

    return {key: names[start:stop] for key, start, stop in spans}


In [6]:

# Group-name -> column-names lookup derived from the training frame's column order.
labels = blogData_labels(data)
# Column selected from `data` as the regression target further down.
target = 'comments'


---

In [7]:
def ROUND(v) :
    u''' Round ``v`` to 4 decimal places with the built-in ``round``. '''
    return round(v, 4)

---

In [8]:

from sklearn.preprocessing import StandardScaler


---
<a name="Regression_Models"></a>
### Regression Models

In [9]:

class RM_Estimator :
    u''' Bundle a regressor with a display name and its grid-search settings.

    Attributes
    ----------
    name
        Label for the model.
    estimator
        The estimator object to tune.
    gs_param_grid
        Parameter grid for the search; ``None`` when not supplied.
    '''

    def __init__(self, name, estimator, gs_param_grid=None) :
        # Plain attribute holder: the three arguments are stored as given.
        self.name, self.estimator, self.gs_param_grid = name, estimator, gs_param_grid

In [10]:

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import time

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb

from sklearn.metrics import mean_squared_error


In [11]:
def rm_evaluate(rm_result, rm_models, X_train, y_train) :
    u''' Grid-search every model in ``rm_models`` and collect one row per model.

    For each entry a ``GridSearchCV`` is fitted on ``(X_train, y_train)`` with
    3-fold shuffled ``StratifiedKFold`` splits and the
    ``neg_root_mean_squared_error`` scorer; the best configuration is refit
    on the full training data.

    Parameters
    ----------
    rm_result : pandas.DataFrame
        Frame the new rows are added to. Its columns label the four values
        recorded per model, in this order: model name, best parameters,
        elapsed fit time in seconds, RMSE. The frame itself is not modified.
    rm_models : iterable
        Objects exposing ``name``, ``estimator`` and ``gs_param_grid``.
    X_train, y_train
        Training features and target, passed to ``fit`` and ``predict``.

    Returns
    -------
    pandas.DataFrame
        ``rm_result`` followed by one row per evaluated model, re-indexed
        from 0. When ``rm_models`` is empty, ``rm_result`` is returned as is.
    '''

    scoring = 'neg_root_mean_squared_error'
    rows = []

    for rm in rm_models :

        # NOTE(review): StratifiedKFold stratifies on class labels; for a
        # regression target a plain KFold is presumably intended -- confirm.
        cv = StratifiedKFold(n_splits=3, random_state=11, shuffle=True)

        gs = GridSearchCV(
            estimator=rm.estimator, # scikit-learn estimator interface
            param_grid=rm.gs_param_grid, # dictionary: key=parameter, value=list of candidate values
            scoring=scoring, # strategy used to score each cross-validation fold
            n_jobs=-2, # jobs in parallel -2 : all processors minus one
            refit=True, # refit estimator using best parameters
            cv=cv, # cross-validation splitting strategy
            return_train_score=False, # do not compute training scores
            verbose=1 # display fold parameters, score, time, ...
        )

        print('Gridsearch para', rm.name, '...')

        started = time.time()
        gs.fit(X_train, y_train)
        gs_time = ROUND(time.time() - started)

        # NOTE(review): this RMSE is measured on the same data the refit
        # model was trained on (in-sample); the cross-validated score is
        # available as -gs.best_score_ if that is what should be compared.
        y_pred = gs.predict(X_train)
        gs_rmse = ROUND(np.sqrt(mean_squared_error(y_train, y_pred)))

        rows.append([rm.name, gs.best_params_, gs_time, gs_rmse])

    if not rows :
        return rm_result

    # DataFrame.append was removed in pandas 2.0; build the new rows once and
    # concatenate instead of growing the frame inside the loop.
    new_rows = pd.DataFrame(rows, columns=rm_result.columns)
    frames = [new_rows] if rm_result.empty else [rm_result, new_rows]

    return pd.concat(frames, ignore_index=True)
    

In [12]:

# Candidate models for rm_evaluate; starts empty and is filled by the append calls below.
rm_models = []

# rm_models.append(
#     RM_Estimator(
#         name='Linear Regression',
#         estimator=LinearRegression(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Ridge',
#         estimator=Ridge(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Lasso',
#         estimator=Lasso(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )


# rm_models.append(
#     RM_Estimator(
#         name='Elastic Net',
#         estimator=ElasticNet(),
#         gs_param_grid={
#             'alpha' : [1.0], 
#             'l1_ratio' : [0, 0.5, 1] # 0 : no L2 penalty (Ridge);  1 : no L1 penalty (Lasso)
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='K-Nearest Neighbors',
#         estimator=KNeighborsRegressor(),
#         gs_param_grid={
#             'n_jobs' : [-2], 
#             'n_neighbors' : [5, 10], 
#             'p' : [2], # euclidian_distance
#             'weights' : ['uniform'] # equally weighted
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Random Forest Regressor',
#         estimator=RandomForestRegressor(),
#         gs_param_grid={
#             'max_depth' : [3], 
#             'n_estimators' : [500], 
#             'n_jobs' : [-2], 
#             'random_state' : [127]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Gradient Boosting Regressor',
#         estimator=GradientBoostingRegressor(),
#         gs_param_grid={
#             'learning_rate' : [0.1, 0.2], 
#             'max_depth' : [3], 
#             'n_estimators' : [500], 
#             'random_state' : [127], 
#             'verbose' : [0]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='XGBoost (default)',
#         estimator=xgb.XGBRegressor(),
#         gs_param_grid={
#             'gamma' : [0], # (min_split_loss) minimum loss reduction
#             'learning_rate' : [0.3], # (eta) step size shrinkage
#             'max_depth' : [6], # maximum depth of tree
#             'n_estimators' : [500], 
#             'n_jobs' : [-2], # jobs in parallel -2 : all processors minus one
#             'random_state' : [127], 
#             'reg_alpha' : [0], # (alpha) L1 regularization
#             'reg_lambda' : [1] # (lambda) L2 regularization
#         }
#     )
# )

# Register the only active candidate: XGBoost with both L1 and L2 penalties.
# Every grid entry is a one-element list, so the search fits one configuration.
rm_models.append(
    RM_Estimator(
        name='XGBoost L1 y L2',
        estimator=xgb.XGBRegressor(),
        gs_param_grid=dict(
            gamma=[1],
            learning_rate=[0.2],
            max_depth=[12],
            n_estimators=[1000],
            n_jobs=[-2],  # all processors minus one
            random_state=[127],
            reg_alpha=[1000],  # L1 regularization
            reg_lambda=[1000],  # L2 regularization
        ),
    )
)



---

In [13]:

# Split the frame into target vector and feature matrix.
y_train = data[target].copy()
X_train = data.drop(columns=[target])

# Fit the scaler on the features, then swap them for their scaled version
# while keeping the original column names.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns.tolist())



---

In [14]:
# Disabled cell: the `if False` guard keeps the grid search below from running.
if False :
    # Result table: one row per evaluated model.
    rm_columns = ['model', 'params', 'time', 'RMSE']
    rm_result = pd.DataFrame(columns=rm_columns)

    rm_result = rm_evaluate(rm_result, rm_models, X_train, y_train)

    # Widen columns so the parameter dicts are not truncated on display.
    pd.options.display.max_colwidth = 500 
    # NOTE(review): the sorted frame is neither assigned nor the cell's last
    # top-level expression, so it is presumably never shown -- confirm intent.
    rm_result.sort_values(by=['RMSE'], axis='index')

---
{'gamma': 1, 'learning_rate': 0.2, 'max_depth': 12, 'n_estimators': 1000, 'n_jobs': -2, 'random_state': 127, 'reg_alpha': 1000, 'reg_lambda': 1000} : RMSE -> 14.91 (*)

{'gamma': 1, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 1000, 'n_jobs': -2, 'random_state': 127, 'reg_alpha': 1000, 'reg_lambda': 1000} : RMSE -> 15.45

{'gamma': 1, 'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 1000, 'n_jobs': -2, 'random_state': 127, 'reg_alpha': 1000, 'reg_lambda': 1000} : RMSE -> 16.36

{'gamma': 1, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 2000, 'n_jobs': -2, 'random_state': 127, 'reg_alpha': 1000, 'reg_lambda': 1000} : RMSE -> 17.69 

{'gamma': 1, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 1000, 'n_jobs': -2, 'random_state': 127, 'reg_alpha': 1000, 'reg_lambda': 1000} : RMSE -> 17.69

{'gamma': 1, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 1000, 'n_jobs': -2, 'random_state': 127, 'reg_alpha': 1500, 'reg_lambda': 1500} : RMSE -> 19.52


---

In [19]:
# Unconfigured XGBoost regressor; its hyper-parameters are set in a later cell.
model = xgb.XGBRegressor()


In [27]:
# Hyper-parameters of the configuration marked (*) in the results listed above.
model_params = dict(
    gamma=1,
    learning_rate=0.2,
    max_depth=12,
    n_estimators=1000,
    n_jobs=-2,
    random_state=127,
    reg_alpha=1000,
    reg_lambda=1000,
)

# Configure from the dict so the values are written down only once.
model.set_params(**model_params)

model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=1, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.2, max_delta_step=0,
             max_depth=12, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=-2,
             num_parallel_tree=1, predictor='auto', random_state=127,
             reg_alpha=1000, reg_lambda=1000, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [51]:
# Pair every fitted feature with its importance, most important first.
# Names come from X_train (what the model was fitted on) rather than from
# data.columns, which also holds the target and only lined up through index
# alignment; a length mismatch now raises instead of being silently dropped.
result = (
    pd.DataFrame({
        'feature': list(X_train.columns),
        'value': model.feature_importances_,
    })
    .sort_values(by='value', ascending=False)
)

In [49]:
# Show the 30 highest-importance features (result is already sorted descending).
result.head(30)

Unnamed: 0,feature,value
9,median_nc_24_before_BT,0.157462
21,std_nc_diff_24_48,0.153757
6,std_nc_24_before_BT,0.123043
20,media_nc_diff_24_48,0.070029
1,std_nc_total_before_BT,0.042128
0,media_nc_total_before_BT,0.025601
58,nl_first_24_BT,0.024752
54,nc_diff_24_48,0.012668
10,media_nc_between_24_48,0.010701
51,nc_24_before_BT,0.010268


In [56]:
# Second, unfitted XGBRegressor under a scratch name; it receives model_params in a later cell.
pepe = xgb.XGBRegressor()

In [55]:
# Display the hyper-parameter dict defined in an earlier cell.
model_params

{'gamma': 1,
 'learning_rate': 0.2,
 'max_depth': 12,
 'n_estimators': 1000,
 'n_jobs': -2,
 'random_state': 127,
 'reg_alpha': 1000,
 'reg_lambda': 1000}

In [57]:
# Apply model_params as keyword arguments; the cell output is the configured estimator's repr.
pepe.set_params(**model_params)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=1, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.2, max_delta_step=None, max_depth=12,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=-2, num_parallel_tree=None,
             predictor=None, random_state=127, reg_alpha=1000, reg_lambda=1000,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)