In [1]:
# imprescindible
import pandas as pd
import numpy as np

# to avoid some warnings messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme
sns.set_theme()
# Snapshot of the plotting-context parameters that are active right after set_theme()
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): the value returned by this call is discarded, so it presumably does not
# switch the active context; if the 'poster' scaling is wanted, `sns.set_context('poster')`
# (or a `with sns.plotting_context('poster'):` block) looks like the intended form - confirm
sns.plotting_context('poster')

# Font sizes for axes titles and axes labels
# plt.style.use('classic')
plt.rcParams.update({
    'axes.titlesize' : 18,
    'axes.labelsize' : 14,
})

# to render HTML snippets through IPython.display
from IPython.display import HTML



### Data

In [2]:
def blogData_train_read(data_path="./data/blogData_train.csv", label_path="./data/blogData_label.csv") :
    u''' Read the blog feedback train set and attach its column names

    The data file is read without a header row, exact duplicate rows are dropped
    and the index is renumbered from 0. Column names are taken from the first
    column of the label file (also read without a header row).

    Parameters
    ----------
    data_path : path of the headerless CSV holding the observations
    label_path : path of the headerless CSV whose first column lists one name per data column

    Returns
    -------
    pandas.DataFrame with the de-duplicated rows and the named columns

    Raises
    ------
    ValueError when the number of names differs from the number of data columns
    '''

    # Method chain instead of inplace=True: same result, no hidden mutation
    data = (
        pd.read_csv(data_path, header=None)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    header = list(pd.read_csv(label_path, header=None)[0])

    # ValueError is still caught by any caller that handles the former bare Exception
    if len(header) != data.shape[1] :
        raise ValueError('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header

    return data


In [3]:
# Load the train set (duplicate rows removed, columns named) and show its (rows, columns)
data = blogData_train_read()
data.shape

(49203, 281)

In [4]:
# data

In [5]:
def blogData_labels(data) :
    u''' Group the columns of the work dataframe under short keys

    Each key is mapped to a list of column names taken from ``data.columns`` by
    position (half-open ``[start, stop)`` ranges). Some keys overlap on purpose:
    'nc', 'nl' and 'tl_post' cover the same positions as the single-column keys
    listed just before them.

    Parameters
    ----------
    data : object exposing ``columns`` (e.g. a pandas DataFrame)

    Returns
    -------
    dict : key -> list of column names, in the order of the table below
    '''
    names = list(data.columns)

    # (key, start, stop) - positions refer to the column order of `data`
    spans = (
        ('sd_nc_total_before_BT', 0, 5),
        ('sd_nc_24_before_BT', 5, 10),
        ('sd_nc_between_24_48', 10, 15),
        ('sd_nc_first_24_BT', 15, 20),
        ('sd_nc_diff_24_48', 20, 25),

        ('sd_nl_total_before_BT', 25, 30),
        ('sd_nl_24_before_BT', 30, 35),
        ('sd_nl_between_24_48', 35, 40),
        ('sd_nl_first_24_BT', 40, 45),
        ('sd_nl_diff_24_48', 45, 50),

        ('nc_total_before_BT', 50, 51),
        ('nc_24_before_BT', 51, 52),
        ('nc_between_24_48', 52, 53),
        ('nc_first_24_BT', 53, 54),
        ('nc_diff_24_48', 54, 55),

        ('nl_total_before_BT', 55, 56),
        ('nl_24_before_BT', 56, 57),
        ('nl_between_24_48', 57, 58),
        ('nl_first_24_BT', 58, 59),
        ('nl_diff_24_48', 59, 60),

        ('nc', 50, 55),
        ('nl', 55, 60),

        ('timelength_post_BT', 60, 61),
        ('length_post', 61, 62),

        ('tl_post', 60, 62),

        ('frequent_word', 62, 262),

        ('weekday_BT', 262, 269),
        ('weekday_post', 269, 276),

        ('parents', 276, 280),
        ('comments', 280, 281),
    )

    return {key: names[start:stop] for key, start, stop in spans}


In [6]:

# Map each feature-group key to the list of columns it covers
labels = blogData_labels(data)
# Name of the column to predict: it is dropped from the features and used as y further down
target = 'comments'


---

In [7]:
def ROUND(v) :
    u''' Round ``v`` to 4 decimal places (used for the reported times and errors) '''
    return round(v, 4)

---

In [8]:

from sklearn.preprocessing import StandardScaler


---
<a name="Regression_Models"></a>
### Regression Models

In [9]:

class RM_Estimator :
    u''' Bundle a regression estimator with a display name and its grid-search grid

    Attributes
    ----------
    name : label shown when the model is reported
    estimator : estimator object handed to the grid search
    gs_param_grid : parameter grid for the grid search (None when not supplied)
    '''

    def __init__(self, name, estimator, gs_param_grid=None) :
        self.name, self.estimator = name, estimator
        self.gs_param_grid = gs_param_grid

In [10]:

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import time

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error


In [11]:


def rm_gridsearch(estimator, param_grid, scoring='neg_root_mean_squared_error', cv=None, n_jobs=-2) :
    u''' Build (without fitting) the GridSearchCV used for the regression models of this work

    Parameters
    ----------
    estimator : object implementing the scikit-learn estimator interface
    param_grid : dict mapping a parameter name to the list of values to try
    scoring : strategy used to score each candidate on the validation folds
    cv : cross-validation splitting strategy; None keeps the historical default,
        StratifiedKFold(n_splits=5, shuffle=True, random_state=11).
        NOTE(review): StratifiedKFold stratifies on the target treated as class labels
        and is documented for classification targets; for a continuous regression
        target pass e.g. a KFold splitter instead.
    n_jobs : number of parallel jobs; -2 means all processors but one

    Returns
    -------
    GridSearchCV, not fitted, configured to refit the best candidate on the whole data
    '''

    if cv is None :
        cv = StratifiedKFold(n_splits=5, random_state=11, shuffle=True)

    return GridSearchCV(
        estimator=estimator, # scikit-learn estimator interface
        param_grid=param_grid, # dictionary: key=parameter, value=list of candidate values
        scoring=scoring, # how each candidate is scored on the validation folds
        n_jobs=n_jobs, # parallel jobs (-2 : all processors minus one)
        refit=True, # refit the estimator on all the data using the best parameters
        cv=cv, # cross-validation splitting strategy
        return_train_score=False # do not compute scores on the training folds
    )


In [12]:

rm_models = []

# rm_models.append(
#     RM_Estimator(
#         name='Linear Regression',
#         estimator=LinearRegression(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Ridge',
#         estimator=Ridge(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Lasso',
#         estimator=Lasso(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )


# rm_models.append(
#     RM_Estimator(
#         name='Elastic Net',
#         estimator=ElasticNet(),
#         gs_param_grid={
#             'alpha' : [1.0], 
#             'l1_ratio' : [0, 0.5, 1] # 0 : L2 penalty only (Ridge-like);  1 : L1 penalty only (Lasso)
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='K-Nearest Neighbors',
#         estimator=KNeighborsRegressor(),
#         gs_param_grid={
#             'n_neighbors' : [5, 10], 
#             'weights' : ['uniform'], # equally weighted
#             'p' : [2], # euclidian_distance
#             'n_jobs' : [-2]
#         }
#     )
# )

# Models evaluated in this run. `random_state` is passed through the grid, so it is part
# of the parameters reported for the best candidate.
rm_models.extend([
    RM_Estimator(
        name='Random Forest Regressor',
        estimator=RandomForestRegressor(),
        gs_param_grid={
            'n_estimators' : [100, 200],
            'random_state' : [127]
            # other options, currently disabled:
            # 'max_depth' : [3],
            # 'bootstrap' : [True],
            # 'n_jobs' : [-1],
            # 'max_samples' : [0.3],
        }
    ),
    RM_Estimator(
        name='Gradient Boosting Regressor',
        estimator=GradientBoostingRegressor(),
        gs_param_grid={
            'n_estimators' : [100, 200],
            'random_state' : [127]
            # other options, currently disabled:
            # 'learning_rate' : [0.1],
            # 'subsample' : [1.0],
            # 'criterion' : ['friedman_mse'],
            # 'max_depth' : [3],
        }
    ),
])



---

In [13]:
def rm_evaluate(rm_result, rm_models, X_train, y_train) :
    u''' Grid-search every model and add one summary row per model to ``rm_result``

    Parameters
    ----------
    rm_result : DataFrame whose columns receive, in this order, the model name,
        the best parameters, the elapsed seconds and the RMSE
    rm_models : iterable of objects exposing ``name``, ``estimator`` and ``gs_param_grid``
    X_train, y_train : features and target handed to the grid search

    Returns
    -------
    DataFrame : ``rm_result`` followed by the new rows, index renumbered from 0
        (``rm_result`` itself when ``rm_models`` is empty)

    Notes
    -----
    The RMSE is computed from predictions on ``X_train``, the same data the refitted
    best estimator was trained on, so it is an in-sample error and not the
    cross-validated score of the search.
    '''

    rm_columns = rm_result.columns
    rows = []

    for rm in rm_models :

        print('Gridsearch para', rm.name, '...', end=' ')

        # perf_counter is a monotonic clock, so the interval is immune to system clock changes
        gs_start = time.perf_counter()
        gs = rm_gridsearch(rm.estimator, rm.gs_param_grid)
        gs.fit(X_train, y_train)
        gs_time = ROUND(time.perf_counter() - gs_start)

        print(gs_time, 'Segundos')

        y_pred = gs.predict(X_train)
        gs_rmse = ROUND(np.sqrt(mean_squared_error(y_train, y_pred)))

        rows.append([rm.name,
                     # gs.best_estimator_,
                     gs.best_params_,
                     gs_time, gs_rmse])

    if not rows :
        return rm_result

    # DataFrame.append was removed in pandas 2.0; build the new rows once and concatenate
    return pd.concat(
        [rm_result, pd.DataFrame(rows, columns=rm_columns)],
        ignore_index=True
    )
    

In [14]:

# Target vector and feature matrix (every column except the target)
y_train = data[target].copy()
X_train = data.drop(columns=[target])

# Standardise the features; the scaler is fitted on the whole train set.
# The scaled values are wrapped back into a DataFrame so the column names are kept.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns.tolist())

# Empty results table, one row per evaluated model is added later
rm_columns = ['model', 'params', 'time', 'RMSE']
rm_result = pd.DataFrame(columns=rm_columns)


---

In [15]:

# Long-running cell (the recorded run took about 17 minutes in total).
# rm_evaluate returns rm_result with one row added per model, so re-running this cell
# without first re-creating the empty rm_result adds duplicate rows.
rm_result = rm_evaluate(rm_result, rm_models, X_train, y_train)

Gridsearch para Random Forest Regressor ... 931.647 Segundos
Gridsearch para Gradient Boosting Regressor ... 109.9059 Segundos


---

In [16]:
# Widen text columns so the `params` dictionaries are displayed in full
pd.set_option('display.max_colwidth', 500)
# Results ordered by ascending RMSE
rm_result.sort_values('RMSE')

Unnamed: 0,model,params,time,RMSE
0,Random Forest Regressor,"{'n_estimators': 200, 'random_state': 127}",931.647,10.1364
1,Gradient Boosting Regressor,"{'n_estimators': 100, 'random_state': 127}",109.9059,21.5453
