In [164]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor, RandomForestClassifier, VotingClassifier, VotingRegressor

## Datos

In [3]:
df = pd.read_csv("OnlineNewsPopularity.csv").sample(frac = 0.1).reset_index(drop = True)

In [4]:
df.columns = [x.strip() for x in df.columns]

In [5]:
df["success"] = (df["shares"] > df["shares"].quantile(.9))*1

In [10]:
ls_cont = ['n_tokens_title', 'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words', 
           'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos', 
           'average_token_length', 'num_keywords', 'kw_min_min', 'kw_max_min', 'kw_avg_min', 
           'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'kw_avg_avg', 
           'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess', 
           'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity', 
           'global_sentiment_polarity', 'global_rate_positive_words', 'global_rate_negative_words',
           'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
           'max_positive_polarity', 'avg_negative_polarity', 'min_negative_polarity', 
           'max_negative_polarity', 'title_subjectivity', 'title_sentiment_polarity', 
           'abs_title_subjectivity', 'abs_title_sentiment_polarity']
target = "shares"
target_disc = "success"

In [22]:
df = df[df[target] <= 32000].reset_index(drop = True)

In [23]:
X = df[ls_cont]
yr = df[target]
yc = df[target_disc]

## Bosque Aleatorio

In [36]:
bos = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_samples_leaf=0.05)

In [37]:
ls_score = cross_val_score(estimator=bos, X = X, y=yc, n_jobs=-1, scoring="roc_auc", cv = 4)

In [38]:
np.mean(ls_score), np.std(ls_score)

(0.6928196851118734, 0.022353961886567655)

In [39]:
bos = RandomForestRegressor(n_estimators=100, n_jobs=-1, min_samples_leaf=0.05)

In [40]:
ls_score = cross_val_score(estimator=bos, X = X, y=yr, n_jobs=-1, scoring="r2", cv = 4)

In [41]:
np.mean(ls_score), np.std(ls_score)

(0.06756483538999083, 0.013408170088452381)

## AdaBoost

In [57]:
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.05)

In [58]:
ls_score = cross_val_score(estimator=ada, X = X, y=yc, n_jobs=-1, scoring="roc_auc", cv = 4)

In [59]:
np.mean(ls_score), np.std(ls_score)

(0.66965455343284, 0.006980603062856358)

In [66]:
ada = AdaBoostRegressor(n_estimators=100, learning_rate=0.0005)

In [67]:
ls_score = cross_val_score(estimator=ada, X = X, y=yr, n_jobs=-1, scoring="r2", cv = 4)

In [68]:
np.mean(ls_score), np.std(ls_score)

(0.054402384635942574, 0.010808752130748727)

## Gradient Boosting

In [69]:
GradientBoostingClassifier?

In [70]:
gb = GradientBoostingClassifier(learning_rate=0.05, min_samples_leaf=0.05)

In [71]:
ls_score = cross_val_score(estimator=gb, X = X, y=yc, n_jobs=-1, scoring="roc_auc", cv = 4)

In [72]:
np.mean(ls_score), np.std(ls_score)

(0.6771029811941018, 0.0263389424240537)

In [73]:
gb = GradientBoostingRegressor(learning_rate=0.05, min_samples_leaf=0.05)

In [76]:
ls_score = cross_val_score(estimator=gb, X = X, y=yr, n_jobs=-1, scoring="r2", cv = 4)

In [77]:
np.mean(ls_score), np.std(ls_score)

(0.06996605488490032, 0.006828868008042681)

## Voting

In [78]:
VotingClassifier?

In [113]:
naiveb = GaussianNB()
logreg = LogisticRegression()
dctree = DecisionTreeClassifier()

In [114]:
vc = VotingClassifier(estimators=[("logreg", logreg), ("arbol", dctree), ("bayes", naiveb)], voting='soft')

In [115]:
ls_score = cross_val_score(estimator=vc, X = X, y=yc, n_jobs=-1, scoring="roc_auc", cv = 4)

In [116]:
np.mean(ls_score), np.std(ls_score)

(0.6345477879821149, 0.01883613122708365)

In [160]:
ridger = Ridge(alpha=0.01)
linreg = LinearRegression()
dctree = DecisionTreeRegressor()

In [161]:
vc = VotingRegressor(estimators=[("linreg", linreg), ("arbol", dctree), ("ridge", ridger)])

In [162]:
ls_score = cross_val_score(estimator=vc, X = X, y=yr, n_jobs=-1, scoring="r2", cv = 4)

In [163]:
np.mean(ls_score), np.std(ls_score)

(-0.043148270455015125, 0.010034290516192436)

## XGBoost

In [169]:
xgb = XGBClassifier(learning_rate=0.1, n_estimators=190, max_depth=5, min_child_weight=2, objective="binary:logistic", subsample=0.9, colsample_bytree=0.8, seed=23333)

In [170]:
ls_score = cross_val_score(estimator=xgb, X = X, y=yc, n_jobs=-1, scoring="roc_auc", cv = 4)

In [171]:
np.mean(ls_score), np.std(ls_score)

(0.6447178972304166, 0.012616516865521576)

In [179]:
XGBRegressor?

In [213]:
xgb = XGBRegressor(n_estimators=100)

In [214]:
ls_score = cross_val_score(estimator=xgb, X = X, y=yr, n_jobs=-1, scoring="r2", cv = 4)

In [215]:
np.mean(ls_score), np.std(ls_score)

(0.017430831176658085, 0.02984098529976593)