In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC, SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, DistanceMetric

In [3]:
df = pd.read_csv("/home/oscar/Downloads/OnlineNewsPopularity.csv").sample(frac = 0.1).reset_index(drop = True)

In [4]:
df.columns = [x.strip() for x in df.columns]

In [5]:
df["success"] = (df["shares"] > df["shares"].quantile(.9))*1

In [10]:
ls_cont = ['n_tokens_title', 'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words', 
           'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos', 
           'average_token_length', 'num_keywords', 'kw_min_min', 'kw_max_min', 'kw_avg_min', 
           'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'kw_avg_avg', 
           'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess', 
           'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity', 
           'global_sentiment_polarity', 'global_rate_positive_words', 'global_rate_negative_words',
           'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
           'max_positive_polarity', 'avg_negative_polarity', 'min_negative_polarity', 
           'max_negative_polarity', 'title_subjectivity', 'title_sentiment_polarity', 
           'abs_title_subjectivity', 'abs_title_sentiment_polarity']
target = "shares"
target_disc = "success"

In [11]:
X = df[ls_cont]
yr = df[target]
yc = df[target_disc]

## Análisis Discriminante 

In [12]:
lda = LinearDiscriminantAnalysis()

In [13]:
lda.fit(X, yc)



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [14]:
ls_res = cross_val_score(estimator = lda, X=X, y=yc, cv=4, n_jobs=-1, scoring="roc_auc")

In [15]:
np.mean(ls_res), np.std(ls_res)

(0.6593938261539158, 0.018113985763204887)

In [20]:
LinearDiscriminantAnalysis?

In [16]:
lda.get_params()

{'n_components': None,
 'priors': None,
 'shrinkage': None,
 'solver': 'svd',
 'store_covariance': False,
 'tol': 0.0001}

In [18]:
param_grid = {"solver": ["svd", "lsqr", "eigen"],
              "shrinkage": [None, "auto"] + [x/10 for x in range(10)]}

In [19]:
param_grid

{'solver': ['svd', 'lsqr', 'eigen'],
 'shrinkage': [None, 'auto', 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}

In [22]:
rs = GridSearchCV(cv=4, error_score=-1000, estimator=lda, n_jobs=-1, scoring="roc_auc", param_grid=param_grid, verbose=True)

In [23]:
%%time
rs.fit(X=X, y=yc)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.6s


CPU times: user 258 ms, sys: 54.3 ms, total: 313 ms
Wall time: 5.8 s


[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:    5.7s finished


GridSearchCV(cv=4, error_score=-1000,
       estimator=LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'solver': ['svd', 'lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=True)

In [24]:
rs.best_score_

0.6602516193323368

## Regresión Cresta Kernel

In [25]:
krr = KernelRidge()

In [26]:
%%time
krr.fit(X=X, y = yr)

CPU times: user 5.59 s, sys: 1.06 s, total: 6.65 s
Wall time: 2.47 s


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number4.111161e-17
  overwrite_a=False)


KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [27]:
ls_res = cross_val_score(estimator = krr, X=X, y=yr, cv=4, n_jobs=-1, scoring="r2")

In [28]:
np.mean(ls_res), np.std(ls_res)

(-0.19042384748129637, 0.19196780256081886)

In [48]:
KernelRidge?

In [31]:
param_grid = {"alpha": [x/10 for x in range(10)] + [x*10 for x in range(10)],
              "kernel": ["linear", np.sin]}

In [32]:
param_grid

{'alpha': [0.0,
  0.1,
  0.2,
  0.3,
  0.4,
  0.5,
  0.6,
  0.7,
  0.8,
  0.9,
  0,
  10,
  20,
  30,
  40,
  50,
  60,
  70,
  80,
  90],
 'kernel': ['linear', <ufunc 'sin'>]}

In [34]:
rs = GridSearchCV(cv=4, error_score=-1000, estimator=krr, n_jobs=-1, scoring="r2", param_grid=param_grid, verbose=True)

In [35]:
%%time
rs.fit(X, yr)

Fitting 4 folds for each of 40 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  3.2min finished


CPU times: user 5.79 s, sys: 520 ms, total: 6.31 s
Wall time: 3min 16s


GridSearchCV(cv=4, error_score=-1000,
       estimator=KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90], 'kernel': ['linear', <ufunc 'sin'>]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=True)

In [36]:
rs.best_score_

-0.181364895848203

In [39]:
rs.cv_results_



{'mean_fit_time': array([6.46761880e+01, 4.23079729e-03, 3.13126093e+00, 7.84343481e-03,
        2.92090482e+00, 9.88078117e-03, 3.07459784e+00, 6.85167313e-03,
        2.88706124e+00, 1.33948326e-02, 2.94100690e+00, 6.05779886e-03,
        2.94278944e+00, 8.53347778e-03, 2.93894809e+00, 6.64395094e-03,
        3.42075068e+00, 1.40696764e-02, 4.24858004e+00, 1.01991296e-02,
        6.53092871e+01, 2.24252939e-02, 3.12547243e+00, 2.68763304e-03,
        2.92345989e+00, 1.00499988e-02, 3.18445009e+00, 7.70258904e-03,
        3.00466913e+00, 6.56622648e-03, 2.92380363e+00, 9.64224339e-03,
        2.86147428e+00, 1.58238411e-02, 3.04034972e+00, 5.40673733e-03,
        3.09952956e+00, 6.85232878e-03, 2.74685270e+00, 1.54141784e-02]),
 'std_fit_time': array([5.39287141e-01, 2.71369369e-03, 1.52286247e-01, 6.96274110e-03,
        2.37340573e-01, 7.04333009e-03, 1.28884013e-01, 5.47111229e-03,
        6.58061255e-02, 1.07670429e-02, 2.38208074e-01, 4.89664590e-03,
        1.20427547e-01, 7.270

In [38]:
rs.best_estimator_

KernelRidge(alpha=90, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

## Support Vector Machine (Clasificación)

In [41]:
svm = SVC()

In [42]:
svm.fit(X, yc)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [43]:
ls_res = cross_val_score(estimator = svm, X=X, y=yc, cv=4, n_jobs=-1, scoring="roc_auc")

In [44]:
np.mean(ls_res), np.std(ls_res)

(0.5002802690582959, 0.0002802690582959233)

In [64]:
param_grid = {"C": [x/10 for x in range(10)] + [x*10 for x in range(10)],
              "kernel": ['linear', 'poly', 'rbf', 'sigmoid']}

In [76]:
# rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=svm, n_jobs=-1, scoring="roc_auc", param_distributions=param_grid, verbose=True, n_iter=2)

In [78]:
# %%time
# rs.fit(X, yc)

## Support Vector Machine (Regresión)

In [45]:
svm = SVR()

In [46]:
svm.fit(X, yr)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
ls_res = cross_val_score(estimator = svm, X=X, y=yc, cv=4, n_jobs=-1, scoring="r2")

In [48]:
np.mean(ls_res), np.std(ls_res)

(-0.07202639761166357, 0.012458620898133172)

In [49]:
SVR?

In [50]:
param_grid = {"C": [x/10 for x in range(10)] + [x*10 for x in range(10)],
              "epsilon": [x/10 for x in range(10)] + [x*10 for x in range(10)]}

In [51]:
rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=svm, n_jobs=-1, scoring="r2", param_distributions=param_grid, verbose=True, n_iter=20)

In [52]:
%%time
rs.fit(X, yr)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   40.9s finished


CPU times: user 1.33 s, sys: 23.8 ms, total: 1.36 s
Wall time: 42.1 s


RandomizedSearchCV(cv=4, error_score=-1000,
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=20, n_jobs=-1,
          param_distributions={'C': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90], 'epsilon': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='r2', verbose=True)

In [53]:
rs.best_score_

-0.06276670295248538

## K-Vecinos (clasificación)

In [54]:
knn = KNeighborsClassifier()

In [55]:
knn.fit(X, yc)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [56]:
ls_res = cross_val_score(estimator = knn, X=X, y=yc, cv=4, n_jobs=-1, scoring="roc_auc")

In [57]:
np.mean(ls_res), np.std(ls_res)

(0.5663331747973004, 0.023960555265195912)

In [58]:
KNeighborsClassifier?

In [59]:
DistanceMetric?

In [63]:
param_grid = {"n_neighbors": range(100),
              "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
              "metric": ["euclidean", "manhattan", "chebyshev", "minkowski", "seuclidean", "mahalanobis", ]}

In [64]:
rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=knn, n_jobs=-1, scoring="roc_auc", param_distributions=param_grid, verbose=True, n_iter=20)

In [65]:
%%time
rs.fit(X, yc)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.0s


CPU times: user 157 ms, sys: 30.7 ms, total: 188 ms
Wall time: 48.1 s


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   48.0s finished


RandomizedSearchCV(cv=4, error_score=-1000,
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid='warn', n_iter=20, n_jobs=-1,
          param_distributions={'n_neighbors': range(0, 100), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'seuclidean', 'mahalanobis']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=True)

In [66]:
rs.best_score_

0.6456238392897585

In [67]:
rs.best_estimator_

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='seuclidean',
           metric_params=None, n_jobs=None, n_neighbors=78, p=2,
           weights='uniform')

## K-Vecinos (Regresión) 

In [68]:
knn = KNeighborsRegressor()

In [69]:
knn.fit(X, yr)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [70]:
ls_res = cross_val_score(estimator = knn, X=X, y=yc, cv=4, n_jobs=-1, scoring="r2")

In [71]:
np.mean(ls_res), np.std(ls_res)

(-0.1386056917314893, 0.040018434569156866)

In [73]:
param_grid = {"n_neighbors": range(100),
              "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
              "metric": ["euclidean", "manhattan", "chebyshev", "minkowski", "seuclidean", "mahalanobis", ]}

In [74]:
rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=knn, n_jobs=-1, scoring="r2", param_distributions=param_grid, verbose=True, n_iter=100)

In [75]:
%%time
rs.fit(X, yr)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


CPU times: user 156 ms, sys: 21.2 ms, total: 177 ms
Wall time: 1min 37s


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  1.6min finished


RandomizedSearchCV(cv=4, error_score=-1000,
          estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_neighbors': range(0, 100), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'seuclidean', 'mahalanobis']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='r2', verbose=True)

In [76]:
rs.best_score_

0.004767815612115035

In [77]:
rs.best_estimator_

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='seuclidean',
          metric_params=None, n_jobs=None, n_neighbors=83, p=2,
          weights='uniform')