In [1]:
# Reload imported modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# NOTE(review): %pylab star-imports numpy and matplotlib names into the
# interactive namespace (see the message printed below); the rest of the
# notebook uses the explicit `np` / `plt` imports instead.
%pylab inline
# Default figure size as (width, height).
pylab.rcParams['figure.figsize'] = (10, 6)

Populating the interactive namespace from numpy and matplotlib


In [3]:
import time
import pandas as pd
import numpy as np
from sklearn import cross_validation as cv
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.cross_validation import KFold
from matplotlib import pyplot as plt

from joblib import Parallel, delayed

# Apply the 'ggplot' style sheet to all subsequent figures.
plt.style.use('ggplot')

In [4]:
# Load the whitespace-delimited training file (no header row).
df = pd.read_csv("./spam.train.txt", delim_whitespace=True, header=None)

# Column 0 is the target, the remaining columns are the features.
# With header=None the columns are labelled 0..n-1, so positional .iloc picks
# exactly what the deprecated .ix did; .values replaces deprecated .as_matrix().
x_data = df.iloc[:, 1:].values
y_data = df.iloc[:, 0].values

# First split: half of the rows become (x_train, y_train).
x_train, x_data2, y_train, y_data2 = cv.train_test_split(x_data, y_data,
                                                       test_size=0.5, random_state=288)

# Second split: the other half is divided 60/40 into (x_train2, y_train2)
# and the final test set (x_test, y_test).
x_train2, x_test, y_train2, y_test = cv.train_test_split(x_data2, y_data2,
                                                       test_size=0.4, random_state=288)

In [5]:
# Row counts of the three splits, in the order (train, train2, test).
tuple(len(part) for part in (y_train, y_train2, y_test))

(3546, 2128, 1419)

In [6]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import SVR

In [7]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error

In [8]:
def search_params(estimator, params, x_train, y_train, scoring="mean_squared_error", cv=5):
    """Run a cross-validated grid search and print a per-candidate report.

    Args:
        estimator: estimator instance handed to ``GridSearchCV``.
        params: parameter grid handed to ``GridSearchCV``.
        x_train: feature matrix the search is fitted on.
        y_train: targets the search is fitted on.
        scoring: scoring name forwarded to ``GridSearchCV``.
        cv: cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        A tuple ``(best_estimator, best_params, mean_scores)`` where
        ``mean_scores`` holds the mean validation score of every candidate,
        in the order of ``grid_scores_``.
    """
    grid_search = GridSearchCV(estimator, params, cv=cv, scoring=scoring, n_jobs=-1)
    grid_search.fit(x_train, y_train)

    print("Best parameters set found on development set:")
    print(grid_search.best_params_)
    print("Grid scores on development set:")
    for candidate in grid_search.grid_scores_:
        candidate_params, mean_score, fold_scores = candidate
        # Spread is reported as two standard deviations of the fold scores.
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() * 2, candidate_params))
    print()

    mean_scores = [entry.mean_validation_score for entry in grid_search.grid_scores_]
    return grid_search.best_estimator_, grid_search.best_params_, mean_scores

### GradientBoostingClassifier

##### n_estimators

In [None]:
# Sweep the number of boosting stages for gradient boosting.
n_estimators_grid = [10, 15, 30, 60, 70, 100, 200, 350]
gbc, _, gbc_scores = search_params(
    GradientBoostingRegressor(),
    {"n_estimators": n_estimators_grid},
    x_train,
    y_train,
)

In [None]:
# Mean CV score per n_estimators value. The search above fits a
# GradientBoostingRegressor scored with "mean_squared_error", so the title and
# y-axis name that estimator and metric (they previously said Classifier / f1).
plt.plot(n_estimators_grid, gbc_scores)
plt.title("GradientBoostingRegressor")
plt.ylabel("mse")
plt.xlabel("n_estimators");

##### learning_rate

In [None]:
# Sweep the learning rate for gradient boosting with 250 stages.
learning_rate_grid = [0.01, 0.05, 0.1, 0.3, 0.6, 1.0, 2.0, 6.0]
gbc, _, gbc_scores = search_params(
    GradientBoostingRegressor(n_estimators=250),
    {"learning_rate": learning_rate_grid},
    x_train,
    y_train,
)

In [None]:
# Mean CV score per learning_rate value.
# title/ylabel/xlabel must be *called*: assigning to them (as before) replaces
# the pyplot functions with strings, draws no labels and breaks every later
# plt.title(...)/plt.xlabel(...)/plt.ylabel(...) call in the session.
plt.plot(learning_rate_grid, gbc_scores)
plt.title("GradientBoostingRegressor")
plt.ylabel("mse")  # the search is scored with "mean_squared_error", not f1
plt.xlabel("learning_rate");

In [None]:
# Hand-picked gradient-boosting settings.
gbc_params = dict(n_estimators=170, learning_rate=0.6)

# Settings used for the GradientBoostingRegressor fitted later on.
gbr_params = dict(n_estimators=250, learning_rate=0.1)

### AdaBoostClassifier

##### n_estimators

In [None]:
# Sweep the number of boosting stages for AdaBoost.
n_estimators_grid = [10, 15, 30, 60, 70, 100, 200, 350]
abc, _, abc_scores = search_params(
    AdaBoostRegressor(),
    {"n_estimators": n_estimators_grid},
    x_train,
    y_train,
)

In [None]:
# Mean CV score per n_estimators value for AdaBoost.
# The title used to be copy-pasted from the gradient-boosting plot; it now
# names the estimator fitted above, and the y-axis names the actual scorer.
plt.plot(n_estimators_grid, abc_scores)
plt.title("AdaBoostRegressor")
plt.ylabel("mse")
plt.xlabel("n_estimators");

##### learning_rate

In [None]:
# Sweep the learning rate for AdaBoost with 250 stages.
# (A first grid that was immediately overwritten has been dropped; this is the
# grid that was actually searched.)
learning_rate_grid = [0.0001, 0.001, 0.01, 0.1]
abc, _, abc_scores = search_params(AdaBoostRegressor(n_estimators=250), {"learning_rate": learning_rate_grid},
                                   x_train, y_train)

In [None]:
# Mean CV score per learning_rate value for AdaBoost.
# The labels are set by calling the pyplot functions; assigning to
# plt.ylabel / plt.xlabel would overwrite them with strings.
plt.plot(learning_rate_grid, abc_scores)
plt.ylabel("mse")  # scorer is "mean_squared_error"
plt.xlabel("learning_rate");  # x values are learning rates, not n_estimators

In [None]:
# Hand-picked AdaBoost settings.
abc_params = dict(n_estimators=200, learning_rate=0.6)

# Settings used for the AdaBoostRegressor fitted later on.
abr_params = dict(n_estimators=250, learning_rate=0.01)

### Linear SVM

In [None]:
# LinearSVR was only imported in the cell *after* this one, so this cell
# raised NameError on a fresh kernel; import it where it is first used.
from sklearn.svm import LinearSVR

# Baseline: test-set MSE of a LinearSVR with default hyper-parameters.
mean_squared_error(y_test, LinearSVR().fit(x_train, y_train).predict(x_test))

In [None]:
from sklearn.svm import LinearSVR
from sklearn.grid_search import GridSearchCV

In [None]:

# Grid-search the regularisation parameter C of LinearSVR (5-fold CV).
C_grid = [0.1, 1.0, 10.0, 100.0, 300, 400, 700, 1000.0]

gsc = GridSearchCV(LinearSVR(), {"C": C_grid}, cv=5, scoring="mean_squared_error")
gsc.fit(x_train, y_train)
print("Best parameters set found on development set:")
print(gsc.best_params_)
print("Grid scores on development set:")
for params, mean_score, scores in gsc.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() * 2, params))
print()
scores = [a.mean_validation_score for a in gsc.grid_scores_]
plt.plot(C_grid, scores)
# Call the label functions; assigning to plt.ylabel / plt.xlabel would replace
# them with strings and leave the axes unlabelled.
plt.ylabel("mse")  # scorer is "mean_squared_error"
plt.xlabel("C");

In [None]:
# Grid-search LinearSVR's epsilon with C fixed at 0.1 (5-fold CV).
# NOTE: the grid variable is called gamma_grid but holds epsilon candidates.
gamma_grid = [0.001, 0.01, 0.1, 0.6, 1.0, 3.0]

gsc = GridSearchCV(LinearSVR(C=0.1), {"epsilon": gamma_grid}, cv=5, scoring="mean_squared_error")
gsc.fit(x_train, y_train)
print("Best parameters set found on development set:")
print(gsc.best_params_)
print("Grid scores on development set:")
for params, mean_score, scores in gsc.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() * 2, params))
print()
scores = [a.mean_validation_score for a in gsc.grid_scores_]
plt.plot(gamma_grid, scores)
# Call the label functions instead of overwriting them with strings.
plt.ylabel("mse")  # scorer is "mean_squared_error"
plt.xlabel("epsilon");  # the swept parameter is epsilon, not C

In [None]:
# LinearSVR settings used by the evaluation cell that follows.
svr_params = dict(C=0.1, epsilon=0.1)

In [None]:
# Test-set MSE of a LinearSVR fitted on the first training split.
mean_squared_error(
    y_true=y_test,
    y_pred=LinearSVR(**svr_params).fit(x_train, y_train).predict(x_test),
)

### RBF SVM

In [None]:
# Grid-search C for the default-kernel SVR and plot the mean CV scores.
C_grid = [0.1, 1.0, 10.0, 100.0, 300, 400, 700, 1000.0]
svc, _, svc_scores = search_params(SVR(), {"C": C_grid},
                                   x_train, y_train)
plt.plot(C_grid, svc_scores)
# Call the label functions; assigning to them would overwrite pyplot's API.
plt.ylabel("mse")  # search_params scores with "mean_squared_error" by default
plt.xlabel("C");

In [None]:
# Grid-search gamma for SVR with C fixed at 300 and plot the mean CV scores.
gamma_grid = [0.001, 0.01, 0.1, 0.6, 1.0, 3.0]
svc, _, svc_scores = search_params(SVR(C=300), {"gamma": gamma_grid},
                                   x_train, y_train)
plt.plot(gamma_grid, svc_scores)
# Call the label functions; assigning to them would overwrite pyplot's API.
plt.ylabel("mse")  # search_params scores with "mean_squared_error" by default
plt.xlabel("gamma");

In [None]:
# Hand-picked C / gamma pair kept under the svc_* name.
svc_params = dict(C=800, gamma=1.0)

In [None]:
# Rebinds svr_params (previously the LinearSVR settings) to the C / gamma
# pair passed to SVR further down.
svr_params = dict(C=300, gamma=0.6)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error

In [None]:
# Fit gradient boosting on the first training split and report its test MSE.
gbc = GradientBoostingRegressor(**gbr_params)
gbc.fit(x_train, y_train)
np.abs(mean_squared_error(y_true=y_test, y_pred=gbc.predict(x_test)))

In [None]:
# Fit AdaBoost on the first training split and report its test MSE.
abc = AdaBoostRegressor(**abr_params)
abc.fit(x_train, y_train)
np.abs(mean_squared_error(y_true=y_test, y_pred=abc.predict(x_test)))

In [None]:
# Fit the kernel SVR on the first training split and report its test MSE.
svc = SVR(**svr_params)
svc.fit(x_train, y_train)
np.abs(mean_squared_error(y_true=y_test, y_pred=svc.predict(x_test)))

### Weighted Voting

In [None]:
# Predictions of the three fitted base models on the second training split;
# these become the input columns of the combiner.
x_gbc, x_abc, x_svc = (model.predict(x_train2) for model in (gbc, abc, svc))

In [None]:
# Preview the stacked predictions, one column per base model.
# ndarray has no `.t` attribute; the transpose is `.T`.
np.vstack([x_gbc, x_abc, x_svc]).T

In [None]:
# One row per sample of x_train2, one column per base model's prediction.
x_composition = np.column_stack([x_gbc, x_abc, x_svc])

In [None]:
# Sanity check: the combiner's inputs and targets should have matching row counts.
tuple(arr.shape for arr in (x_composition, y_train2))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [None]:
# L1-penalised linear SVC whose coefficients are later turned into voting
# weights (despite the lr_ prefix this is a LinearSVC).
lr_l1 = LinearSVC(dual=False, penalty='l1')

In [None]:
# Fit the combiner on the base models' predictions for the second split.
lr_l1.fit(X=x_composition, y=y_train2)

In [None]:
# Rescale the combiner's coefficients so that they sum to 1.
coef_total = lr_l1.coef_.sum()
weights = np.array(lr_l1.coef_ / coef_total)

In [None]:
# The `predict` helper is only defined in a later cell, so calling it here
# raised NameError on a fresh kernel. Compute the same weighted sum inline:
# stack each base model's test predictions as columns, scale every column by
# its weight and add the columns up per sample.
base_test_predictions = np.vstack([est.predict(x_test) for est in (gbc, abc, svc)]).T
y_pred = np.sum(base_test_predictions * weights, axis=1).ravel()

In [None]:
# Test-set MSE of the weighted-voting prediction.
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.svm import LinearSVC, LinearSVR

In [None]:
def predict(estimators, weights, x):
    """Combine the predictions of several estimators as a weighted sum.

    Args:
        estimators: iterable of objects exposing ``predict(x)``.
        weights: per-estimator weights, broadcast against the
            (n_samples, n_estimators) matrix of predictions.
        x: input passed unchanged to every estimator's ``predict``.

    Returns:
        Flat array with one weighted-sum prediction per sample.
    """
    per_model = [model.predict(x) for model in estimators]
    # Rows = samples, columns = estimators.
    stacked = np.vstack(per_model).T
    weighted = stacked * weights
    return np.sum(weighted, axis=1).ravel()

In [None]:
# LinearSVR settings for the composition experiment below.
svr_params2 = dict(C=0.1, epsilon=0.1)

In [None]:
# Linear SVR base model, fitted once and reused by learn_composition.
# (A kernel SVR built from svr_params was the previous choice here.)
svr = LinearSVR(**svr_params2)
svr.fit(x_train, y_train)
# Its predictions on the second split form one input column of the combiner.
x_svr = svr.predict(x_train2)

# L1-penalised linear SVC used as the combiner inside learn_composition.
lsvc = LinearSVC(dual=False, penalty='l1')

def learn_composition(n_estimators):
    """Fit a weighted composition of boosters plus the shared SVR and score it.

    Reads the notebook globals x_train, y_train, x_train2, y_train2, x_test,
    y_test, svr, x_svr and lsvc, and refits lsvc as a side effect.

    Args:
        n_estimators: number of estimators for both boosting regressors.

    Returns:
        Mean squared error of the weighted prediction on the test split.
    """
    boosters = [
        GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=0.1),
        AdaBoostRegressor(n_estimators=n_estimators, learning_rate=0.01),
    ]
    for booster in boosters:
        booster.fit(x_train, y_train)

    # Columns: gradient boosting, AdaBoost, then the pre-computed SVR output.
    holdout_columns = [booster.predict(x_train2) for booster in boosters]
    holdout_columns.append(x_svr)
    x_comp = np.vstack(holdout_columns).T

    # The combiner's coefficients, rescaled to sum to 1, are the voting weights.
    lsvc.fit(x_comp, y_train2)
    weights = np.array(lsvc.coef_ / np.sum(lsvc.coef_))

    y_pred = predict(boosters + [svr], weights, x_test)
    return mean_squared_error(y_test, y_pred)
    

In [None]:
# Score the composition for each ensemble size, using 8 joblib workers.
n_estimators_grid = [2, 10, 50, 100, 150, 200, 250, 300]
scores = Parallel(n_jobs=8)(
    delayed(learn_composition)(size) for size in n_estimators_grid
)

In [None]:
# Test MSE of the composition per ensemble size.
# The labels are set by calling the pyplot functions; assigning to
# plt.ylabel / plt.xlabel would overwrite them with strings.
plt.plot(n_estimators_grid, scores)
plt.ylabel("mse")
plt.xlabel("n_estimators_grid");

In [None]:
def score_gbr(n_estimators):
    """Return the test MSE of a GradientBoostingRegressor of the given size.

    Fits on the notebook globals (x_train, y_train) with learning_rate=0.1 and
    evaluates on (x_test, y_test).
    """
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=0.1)
    model.fit(x_train, y_train)
    return mean_squared_error(y_test, model.predict(x_test))

def score_abr(n_estimators):
    """Return the test MSE of an AdaBoostRegressor of the given size.

    Fits on the notebook globals (x_train, y_train) with learning_rate=0.01 and
    evaluates on (x_test, y_test).
    """
    model = AdaBoostRegressor(n_estimators=n_estimators, learning_rate=0.01)
    model.fit(x_train, y_train)
    return mean_squared_error(y_test, model.predict(x_test))

In [None]:
# Stand-alone gradient boosting test MSE for every ensemble size.
gbr_scores = Parallel(n_jobs=-1)(
    delayed(score_gbr)(size) for size in n_estimators_grid
)

In [None]:
# Stand-alone AdaBoost test MSE for every ensemble size.
abr_scores = Parallel(n_jobs=-1)(
    delayed(score_abr)(size) for size in n_estimators_grid
)

In [None]:
# Test MSE of the shared SVR on its own (drawn as a flat reference line below).
svr_mse = mean_squared_error(y_true=y_test, y_pred=svr.predict(x_test))

In [None]:
# Compare the composition with each base model across ensemble sizes.
plt.figure(figsize=(12,10))
plt.plot(n_estimators_grid, scores, 'g', label="Composition")
plt.plot(n_estimators_grid, gbr_scores, 'b', label="GradientBoosting")
plt.plot(n_estimators_grid, abr_scores, 'r', label="AdaBoost")
# The SVR does not depend on n_estimators, so it is a horizontal line.
plt.plot(n_estimators_grid, [svr_mse]*len(n_estimators_grid), 'black', label="SVM")
# Call the label functions; assigning to plt.ylabel / plt.xlabel would replace
# them with strings and leave the axes unlabelled.
plt.ylabel("mse")
plt.xlabel("n_estimators_grid")
plt.legend(loc='upper right');

In [None]:
from src.my_composition import MyComposition

In [None]:
# Fit the project's MyComposition on both training splits and report its
# test-set MSE.
my = MyComposition()
my.fit(x_train, y_train, x_train2, y_train2)
mean_squared_error(y_true=y_test, y_pred=my.predict(x_test))

In [None]:
# Inspect the third fitted estimator held by the composition.
# NOTE(review): presumably a tree-based model, since it exposes
# feature_importances_ — confirm against src/my_composition.py.
tmp2 = my.estimators_[2]
# Number of entries in its feature-importance vector.
len(tmp2.feature_importances_)

In [None]:
def predict_on_fsubset(features, x_train, y_train, x_train2, y_train2, x_test, y_test):
    """Fit and score a MyComposition restricted to the given feature columns.

    Args:
        features: column selector applied to every feature matrix.
        x_train, y_train: first training split.
        x_train2, y_train2: second training split.
        x_test, y_test: evaluation split.

    Returns:
        ``(fitted_model, test_mse, elapsed_seconds)`` where the elapsed time
        covers fitting, predicting and scoring.
    """
    started_at = time.time()

    model = MyComposition()
    model.fit(x_train[:, features], y_train, x_train2[:, features], y_train2)
    score = mean_squared_error(y_test, model.predict(x_test[:, features]))

    elapsed = time.time() - started_at
    return model, score, elapsed

In [None]:
def fset2scores(fset, x_train, x_test, y_train, y_test):
    """Score MyComposition on every growing prefix of an ordered feature set.

    For i = 1..len(fset) the first i entries of ``fset`` are used as the
    feature subset and evaluated in parallel with ``predict_on_fsubset``.

    Note: the second training split is NOT a parameter; it is read from the
    notebook globals ``x_train2`` and ``y_train2``.

    Args:
        fset: ordered sequence of feature column indices.
        x_train, y_train: first training split.
        x_test, y_test: evaluation split.

    Returns:
        Three tuples ``(clfs, scores, times)``, one entry per prefix. All three
        are empty when ``fset`` is empty.
    """
    fpacks = [fset[:i] for i in range(1, len(fset) + 1)]
    if not fpacks:
        # zip(*[]) yields nothing, so the three-way unpack below would raise
        # ValueError; return empty results instead.
        return (), (), ()

    res = Parallel(n_jobs=-1)(
        delayed(predict_on_fsubset)(pack, x_train, y_train, x_train2, y_train2, x_test, y_test)
        for pack in fpacks
    )

    clfs, scores, times = zip(*res)
    return clfs, scores, times

In [None]:
from src.cfs import cfs, cfs1

In [None]:
# Feature set from the project's cfs1 helper; the third argument is the number
# of columns in x_train.
# NOTE(review): presumably an ordered list of column indices — confirm in src/cfs.py.
cfs1_features = cfs1(x_train, y_train, x_train.shape[1])

In [None]:
# Score every prefix of the cfs1 feature set; the fitted models are discarded.
_, cfs1_scores, cfs1_times = fset2scores(cfs1_features, x_train, x_test, y_train, y_test)

In [None]:
# Feature set from the project's cfs helper; the third argument is the number
# of columns in x_train.
# NOTE(review): presumably an ordered list of column indices — confirm in src/cfs.py.
cfs_features = cfs(x_train, y_train, x_train.shape[1])

In [None]:
# Score every prefix of the cfs feature set; the fitted models are discarded.
_, cfs_scores, cfs_times = fset2scores(cfs_features, x_train, x_test, y_train, y_test)

In [None]:
# Test MSE as a function of how many selected features are used.
plt.figure(figsize=(15, 15))
# fset2scores evaluates prefixes of length 1..len(features), so the i-th score
# belongs to i+1 features; the x values start at 1 (they used to start at 0,
# shifting every point one feature to the left).
plt.plot(range(1, len(cfs_features) + 1), cfs_scores, 'r', label="cfs")
plt.plot(range(1, len(cfs1_features) + 1), cfs1_scores, 'y', label="cfs1")

plt.title('Methods Score')

plt.xlabel('n_features')
plt.ylabel('mse')
plt.legend(loc='lower right');

In [None]:
from src.my_composition import MyComposition
# Fit a composition on column 34 only (kept 2-D by indexing with a list).
# NOTE(review): this relies on MyComposition.fit returning the fitted object;
# if it returns None, `my` will be None — confirm in src/my_composition.py.
my = MyComposition().fit(x_train[:, [34]], y_train, x_train2[:, [34]], y_train2)

In [None]:
# Predictions of the single-feature composition on the test split.
my.predict(x_test[:, [34]])