In [None]:
# Imports of all necessary Python packages
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as skl


# Regressors

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import TheilSenRegressor

from sklearn.isotonic import IsotonicRegression

from sklearn.kernel_ridge import KernelRidge

from sklearn.svm import SVR

from sklearn.neighbors import KNeighborsRegressor

from sklearn.gaussian_process import GaussianProcess

# Metrics / model selection

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

# GridSearchCV lives in sklearn.model_selection from scikit-learn 0.18 on;
# older releases only provide it in sklearn.grid_search.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Pipeline
from sklearn.pipeline import Pipeline

#Decomposition
from sklearn.decomposition import PCA

In [None]:
# Load the data the regressors are fitted on.
# Placeholder: uniform random features and targets so the notebook runs end to
# end; replace with the real data set. A dedicated, seeded RandomState makes
# every run reproducible without touching NumPy's global random state.
rng = np.random.RandomState(42)
data = rng.rand(100, 5)   # feature matrix: 100 samples x 5 features
labels = rng.rand(100)    # 1-D target vector, one value per sample

In [None]:
# Feature extraction: reduce the feature matrix to a fixed number of
# principal components with PCA.
N_PCA_COMPONENTS = 2
pca = PCA(n_components=N_PCA_COMPONENTS)
data_decomposed = pca.fit_transform(data)

In [None]:
# Instantiate a selection of common regression models, each with its default
# hyper-parameters. IsotonicRegression is imported but not instantiated here.

# Tree-based and ensemble models
regressDTree = DecisionTreeRegressor()
regressAdaB = AdaBoostRegressor()

# Linear models
regressLR = LinearRegression()
regressRidge = RidgeCV()
regressElNet = ElasticNetCV()
regressLassoLars = LassoLarsCV()
regressARDR = ARDRegression()
regressTSR = TheilSenRegressor()

# Kernel, neighbour and Gaussian-process models
regressKRidge = KernelRidge()
regressSVM = SVR()
regressKNN = KNeighborsRegressor()
regressGP = GaussianProcess()

In [None]:
# Fit one regressor on a single train/test split (no cross-validation).

regressor = regressLassoLars

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.20, random_state=42)

t0 = time()
regressor.fit(train_data, train_labels)
# print(...) with one pre-formatted string gives the same output on Python 2
# and Python 3; the former print statement was a SyntaxError on Python 3.
# The model is a regressor, so the label no longer says "Classifier".
print("Score of Regressor: " + str(regressor.score(test_data, test_labels)))
# The reported time covers both fitting and scoring.
print("done in %0.3fs." % (time() - t0))

In [None]:
# Analyze residuals: histogram of the test-set prediction errors of the
# current `regressor`, with a fitted normal density drawn on top.

residuals = test_labels - regressor.predict(test_data)

# Keep only finite values so NaN/inf cannot break the histogram or the
# mean/std estimates below.
residuals = residuals[np.isfinite(residuals)]

# Density-normalised histogram (total area 1) so it shares a scale with the
# pdf overlay. `density=True` replaces the `normed` keyword, which current
# matplotlib releases no longer accept.
n, bins, patches = plt.hist(residuals, 20, density=True,
                            facecolor='green', alpha=0.75)

mu = np.mean(residuals)
sigma = np.std(residuals)

# 'Best fit' line: normal pdf with the sample mean/std, evaluated at the bin
# edges. It is computed with NumPy because `mlab` was never imported in this
# notebook and `mlab.normpdf` is gone from current matplotlib. The overlay is
# skipped when sigma is not positive (constant or empty residuals) to avoid a
# division by zero.
if sigma > 0:
    fit_pdf = (np.exp(-0.5 * ((bins - mu) / sigma) ** 2)
               / (sigma * np.sqrt(2 * np.pi)))
    plt.plot(bins, fit_pdf, 'r--', linewidth=1)

plt.xlabel('Residuals')
plt.ylabel('Probability Density function')
plt.title('Residuals of current regression model')
plt.grid(True)

plt.show()

In [None]:
# Compare several regressors on the PCA-reduced data using one common split.

regressor_lst = [regressAdaB, regressGP, regressKNN, regressTSR, regressARDR,
                 regressLR, regressLassoLars, regressSVM, regressKRidge,
                 regressRidge, regressElNet, regressDTree]

# NOTE: this rebinds train_data/test_data/train_labels/test_labels, replacing
# whatever split those names held before this cell ran.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_decomposed, labels, test_size=0.20, random_state=42)

for regress in regressor_lst:
    # print(...) with a single argument behaves identically on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print(regress)
    t0 = time()
    regress.fit(train_data, train_labels)
    # These are regressors, so the label no longer says "Classifier".
    print("Score of Regressor: " + str(regress.score(test_data, test_labels)))
    # The reported time covers both fitting and scoring.
    print("done in %0.3fs." % (time() - t0))

In [None]:
# Settings for the cross-validation run
regressCV = regressKNN      # estimator to evaluate
data_cv = data_decomposed   # feature matrix it is evaluated on
N_CV = 10                   # value passed as `cv` (number of folds)

# Cross-validation
t0 = time()
scores = cross_val_score(regressCV, data_cv, labels, n_jobs=-1, cv=N_CV)

# print(...) with one pre-formatted string gives the same output on Python 2
# and Python 3; the former print statements were a SyntaxError on Python 3.
print("Scores: ")
for i, score in enumerate(scores):
    print('\t' + str(i) + ':\t' + str(score))
# Mean over the folds with a +/- two-standard-deviation band. The value is the
# estimator's score on regression targets, so it is not labelled "Accuracy".
print("Mean score: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
print("\nCross val done in %0.3fs." % (time() - t0))

In [None]:
# Fine-tune with grid search: PCA followed by LassoLarsCV, chained in a
# single pipeline so both steps are tuned together.
pipeline_steps = [
    ('pca', PCA()),
    ('lassLARS', LassoLarsCV(verbose=True, n_jobs=-1)),
]
pipeline = Pipeline(pipeline_steps)

# Parameter grid for the exhaustive search. Keys are written as
# '<step name>__<parameter name>' to address a parameter of a pipeline step.
parameters = dict(
    pca__n_components=tuple(range(1, 6)),
    lassLARS__fit_intercept=(True, False),
)

grid_search = GridSearchCV(pipeline, parameters, verbose=1, n_jobs=-1)

In [None]:
# Run the grid search on the full (non-reduced) data and report the result.
t0 = time()
grid_search.fit(data, labels)
print("done in %0.3fs" % (time() - t0))
# print("") emits a blank line on Python 2 and Python 3 alike; a bare print()
# would show "()" under Python 2.
print("")

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# Report only the parameters that were part of the search grid, in sorted order.
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))