# Métodos kernel escalables

### 1. Carga los datos usando la función "fetch_openml".

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml

house = fetch_openml(name='house_16H')# , version=4)

  " {version}.".format(name=name, version=res[0]['version']))


### 2. Preprocesa los datos y realiza alguna visualización.

In [2]:
df = pd.DataFrame(data = house.data,    # values
                  columns = house.feature_names)  # 1st row as the column names

df['price'] = house.target

df.head()

Unnamed: 0,P1,P5p1,P6p2,P11p4,P14p9,P15p1,P15p3,P16p2,P18p2,P27p4,H2p2,H8p2,H10p1,H13p1,H18pA,H40p4,price
0,15512.0,0.460869,0.049252,0.22647,0.149827,0.752837,0.010057,0.579729,0.003251,0.075912,0.625318,0.036613,0.991377,0.260116,0.052246,0.774059,130600.0
1,1550.0,0.470968,0.002581,0.137419,0.096341,0.862581,0.0,0.695142,0.005025,0.043551,0.064263,0.00335,0.994975,0.285266,0.060606,0.142857,40500.0
2,4741.0,0.485341,0.000211,0.189412,0.135656,0.856992,0.0,0.683584,0.004143,0.027965,0.065796,0.0,0.997411,0.315433,0.065116,0.6875,28700.0
3,467.0,0.498929,0.0,0.100642,0.08547,0.907923,0.0,0.780488,0.006098,0.018293,0.057471,0.0,1.0,0.149425,0.139535,1.0,28500.0
4,310.0,0.474193,0.680645,0.225806,0.128834,0.896774,0.0,0.756302,0.008403,0.016807,0.077519,0.672269,0.991597,0.147287,0.0,0.0,24100.0


In [3]:
# load Plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go

# initiate the Plotly Notebook mode
init_notebook_mode(connected=True)

In [4]:
# define a function for 3D plotting using Plotly
def plot_3D(X, y):
    
    trace0 = go.Scatter3d(x = X.iloc[:,0], 
                          y = X.iloc[:,1], 
                          z = y,
                          mode = 'markers',
                          marker = dict(size=6, color='blue', opacity=0.8)
    )

    # set aspect ratio
    scene = dict(aspectmode="manual", aspectratio=dict(x = 1, y = 1, z = 1))

    # define figure properties
    layout = go.Layout(
        scene = scene,
        height = 600,
        width = 900
    )

    # produce the plot
    fig = go.Figure(data = trace0,
                    layout = layout)
    iplot(fig)

In [5]:
# plot the data in 3D

nEvents = 500
if nEvents > df.shape[0]: 
    nEvents = df.shape[0]

Phi = df.iloc[0:nEvents,[2,6]]

y = df.iloc[0:nEvents, -1]

plot_3D(Phi,y)

### 3. Separa los datos en train (2/3) y test (1/3).

In [6]:
from sklearn.model_selection import train_test_split

# test sample gets 1/3 of the total events
test_size = 1/3

# Split between train and test
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:,0:-1], df.iloc[:,-1], test_size=test_size, random_state=0)

In [8]:
X_train.head()

Unnamed: 0,P1,P5p1,P6p2,P11p4,P14p9,P15p1,P15p3,P16p2,P18p2,P27p4,H2p2,H8p2,H10p1,H13p1,H18pA,H40p4
12646,4591.0,0.483119,0.003267,0.156611,0.126844,0.857983,0.041168,0.755189,0.003663,0.031135,0.07874,0.002442,0.95116,0.288526,0.146893,0.53125
8704,1458.0,0.484911,0.038409,0.137174,0.139814,0.898491,0.0,0.756007,0.0,0.027726,0.095318,0.038817,0.985213,0.299331,0.115385,0.5
17209,86874.0,0.489076,0.02232,0.103875,0.065719,0.806789,0.00655,0.65368,0.00316,0.086559,0.044672,0.020304,0.980786,0.32978,0.048929,0.164706
16348,20.0,0.6,0.0,0.25,0.375,0.65,0.0,0.363636,0.0,0.0,0.3125,0.0,1.0,0.3125,0.0,0.0
12402,1214.0,0.470346,0.001647,0.121911,0.083981,0.867381,0.0,0.681818,0.002273,0.047727,0.041394,0.0,0.995455,0.309368,0.148387,0.0


### 4. Entrena un regresor lineal para predecir la variable target (el precio) a partir de las 16 características de entrada. Calcula el Mean Absolute Error (MAE) sobre el conjunto de test. Debería ser alrededor de 25000 EUR.

In [9]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model
regr.fit(X_train, y_train)

# Obtain predictions corresponding to X_test
y_pred = regr.predict(X_test)

# Compute the Mean Absolute Error
MAE = mean_absolute_error(y_test, y_pred)
print(MAE)

25453.41193386269


### 5. Ahora entrena el método kernel ridge regression usando la aproximación Nyström o RFF y reporta su MAE sobre el conjunto de test. 

### Ojo: el regresor en este caso consiste en un Pipeline que contiene la aproximación seguida por una regresión Ridge (lineal); No contiene el método KernelRidge explícitamente. Para encontrar parámetros adecuados puedes usar el método GridSearchCV.

In [22]:
from sklearn.kernel_approximation import Nystroem
from sklearn import pipeline
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

# create a pipeline from kernel approximation and linear svm (Nyström)
feature_map_nystroem = Nystroem(random_state=0) #gamma=gamma, n_components=n_components, random_state=0)
nystroem_approx_krr = pipeline.Pipeline([("feature_map", feature_map_nystroem),
                                         ("ridge", LinearRegression())])

print(nystroem_approx_krr.get_params().keys())
print("-------------------------------------")
print("")

# Define the grid
param_grid = {'feature_map__gamma': [0.00001, 0.0001, 0.0005, 0.001, 0.005, 0.01],
              'feature_map__n_components': [10, 20, 50, 100, 500, 1000],}


# Get the best parameters and train the model
grid = GridSearchCV(nystroem_approx_krr, param_grid, cv = 3)
grid.fit(X_train, y_train)
nystroem_approx_krr = grid.best_estimator_
print(grid.best_params_)
print("-------------------------------------")
print("")

# Obtain predictions corresponding to X_test
y_pred = nystroem_approx_krr.predict(X_test)

# Compute the Mean Absolute Error
MAE = mean_absolute_error(y_test, y_pred)
print(MAE)

dict_keys(['memory', 'steps', 'verbose', 'feature_map', 'ridge', 'feature_map__coef0', 'feature_map__degree', 'feature_map__gamma', 'feature_map__kernel', 'feature_map__kernel_params', 'feature_map__n_components', 'feature_map__random_state', 'ridge__copy_X', 'ridge__fit_intercept', 'ridge__n_jobs', 'ridge__normalize'])
-------------------------------------
{'feature_map__gamma': 1e-05, 'feature_map__n_components': 500}
-------------------------------------
25069.314692500626
