# Test of our results

In this file, we demonstrate how well our model performs compared to the baseline, and we search for the best parameters for our preprocessing algorithm.

In [3]:
import pandas as pd
from pipeline import get_full_pipeline_with_model
from baseline_pipeline import get_baseline_full_pipeline
from sklearn.model_selection import train_test_split

# Load the car listings; the 'price' column is the regression target.
data = pd.read_csv('CarsData.csv')

# Build the two competing pipelines from the same frame and target name.
full_pipeline_with_model = get_full_pipeline_with_model(data, 'price')
baseline_full_pipeline = get_baseline_full_pipeline(data, 'price')

# Separate the features from the target.
X = data.drop(columns='price')
y = data['price']

# Hold out 10% of the rows for the final comparison; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

Here we search for the best parameters:

In [4]:
from sklearn.model_selection import GridSearchCV
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# not only the ones emitted during the search — confirm that is intended.
warnings.filterwarnings("ignore")

# Each extraction method is searched over its own n_components candidates:
# fractions for PCA, integer counts for LDA.
extraction_options = {
    'PCA': [0.9, 0.95, 0.99],
    'LDA': [15, 30, 50],
}

# One sub-grid per method, each crossed with the same selection thresholds.
grid_params = [
    {
        'select_from_model__threshold': [50, 100, 200],
        'feature_extraction__method': [method],
        'feature_extraction__n_components': components,
    }
    for method, components in extraction_options.items()
]

# 3-fold cross-validated search scored by negative mean squared error.
grid = GridSearchCV(
    full_pipeline_with_model,
    grid_params,
    verbose=3,
    cv=3,
    scoring='neg_mean_squared_error',
)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[Pipeline] ......... (step 1 of 5) Processing transform, total=   0.1s
[Pipeline] ............. (step 2 of 5) Processing dense, total=   0.0s
[Pipeline] . (step 3 of 5) Processing select_from_model, total=   0.8s
PCA 0.9
[Pipeline]  (step 4 of 5) Processing feature_extraction, total=   0.2s
Iteration 1, loss = 10119393.06951630
Iteration 2, loss = 6990575.40649996
Iteration 3, loss = 6563364.73673316
Iteration 4, loss = 6360087.10664787
Iteration 5, loss = 6258423.84217725
Iteration 6, loss = 6165563.63787481
Iteration 7, loss = 6021690.78435800
Iteration 8, loss = 5847821.31295034
Iteration 9, loss = 5693274.41143966
Iteration 10, loss = 5543840.09152872
[Pipeline] ............. (step 5 of 5) Processing model, total=  11.5s
[CV 1/3] END feature_extraction__method=PCA, feature_extraction__n_components=0.9, select_from_model__threshold=50;, score=-11185956.683 total time=  12.7s
[Pipeline] ......... (step 1 of 5) Processing tr

In [6]:
# Display the parameter combination that scored best in the cross-validated search.
grid.best_params_

{'feature_extraction__method': 'LDA',
 'feature_extraction__n_components': 30,
 'select_from_model__threshold': 200}

In [7]:
# Keep the winning pipeline from the grid search. GridSearchCV is created with
# its default refit behaviour, so best_estimator_ is the estimator refit with
# the best parameters found.
model = grid.best_estimator_
# Display the selected pipeline.
model

Here we compute the test-set error (RMSE) of the tuned pipeline, i.e. with our preprocessing:

In [None]:
# Fit `full_pipeline_with_model` itself on the training split.
# NOTE(review): the evaluation cell below uses `model` (grid.best_estimator_),
# and `pip` is not referenced in any visible cell — confirm whether this fit
# is still needed.
pip = full_pipeline_with_model.fit(X_train, y_train)

In [8]:
import numpy as np

# Root-mean-squared error of the selected pipeline on the held-out test split.
predicted = model.predict(X_test)
squared_residuals = (predicted - y_test) ** 2
np.sqrt(np.sum(squared_residuals) / len(predicted))

2720.486519244561

Here we compute the same test-set error (RMSE) for the baseline pipeline, i.e. without our preprocessing:

In [None]:
# `sim` is another name for the baseline pipeline object built earlier (no copy is made).
sim = baseline_full_pipeline
# Fit the baseline on the same training split used for the grid search.
sim.fit(X_train, y_train)

In [4]:
import numpy as np

# Root-mean-squared error of the baseline pipeline on the held-out test split.
predicted = sim.predict(X_test)
squared_residuals = (predicted - y_test) ** 2
np.sqrt(np.sum(squared_residuals) / len(predicted))

3769.899483967153

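We can see a clear improvement: the test-set RMSE drops from about 3770 for the baseline to about 2720 with our preprocessing, a reduction of roughly 28%.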