### Trying differents models

In [2]:
from preparingData import preparing_data # read data
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from math import sqrt # RMSE
from sklearn.metrics import mean_squared_error # error metric
# models
from sklearn import linear_model, tree, svm
from sklearn.cross_decomposition import PLSRegression
from sklearn.neighbors import KNeighborsRegressor

#### Preparing data

Loading data, defining features and target and training and testing sets. 

Using IQR score, measure of statistical dispersion, to treat outliers. See more in: https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

In [9]:
X,y=preparing_data()
X_scaled = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

IQR use droped out 53572 outliers samples. RMSE of Tree Regressor model with and without IQR: 3852.7015889440468 and 5703.552899898522. And for Linears Models: 
12073.589535066065 to 7954.5921481577525.

In [82]:
X.shape,X_out.shape

((852122, 6), (798550, 6))

#### Models

We tried _RadiusNeighborsRegressor_(interrupts kernel), _GaussianProcessRegressor_(memory error) and _KernelRidge_(memory error), but they return errors. 

Linear Models got same RMSEs(12073.2602195047).

_KNeighborsRegressor_ got a big one RMSE (13469.975288326723).

PLS Regression RMSE was the same as Linear Models.

In [12]:
def rmse(model):
    return sqrt(mean_squared_error(y_test, model.predict(X_test)))

In [4]:
## LINEAR
reg = linear_model.LinearRegression()
reg.fit(X_train,y_train)
reg.coef_

array([ 7.61862799e+02, -8.42749569e-02, -2.69134549e-01,  2.50627801e+01,
       -6.24265563e+01,  1.20014398e+00])

In [5]:
print(rmse(reg))

7954.5921481577525

In [25]:
reg = linear_model.LinearRegression()
reg = linear_model.Ridge(alpha=.1)
reg = linear_model.RidgeCV(normalize=True, cv=20)
reg = linear_model.LassoLars(alpha=.1)
reg = linear_model.BayesianRidge()

In [85]:
print(rmse(reg))


12073.260219504702

In [33]:
reg.fit(X_train,y_train)
reg.coef_
reg

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)

In [30]:
reg1.fit(X_train,y_train)


Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [10]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor
clf = linear_model.SGDRegressor(max_iter=10000, tol=1e-3)
clf.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=10000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=0.001, verbose=0,
       warm_start=False)

In [8]:
print(rmse(clf)) # without IQR

12075.100854291673

In [13]:
print(rmse(clf)) # with IQR

7955.736603127719


In [13]:
## ARVORE
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X_train, y_train)
clf.predict(X_test)

array([15250., 13112., 26905., ..., 20995., 31500., 23500.])

In [14]:
print(rmse(clf))

5703.552899898522

In [18]:
## KNeighborsRegressor 
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, y_train) 
print(rmse(neigh))

13469.975288326723

In [65]:
## PLSRegression
pls2 = PLSRegression(n_components=6)
pls2.fit(X_train, y_train)

PLSRegression(copy=True, max_iter=500, n_components=6, scale=True, tol=1e-06)

In [66]:
print(rmse(pls2))

12073.2602195047

In [None]:
## SVM
clf = svm.SVR()
clf.fit(X_train, y_train)
print(rmse(clf))