In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Carregando os pickles

In [3]:
def read_pickle(name):
    with open(name, 'rb') as openfile:
        while True:
            try:
                one_instance = pickle.load(openfile)
            except EOFError:
                break
    one_instance = np.asanyarray(one_instance)
    return one_instance

In [4]:
X_train = read_pickle('X_train.pickle')
X_test = read_pickle('X_test.pickle')
y_train = read_pickle('y_train.pickle')
y_test = read_pickle('y_test.pickle')

In [5]:
y_train.shape

(179,)

## Treinamento do modelo

In [6]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Avaliação do modelo

In [7]:
predictions = lin_reg.predict(X_test)

In [8]:
print(predictions.shape)
print(predictions)

(45,)
[  9.13004753   2.83624567   1.92152875   1.21787584   3.06499711
  -1.4607147    3.3138375   12.59788164   5.83844528  16.40845834
  13.73054377   4.59142163  19.63558727   8.65292422  71.64575436
   4.62661705   1.14745001   2.1552973   21.59214249  42.22017373
 276.79540245   8.20318348   8.97691839   3.0101234    2.8818395
  86.2318544   11.27970652   4.27321409  37.46031878   9.46510128
   1.00566847   3.6711758    1.76169072  10.19229868   4.48342671
  25.81105946   6.2603026   59.43870035   5.00267097   1.90372581
   2.5975345   -0.81984305   0.40858471   3.05630069  30.56664463]


In [9]:
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

7.620697190587371


## Decision Tree Regressor

In [10]:
from sklearn.tree import DecisionTreeRegressor

In [11]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [12]:
dt_predictions = dt_reg.predict(X_test)

In [13]:
print(dt_predictions.shape)
print(dt_predictions)

(45,)
[  3.   3.   1.   4.   0.   1.   2.   8.   6.  22.  12.   4.   4.   1.
  51.   6.   3.  10.   7.  49. 134.   6.   6.   4.   3.  67.  15.   1.
  26.   6.   2.   5.   1.   6.   6.  17.   9.  16.   1.   2.   4.   2.
   3.   4.  16.]


In [14]:
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_rmse = np.sqrt(dt_mse)
print(dt_rmse)

24.388977473896322


## Random Forest Regressor

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [16]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [17]:
rf_reg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [18]:
params_grid = {
    'bootstrap': [True, False],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100, 200, 400, 600],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [4, 5, 6, None],
}

In [None]:
cv_rf = GridSearchCV(estimator=rf_reg, param_grid=params_grid)
cv_rf.fit(X_train, y_train)

In [None]:
cv_rf.best_params_

In [None]:
rf_predictions = rf_reg.predict(X_test)

In [None]:
print(rf_predictions.shape)
print(rf_predictions)

In [None]:
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)