In [1]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import os
from sklearn.linear_model import LinearRegression as LR
from sklearn import linear_model
import sklearn.tree
# help(sklearn.tree)
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import ExtraTreesRegressor as ETR

In [2]:
# Load the pre-built train / validation / test splits.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only open pickles that come from a trusted source.
with open('data/datasets.pkl', 'rb') as f:
    d = pickle.load(f)

# Each split is a mapping with the feature matrix under "X" and the
# targets under "Y"; the shapes are printed as a quick sanity check.
X_train, y_train = (d["train"][key] for key in ("X", "Y"))
print(X_train.shape)
print(y_train.shape)

X_validation, y_validation = (d["validation"][key] for key in ("X", "Y"))
print(X_validation.shape)
print(y_validation.shape)

X_test, y_test = (d["test"][key] for key in ("X", "Y"))
print(X_test.shape)
print(y_test.shape)

(945215, 175)
(945215,)
(202545, 175)
(202545,)
(202546, 175)
(202546,)


In [3]:
def get_cost(model, X, y, weights=0):
    """Return the halved mean squared error of ``model`` on ``(X, y)``.

    The cost is ``sum((model.predict(X) - y) ** 2) / (2 * m)`` where ``m`` is
    ``len(y)``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns array-like predictions.
    X : array-like
        Inputs handed unchanged to ``model.predict``.
    y : array-like
        Target values. May be laid out differently from the predictions
        (e.g. a column vector ``(m, 1)`` against predictions of shape
        ``(m,)``) as long as both hold the same number of elements.
    weights : any, optional
        Unused; kept only so existing calls that pass it keep working.

    Returns
    -------
    numpy.float64
        The halved mean squared error.

    Raises
    ------
    ValueError
        If predictions and targets do not contain the same number of elements.
    """
    pred = np.asarray(model.predict(X))
    target = np.asarray(y)
    m = len(target)

    # Without this alignment a (m,) prediction minus a (m, 1) target would be
    # broadcast to an (m, m) matrix and silently yield a meaningless cost.
    if pred.shape != target.shape:
        if pred.size != target.size:
            raise ValueError(
                "predictions have shape %s but targets have shape %s"
                % (pred.shape, target.shape))
        pred = pred.reshape(target.shape)

    return np.sum((pred - target) ** 2) / 2. / m

In [6]:
# Random-forest baseline: 15 trees, fitted on all available cores.
# Only the two non-default settings are passed. The remaining keyword
# arguments that used to be spelled out here were the library defaults, and
# three of them (criterion='mse', max_features='auto', min_impurity_split)
# were removed from later scikit-learn releases, so passing them makes the
# cell fail there. Dropping them keeps the same model configuration.
# NOTE(review): random_state is left unset, so results vary between runs.
models = [RFR(n_estimators=15, n_jobs=-1)]

# One cost per model, in the same order as `models`.
train_costs = []
validation_costs = []
test_costs = []
for clf in models:
    print ("Training: " + str(clf))
    clf.fit(X_train, y_train)

    # Halved mean squared error on each split (computed by get_cost).
    train_cost = get_cost(clf, X_train, y_train)
    validation_cost = get_cost(clf, X_validation, y_validation)
    test_cost = get_cost(clf, X_test, y_test)

    train_costs.append(train_cost)
    validation_costs.append(validation_cost)
    test_costs.append(test_cost)

    print (clf)
    # %d truncates the costs to whole numbers in the printed summary.
    print ("Training cost: %d, validation cost: %d, test cost: %d\n" %(train_cost, validation_cost, test_cost))

Training: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Training cost: 42, validation cost: 212, test cost: 218



In [12]:
# Candidate regressors compared against each other in this cell.
models = [
    LR(n_jobs=-1),
    linear_model.Ridge(alpha=0.00001),
    DTR(min_samples_split=10),
    GBR(min_samples_split=10),
    RFR(min_samples_split=10),
]

# Per-model costs, appended in the same order as `models`.
train_costs, validation_costs, test_costs = [], [], []

for clf in models:
    print("Training: " + str(clf))
    clf.fit(X_train, y_train)

    # Score the fitted model on the train, validation and test splits, in
    # that order.
    train_cost, validation_cost, test_cost = (
        get_cost(clf, features, targets)
        for features, targets in (
            (X_train, y_train),
            (X_validation, y_validation),
            (X_test, y_test),
        )
    )

    train_costs.append(train_cost)
    validation_costs.append(validation_cost)
    test_costs.append(test_cost)
    summary = "Training cost: %d, validation cost: %d, test cost: %d\n" % (
        train_cost, validation_cost, test_cost)
    print(summary)

Training: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
Training cost: 511, validation cost: 550, test cost: 520

Training: Ridge(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Ridge(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Training cost: 511, validation cost: 550, test cost: 520

Training: DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_d

In [15]:
# Disabled experiment: fit an AdaBoost regressor and report its cost on the
# train / validation / test splits. The whole cell is commented out.
# NOTE(review): if re-enabled as written, `ABR = ABR()` would rebind the
# imported AdaBoostRegressor alias to an instance; use a separate variable name.
# ABR = ABR()
# ABR.fit(X_train, y_train)

# train_cost = get_cost(ABR, X_train, y_train)
# validation_cost = get_cost(ABR, X_validation, y_validation)
# test_cost = get_cost(ABR, X_test, y_test)

# print ("Training cost: %d, validation cost: %d, test cost: %d\n" %(train_cost, validation_cost, test_cost))

In [17]:
# Extra-trees baseline, fitted on all available cores.
# The fitted model gets its own name: assigning it back to `ETR` would
# replace the imported ExtraTreesRegressor class alias with an instance, and
# running this cell a second time would then fail because an estimator
# instance is not callable.
etr_model = ETR(n_jobs=-1)
etr_model.fit(X_train, y_train)

# Halved mean squared error on each split (computed by get_cost).
train_cost = get_cost(etr_model, X_train, y_train)
validation_cost = get_cost(etr_model, X_validation, y_validation)
test_cost = get_cost(etr_model, X_test, y_test)

print ("Training cost: %d, validation cost: %d, test cost: %d\n" %(train_cost, validation_cost, test_cost))

Training cost: 14, validation cost: 267, test cost: 296



In [None]:
# MLPRegressor is also imported in the notebook's first cell; the import is
# repeated here so this cell does not depend on that one having run.
from sklearn.neural_network import MLPRegressor

# Hyper-parameters of the multi-layer perceptron.
layer_sizes = [500, 300, 200, 50, 10]   # units per hidden layer
alpha = 0.005
batch_size = 200
# learning_rate = "adaptive"
learning_rate_init = 0.0008

# verbose=5 makes the fit report its loss after each iteration.
MPLR = MLPRegressor(
    hidden_layer_sizes=layer_sizes,
    alpha=alpha,
    batch_size=batch_size,
    learning_rate_init=learning_rate_init,
    verbose=5,
)
MPLR.fit(X_train, y_train)

Iteration 1, loss = 271.06213029
Iteration 2, loss = 216.59963985
Iteration 3, loss = 194.16647129


In [None]:
# Score the fitted MLP on the train, validation and test splits, in that
# order, then print the three costs (truncated to whole numbers by %d).
train_cost, validation_cost, test_cost = (
    get_cost(MPLR, features, targets)
    for features, targets in (
        (X_train, y_train),
        (X_validation, y_validation),
        (X_test, y_test),
    )
)

summary = "Training cost: %d, validation cost: %d, test cost: %d\n" % (
    train_cost, validation_cost, test_cost)
print(summary)