# AML Task 1

In [80]:
#@title Imports

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.linear_model import Ridge, HuberRegressor, RANSACRegressor, LogisticRegression, LinearRegression, BayesianRidge, ARDRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, IsolationForest, BaggingRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.feature_selection import SelectKBest, SelectFromModel, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.linear_model import TheilSenRegressor
from xgboost import XGBRegressor
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPRegressor

from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SequentialFeatureSelector

from scipy.stats import levene

# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

In [81]:
use_drive = True
if use_drive:
  PATH = "drive/My Drive/AML_HS22/task1/"
  from google.colab import drive
  drive.mount('/content/drive')
else:
  PATH = "./"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
#@title Load data and generate training validation split

def load_data():
    X = np.asarray(pd.read_csv(PATH + "X_train.csv"))
    y = np.asarray(pd.read_csv(PATH + "y_train.csv"))
    X_test = np.asarray(pd.read_csv(PATH + "X_test.csv"))

    # delete id column
    X = np.delete(X, 0, 1)
    y = np.delete(y, 0, 1)
    X_test = np.delete(X_test, 0, 1)
    print("X shape %s" %str(X.shape))
    print("y shape %s" %str(y.shape))
    print("y.ravel() shape %s" %str(y.ravel().shape))
    print("X_test shape %s" %str(X_test.shape))
    
    return X, y, X_test


X, y, X_test = load_data()


X shape (1212, 832)
y shape (1212, 1)
y.ravel() shape (1212,)
X_test shape (776, 832)


In [83]:
#@title Feature Select
# # sel = SelectFromModel(RandomForestRegressor(n_estimators=1000, min_samples_leaf=5, n_jobs=-1))
# sel = SequentialFeatureSelector(HistGradientBoostingRegressor(random_state=1), n_features_to_select=90) # back to 1000 estimators
# # sel = SequentialFeatureSelector(RandomForestRegressor(n_estimators=10, min_samples_leaf=2, n_jobs=-1), n_features_to_select=20) # back to 1000 estimators
# # sel = SelectFromModel(RandomForestRegressor(n_estimators=100, min_samples_leaf=5, n_jobs=-1), max_features=25)
# sel.fit(X, y.ravel())
# X = sel.transform(X)
# #X_train = sel.transform(X_train)
# #X_val = sel.transform(X_val)
# X_test = sel.transform(X_test)
# print("X shape %s" %str(X.shape))

In [84]:
#@title Impute

print("before impute: %s" %str(X[1][:10]))

imputer = SimpleImputer(strategy='median')
# imputer = IterativeImputer()
# imputer = KNNImputer(n_neighbors=10, weights='distance')
imputer.fit(X)
# X_train = imputer.transform(X_train)
# X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)
X = imputer.transform(X)

print("after impute: %s" %str(X[1][:10]))

before impute: [1.77570376e+04            nan 4.10101627e+03 9.29595276e+04
            nan 9.98551677e+01 1.00139594e+04 1.08266075e+04
 1.00761016e+04 1.14369696e+01]
after impute: [1.77570376e+04 1.08394831e+04 4.10101627e+03 9.29595276e+04
 1.05029940e+02 9.98551677e+01 1.00139594e+04 1.08266075e+04
 1.00761016e+04 1.14369696e+01]


In [85]:
#@title Precompute
def simple_precompute(X, y, X_test):

  # normalize data
  scaler = MinMaxScaler()
  scaler.fit(X)
  X = scaler.fit_transform(X)
  X_test = scaler.fit_transform(X_test)

  # remove low variance features
  sel = VarianceThreshold(threshold=(0.001))
  X = sel.fit_transform(X)
  X_test = sel.fit_transform(X_test)

  b = []
  for i in range(len(X[0])):
    stat, p = levene(y[0], X[:, i], center = 'mean')
    if p < 0.1:
      b.append(True)
    else:
      b.append(False)
  # print(b)
  X = X[:, b]
  X_test = X_test[:, b]
  print("X shape %s" %str(X.shape))
  print("X_test shape %s" %str(X_test.shape))

  #yX = np.c_[y, X].T
  #print("yX shape %s" %str(yX.shape))
  #cov = np.absolute(np.cov(yX))
  #tr = np.triu(cov, k=0)
  #print("cov shape %s" %str(tr.shape))
  #print(tr)


  return X, y, X_test

# simple_precompute(X,y,X_test)
# X, y, X_test = simple_precompute(X,y,X_test)

In [86]:
#@title Feature Select
# sel = SelectFromModel(RandomForestRegressor(n_estimators=1000, min_samples_leaf=5, n_jobs=-1))
sel = SelectFromModel(RandomForestRegressor(n_estimators=1000, min_samples_leaf=5, n_jobs=-1, random_state=1), threshold="mean") # back to 1000 estimators
# sel = RFECV(RandomForestRegressor(n_estimators=25, min_samples_leaf=5, n_jobs=-1, random_state=1), step=10, verbose=10) # back to 1000 estimators
# sel = RFE(RandomForestRegressor(n_estimators=10, min_samples_leaf=5, n_jobs=-1, random_state=1), n_features_to_select=90, verbose=10) # back to 1000 estimators
# sel = SelectFromModel(XGBRegressor(objective='reg:squarederror', n_estimators=300, random_state=2), max_features=90)
# sel = SelectFromModel(RandomForestRegressor(n_estimators=100, min_samples_leaf=5, n_jobs=-1), max_features=25)
sel.fit(X, y.ravel())
X = sel.transform(X)
#X_train = sel.transform(X_train)
#X_val = sel.transform(X_val)
X_test = sel.transform(X_test)
print("X shape %s" %str(X.shape))

X shape (1212, 87)


In [87]:
#@title Outlier Filtering

def filter_data(X, y, X_test):
  clf = IsolationForest(max_samples=100, random_state=3)
  tmp = np.concatenate([X, X_test])
  print("X/X_test shape %s" %str(tmp.shape))
  clf.fit(tmp)
  X_booleans = clf.predict(X)
  X_out = []
  y_out = []
  for i, b in enumerate(X_booleans):
    if b == 1:
      X_out.append(X[i])
      y_out.append(y[i])
  X = np.asarray(X_out)
  y = np.asarray(y_out)
  
  print("X shape %s" %str(X.shape))
  print("y shape %s" %str(y.shape))

  return X, y

X, y = filter_data(X, y, X_test)



X/X_test shape (1988, 87)
X shape (1104, 87)
y shape (1104, 1)


In [88]:
#@title cross validation on random forest to get a feel how well it does
et = RandomForestRegressor(n_estimators=300, min_samples_leaf=2, n_jobs=-1, random_state=2)
cval = cross_validate(et, X, y, scoring='r2', n_jobs=-1)
score = np.average(cval['test_score'])
print("RandomForestRegressor: %f" %score)

# ablation study
# RandomForestRegressor: 0.500328 KNN-imputer, outlier filtering, feature selection
# RandomForestRegressor: 0.592574 KNN-imputer, feature selection, outlier filtering
# RandomForestRegressor: 0.599434 median imputer, selectfrommodel rf 82 threshold=mean features feature selection, outlier filtering
# RandomForestRegressor: 0.514511 median imputer, selectfrommodel rf 82 threshold=mean features feature selection, no outlier filtering
# RandomForestRegressor: 0.598990 median imputer, selectfrommodel rf 82 threshold=mean features feature selection, outlier filtering IsolationForest200
# RandomForestRegressor: 0.595044 median imputer, selectfrommodel rf 82 threshold=mean features feature selection, outlier filtering IsolationForest200 bootstrap
# RandomForestRegressor: 0.596161 median imputer, selectfrommodel rf 60 threshold=1.25*mean features feature selection, outlier filtering
# RandomForestRegressor: 0.591115 median imputer, selectfrommodel rf 120 threshold=0.75*mean features feature selection, outlier filtering
# RandomForestRegressor: 0.50.... median imputer, selectfrommodel rf 82 threshold=median features feature selection, outlier filtering
# RandomForestRegressor: 0.596890 median imputer, selectfrommodel rf 50 features feature selection, outlier filtering
# RandomForestRegressor: 0.587515 median imputer, selectfrommodel xgboost 90 features feature selection, outlier filtering
# RandomForestRegressor: 0.583112 with RFECV 562 features

RandomForestRegressor: 0.596254


In [89]:
#@title Split Data
def split_data(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
    y_train = y_train.ravel()
    y_val = y_val.ravel()
    y = y.ravel()

    print("X shape %s" %str(X.shape))
    print("y shape %s" %str(y.shape))
    print("X_train shape %s" %str(X_train.shape))
    print("y_train shape %s" %str(y_train.shape))
    print("X_val shape %s" %str(X_val.shape))
    print("y_val shape %s" %str(y_val.shape))

    return X_train, X_val, y_train, y_val, X, y

# X_train, X_val, y_train, y_val, X, y = split_data(X, y)

In [90]:
#@title model comparison on cross validation
def compare_models():
  hsgr = HistGradientBoostingRegressor(max_iter=300, random_state=2)
  cval = cross_validate(hsgr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("HistGradientBoostingRegressor: %f" %score)

  xgb = XGBRegressor(n_estimators=300, random_state=2)
  cval = cross_validate(xgb, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("XGB: %f" %score)

  rf = RandomForestRegressor(n_estimators=100, max_features=15, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest: %f" %score)

  svr = make_pipeline(StandardScaler(), SVR(kernel="rbf", C=1.0, epsilon=0.2))
  cval = cross_validate(svr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("SVR: %f" %score)

  et = ExtraTreesRegressor(n_estimators=100, max_features=15, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(et, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("ExtraTrees: %f" %score)

  gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=0, loss='squared_error')
  cval = cross_validate(gbr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("GradientBoosting: %f" %score)

  abr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=0)
  cval = cross_validate(abr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("AdaBoost: %f" %score)

  estimators = [('xgb', xgb), ('gbr', gbr), ('hsgr', hsgr)]
  vr = VotingRegressor(estimators)
  cval = cross_validate(vr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("Voting 3: %f" %score)

  sr = StackingRegressor(estimators=estimators, final_estimator=Ridge())
  cval = cross_validate(sr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("Stacking 3: %f" %score)

  estimators = [('rf', rf), ('et', et), ('gbr', gbr), ('abr', abr)]
  vr = VotingRegressor(estimators)
  cval = cross_validate(vr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("Voting 4: %f" %score)

  sr = StackingRegressor(estimators=estimators, final_estimator=Ridge())
  cval = cross_validate(sr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("Stacking 4: %f" %score)

  estimators = [('rf', rf), ('et', et), ('xgb', xgb), ('gbr', gbr), ('hsgr', hsgr), ('abr', abr)]
  vr = VotingRegressor(estimators)
  cval = cross_validate(vr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("Voting 6: %f" %score)

  sr = StackingRegressor(estimators=estimators, final_estimator=Ridge())
  cval = cross_validate(sr, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("Stacking 6: %f" %score)

# compare_models()

# simple median imputer
# HistGradientBoostingRegressor: 0.628836
# XGB: 0.615902
# RandomForest: 0.595371
# SVR: 0.466644
# ExtraTrees: 0.579479
# GradientBoosting: 0.605367
# AdaBoost: 0.594406
# Voting 3: 0.635512
# Stacking 3: 0.639591
# Voting 4: 0.613904
# Stacking 4: 0.628063
# Voting 6: 0.631099
# Stacking 6: 0.645067

In [None]:
#@title Ensemble
# fit on entire dataset
hsgr = HistGradientBoostingRegressor(max_iter=1000, random_state=2)
rf = RandomForestRegressor(n_estimators=1000, min_samples_leaf=2, n_jobs=-1, random_state=2)
et = ExtraTreesRegressor(n_estimators=1000, min_samples_leaf=2, n_jobs=-1, random_state=2)
gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=5, random_state=0, loss='squared_error')
abr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=3000, random_state=0)
xgb = XGBRegressor(n_estimators=1000, random_state=2)
estimators = [('rf', rf), ('et', et), ('xgb', xgb), ('gbr', gbr), ('hsgr', hsgr), ('abr', abr)]
sr = StackingRegressor(estimators=estimators, final_estimator=Ridge())
clf = sr.fit(X, y)

y_test = sr.predict(X_test)
print(y_test[:10])

# for ridge
# ids = np.asarray(list(range(len(y_test)))).reshape((-1, 1)).astype(int)
# print(ids[:10])
# output_arr = np.hstack((ids, y_test))

# for SVR


  y = column_or_1d(y, warn=True)


[58.0670046  77.8361733  71.99262782 75.80547739 73.58093443 56.13041615
 63.38074222 70.33375836 66.6017805  60.58107339]


In [None]:
#@title Output
y_test = clf.predict(X_test)
print(y_test[:10])
ids = np.asarray(list(range(len(y_test)))).astype(int)
output_arr = np.column_stack((ids, y_test))
print(output_arr[:10])
np.savetxt(PATH + 'y_test.csv', output_arr, delimiter=',', header="id,y", comments='', fmt=["%d","%f"])

[58.0670046  77.8361733  71.99262782 75.80547739 73.58093443 56.13041615
 63.38074222 70.33375836 66.6017805  60.58107339]
[[ 0.         58.0670046 ]
 [ 1.         77.8361733 ]
 [ 2.         71.99262782]
 [ 3.         75.80547739]
 [ 4.         73.58093443]
 [ 5.         56.13041615]
 [ 6.         63.38074222]
 [ 7.         70.33375836]
 [ 8.         66.6017805 ]
 [ 9.         60.58107339]]


# Other Experiments

In [None]:
feature_count = [30, 35, 45]
for fcount in feature_count:
  # sel = SelectFromModel(RandomForestRegressor(n_estimators=1000, min_samples_leaf=5, n_jobs=-1))
  sel = SelectFromModel(RandomForestRegressor(n_estimators=300, min_samples_leaf=5, n_jobs=-1, random_state=1), max_features=fcount) # back to 1000 estimators
  # sel = SelectFromModel(RandomForestRegressor(n_estimators=100, min_samples_leaf=5, n_jobs=-1), max_features=25)
  sel.fit(X, y)
  X_p = sel.transform(X)
  print("X shape %s" %str(X_p.shape))

  rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X_p, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest %d: %f" %(fcount, score))

# RandomForest 10: 0.473281
# RandomForest 20: 0.502336
# RandomForest 30: 0.514247
# RandomForest 35: 0.521251
# RandomForest 40: 0.523973
# RandomForest 45: 0.524660 <<<


# it seems 45 features overfits the training set, but generalizes not so well to the test set on the project server

# RandomForest 50: 0.521228
# RandomForest 55: 0.522183
# RandomForest 60: 0.522521
# RandomForest 65: 0.520342
# RandomForest 70: 0.519508
# RandomForest 80: 0.515016
# RandomForest 100 (only got 82): 0.514511

  self.estimator_.fit(X, y, **fit_params)


X shape (1104, 14)
RandomForest 30: 0.573641


  self.estimator_.fit(X, y, **fit_params)


X shape (1104, 14)
RandomForest 35: 0.573641


  self.estimator_.fit(X, y, **fit_params)


X shape (1104, 14)
RandomForest 45: 0.573641


In [None]:
feature_count = [40, 60, 80, 90, 100, 150]
for fcount in feature_count:
  # sel = SelectFromModel(RandomForestRegressor(n_estimators=1000, min_samples_leaf=5, n_jobs=-1))
  sel = SelectFromModel(RandomForestRegressor(n_estimators=300, min_samples_leaf=5, n_jobs=-1, random_state=1), max_features=fcount) # back to 1000 estimators
  # sel = SelectFromModel(RandomForestRegressor(n_estimators=100, min_samples_leaf=5, n_jobs=-1), max_features=25)
  sel.fit(X, y)
  X_p = sel.transform(X)
  print("X shape %s" %str(X_p.shape))

  sel = SelectFromModel(RandomForestRegressor(n_estimators=300, min_samples_leaf=5, n_jobs=-1, random_state=1), max_features=fcount//2) # back to 1000 estimators
  # sel = SelectFromModel(RandomForestRegressor(n_estimators=100, min_samples_leaf=5, n_jobs=-1), max_features=25)
  sel.fit(X_p, y)
  X_p = sel.transform(X_p)
  print("X shape %s" %str(X_p.shape))

  rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X_p, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest %d: %f" %(fcount, score))

# X shape (1212, 40)
# X shape (1212, 7)
# RandomForest 40: 0.454094
# X shape (1212, 60)
# X shape (1212, 11)
# RandomForest 60: 0.477203
# X shape (1212, 80)
# X shape (1212, 14)
# RandomForest 80: 0.487155
# X shape (1212, 82)
# X shape (1212, 14)
# RandomForest 90: 0.487155

  self.estimator_.fit(X, y, **fit_params)


X shape (1104, 14)


ValueError: ignored

In [None]:
'''
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=2, n_jobs=-1, random_state=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
print(f'>>> r2_score: {r2_score(y_val, y_pred)} <<<')

clf = IsolationForest(max_samples=100, random_state=3)
clf.fit(X_train)
X_train_booleans = clf.predict(X_train)
print(np.asarray(X_train_booleans))

X_train_prime = []
y_train_prime = []
for i, b in enumerate(X_train_booleans):
  if b == 1:
    X_train_prime.append(X_train[i])
    y_train_prime.append(y_train[i])
X_train = np.asarray(X_train_prime)
y_train = np.asarray(y_train_prime)
print("X_train shape %s" %str(X_train.shape))

rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=2, n_jobs=-1, random_state=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
print(f'>>> r2_score: {r2_score(y_val, y_pred)} <<<')
'''

# output
# >>> r2_score: 0.47233612984084306 <<<
# X_train shape (969, 87)
# X_train shape (884, 87)
# >>> r2_score: 0.40993444633687215 <<<

# conclusion: removing outliers from training set does not improve performance

In [None]:
# est = TheilSenRegressor()
# cval = cross_validate(est, X, y, scoring='r2', n_jobs=-1)
# score = np.average(cval['test_score'])
# print("Est: %f" %score)
# Est: 0.319679

In [None]:
max_features = [5, 7, 9, 12, 14, 15, 16, 18, 20, 25, 30, 35, 40, 50, 70]
for n in max_features:
  rf = ExtraTreesRegressor(n_estimators=300, max_features=n, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("ExtraTreesRegressor max_features %d: %f" %(n, score))

In [None]:
max_features = [5, 7, 9, 12, 14, 15, 16, 18, 20, 25, 30, 35, 40, 50, 70]
for n in max_features:
  rf = RandomForestRegressor(n_estimators=300, max_features=n, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest max_features %d: %f" %(n, score))

In [None]:
max_samples = [50, 100, 200, 300, 500, 969]
for n in max_samples:
  rf = RandomForestRegressor(n_estimators=300, max_samples=n, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest max_samples %d: %f" %(n, score))

In [None]:
ntrees = [50, 100, 200, 300, 500, 1000, 2000]
for n in ntrees:
  rf = RandomForestRegressor(n_estimators=n, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest ntrees %d: %f" %(n, score))

In [None]:
max_depth = [2, 3, 4, 5, 7, 9, 13, 17, 25, 100]
for d in max_depth:
  rf = RandomForestRegressor(n_estimators=300, max_depth=d, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest max_depth %d: %f" %(d, score))

In [None]:
max_depth = [2, 3, 4, 5, 7, 9, 13, 17, 25, 50, 100]
for d in max_depth:
  rf = ExtraTreesRegressor(n_estimators=300, max_depth=d, min_samples_leaf=2, n_jobs=-1, random_state=2)
  cval = cross_validate(rf, X, y, scoring='r2', n_jobs=-1)
  score = np.average(cval['test_score'])
  print("RandomForest max_depth %d: %f" %(d, score))

In [None]:
def get_best():
  msl = [2, 3, 4]
  # msl = [5]
  scores = []
  for n in msl:
    print(f"min_samples_leaf={n}")
    # print("RandomForest")
    # est = RandomForestRegressor(n_estimators=100, min_samples_leaf=n, n_jobs=-1)
    # cval = cross_validate(est, X, y, scoring='r2', n_jobs=-1)
    # score = np.average(cval['test_score'])
    # print(score)
    # scores.append(score)

    print("ExtraTrees")
    est = ExtraTreesRegressor(n_estimators=100, min_samples_leaf=n, n_jobs=-1)
    cval = cross_validate(est, X, y, scoring='r2', n_jobs=-1)
    score = np.average(cval['test_score'])
    print(score)
    scores.append(score)

    # print("RandomForest")
    # est = RandomForestRegressor(n_estimators=1000, min_samples_leaf=n, n_jobs=-1)
    # cval = cross_validate(est, X, y, scoring='r2', n_jobs=-1)
    # score = np.average(cval['test_score'])
    # print(score)
    # scores.append(score)

    # print("ExtraTrees")
    # est = ExtraTreesRegressor(n_estimators=1000, min_samples_leaf=n, n_jobs=-1)
    # cval = cross_validate(est, X, y, scoring='r2', n_jobs=-1)
    # score = np.average(cval['test_score'])
    # print(score)
    # scores.append(score)

    # min_samples_leaf=5
    # ExtraTrees
    # 0.5167717523131301

  print(scores)
  ind = np.argmax(scores)
  print('min_samples_leaf: ' + str(msl[ind]))
  print('scores: ' + str(scores[ind]))

  clf = RandomForestRegressor(min_samples_leaf=msl[ind], n_jobs=-1)

  return clf

get_best()

In [None]:
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0, loss='squared_error')
cval = cross_validate(est, X, y, scoring='r2', n_jobs=-1)
score = np.average(cval['test_score'])
print(score)
# # 0.45947468851701745

In [None]:
est = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=0 # 0.45931666632586976
)
cval = cross_validate(est, X, y, scoring='r2', n_jobs=-1)
score = np.average(cval['test_score'])
print(score)

## Model fitting

In [None]:
def fit_classifier(X_data, y_data):
  print("fit_classifier")
  # clf = Ridge(alpha=1000000.0)
  # clf = LinearRegression()
  # clf = LogisticRegression()
  # clf = RANSACRegressor(random_state=0)
  # clf = SVR(kernel="rbf", C=1.0, epsilon=0.2)
  # clf = make_pipeline(Normalizer(), SVR(C=1.0, epsilon=0.2)) # did worse than default SVR
  # clf = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)) # did worse than default SVR
  # clf = make_pipeline(PolynomialFeatures(2), Ridge(alpha=1.0))
  # clf = Pipeline([
  #   ('feature_selection', SelectFromModel(LinearSVR())),
  #   ('classification', RandomForestClassifier())
  #   ])
  #clf = RandomForestClassifier(n_estimators=1000, random_state=1, max_features=8)
  clf = MLPRegressor(hidden_layer_sizes=(10,1000,1))
  # clf = ExtraTreesRegressor(n_estimators=1000, min_samples_leaf=5, n_jobs=-1)

  clf.fit(X_data, y_data.ravel())
  return clf

clf = fit_classifier(X_train, y_train)

## Model evaluation

In [None]:
def print_stats(y_val, y_val_pred):
  print(f'>>> r2_score: {r2_score(y_val, y_val_pred)} <<<')
  # print(f'mean_squared_error: {mean_squared_error(y_val, y_val_pred)}')
  # print(f'root_mean_squared_error: {mean_squared_error(y_val, y_val_pred, squared=False)}')
  # print(f'mean_absolute_error: {mean_absolute_error(y_val, y_val_pred)}')
  # print(f'median_absolute_error: {median_absolute_error(y_val, y_val_pred)}')

y_pred = clf.predict(X_val)
print_stats(y_val, y_pred)
# BaggingRegressor          0.2970000173416478
# AdaBoostRegressor         0.3079485558905629
# ExtraTreesRegressor       0.37519049014637373
# GradientBoostingRegressor 0.416384019256161 
# RandomForestRegressor     0.39808813525261155 
# XGBRegressor (default)    0.4148294673073579 
# MLPRegressor (10,1)     -49.35169434996628
  

## Generate test predictions

In [None]:
# et = ExtraTreesRegressor(n_estimators=100, max_features=20, min_samples_leaf=2, n_jobs=-1, random_state=2)
# cval = cross_validate(et, X, y, scoring='r2', n_jobs=-1)
# score = np.average(cval['test_score'])
# print("ExtraTrees: %f" %score)

# ExtraTrees w max_features=87: 0.529854
# ExtraTrees w max_features=20: 0.511661

clf = MLPRegressor(hidden_layer_sizes=(10,1,), activation = 'relu', random_state=2)
cval = cross_validate(clf, X, y, scoring='r2', n_jobs=-1)
score = np.average(cval['test_score'])
print(score)

In [None]:
from google.colab import runtime
runtime.unassign()