In [56]:
!wget https://www.openml.org/data/download/52617/tecator.arff
from scipy.io import arff
import pandas as pd
import numpy as np

arff_data = arff.loadarff('tecator.arff')
data = pd.DataFrame(arff_data[0])

total_components_count = 22
my_random_state = 0

--2022-12-17 17:43:19--  https://www.openml.org/data/download/52617/tecator.arff
Resolving www.openml.org (www.openml.org)... 131.155.11.11
Connecting to www.openml.org (www.openml.org)|131.155.11.11|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://api.openml.org/data/download/52617/tecator.arff [following]
--2022-12-17 17:43:20--  https://api.openml.org/data/download/52617/tecator.arff
Resolving api.openml.org (api.openml.org)... 131.155.11.11
Connecting to api.openml.org (api.openml.org)|131.155.11.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 250961 (245K) [text/plain]
Saving to: ‘tecator.arff.20’


2022-12-17 17:43:21 (630 KB/s) - ‘tecator.arff.20’ saved [250961/250961]



Drop the extrapolation data.

In [57]:
data = data[0:215]

Put the validation data aside (random_state is provided for reproducibility, and avoid contamination).

Training_data includes both training and testing (as they will be split depending on the model).

In [58]:

validation_data = data.sample(n=30, random_state=my_random_state)
training_data = data.drop(validation_data.index)


This function remembers the best model evaluated so far (according to its score on the testing set), so it  can be used in the end to assess its actual performance on the validation set.

In [59]:
from sklearn.model_selection import KFold
from itertools import combinations_with_replacement
from sklearn import clone
best_score = None
best_model = None
best_component_count = None


def evaluate_model(base_model, components_count=total_components_count, polynomial_degree=1):
  global best_score
  global best_model
  global best_component_count
  training_components = training_data[[f"principal_component_{i}" for i in range(1, components_count+1)]]
  training_fat = training_data['fat']
  scores_test = []
  scores_train = []
  if (polynomial_degree > 1):
    for degree in range(2, polynomial_degree+1):
      for combo in combinations_with_replacement(range(1, components_count+1), degree):
        training_components['_'.join([str(n) for n in combo])] = training_components[[f"principal_component_{i}" for i in combo]].prod(axis=1)
  cv = KFold(random_state=my_random_state, shuffle=True)
  for train_index, test_index in cv.split(training_data):
    X_train, X_test, y_train, y_test = training_components.iloc[train_index], training_components.iloc[test_index], training_fat.iloc[train_index], training_fat.iloc[test_index]
    model = clone(base_model)
    model.fit(X_train, y_train)
    scores_train.append(model.score(X_train, y_train))
    scores_test.append(model.score(X_test, y_test))
  if (best_score is None or (np.mean(scores_test) > best_score)):
    best_score = np.mean(scores_test)
    best_model = clone(base_model)
    best_component_count = components_count
    print(f'new best model: {best_model} using {best_component_count} components with a score of {best_score}')
  return (np.mean(scores_train), np.std(scores_train), np.mean(scores_test), np.std(scores_test))


Basic example and base line of use of evaluateModel.

In [60]:
def print_number(x):
  precision = 4
  return f'{x:.{precision}f}'


from sklearn import linear_model

for i in range(1, 23):
  score_train, std_train, score_test, std_test = evaluate_model(linear_model.LinearRegression(), i)
  print(f"A linear regression with the first {i} component(s) have a performance (R², std of R²) of {print_number(score_train)} (std: {print_number(std_train)}) on the training set, and {print_number(score_test)} (std: {print_number(std_test)}) on the testing set.")


new best model: LinearRegression() using 1 components with a score of 0.15450566811035588
A linear regression with the first 1 component(s) have a performance (R², std of R²) of 0.2011 (std: 0.0306) on the training set, and 0.1545 (std: 0.1422) on the testing set.
new best model: LinearRegression() using 2 components with a score of 0.16520972766812242
A linear regression with the first 2 component(s) have a performance (R², std of R²) of 0.2158 (std: 0.0277) on the training set, and 0.1652 (std: 0.1286) on the testing set.
new best model: LinearRegression() using 3 components with a score of 0.5253957163032359
A linear regression with the first 3 component(s) have a performance (R², std of R²) of 0.5695 (std: 0.0387) on the training set, and 0.5254 (std: 0.1636) on the testing set.
new best model: LinearRegression() using 4 components with a score of 0.875773756571318
A linear regression with the first 4 component(s) have a performance (R², std of R²) of 0.8913 (std: 0.0098) on the tr

In [61]:
print('Lasso', evaluate_model(linear_model.Lasso(), 22, polynomial_degree=2))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_components['_'.join([str(n) for n in combo])] = training_components[[f"principal_component_{i}" for i in combo]].prod(axis=1)


Lasso (0.9151572281094659, 0.004856124694133833, 0.8815243751767856, 0.025824393500973476)


In [62]:
from sklearn.neighbors import KNeighborsRegressor
print('KNeighborsRegressor', evaluate_model(KNeighborsRegressor(n_neighbors=5)))

KNeighborsRegressor (0.8772835111787781, 0.011777632223386672, 0.7961846899125573, 0.05304600307948361)


In [63]:
from sklearn.ensemble import RandomForestRegressor
print('RandomForestRegressor', evaluate_model(RandomForestRegressor(n_estimators=100)))

RandomForestRegressor (0.9765244991952834, 0.003371865259452384, 0.8021442237670341, 0.11047787593454221)


In [64]:
from sklearn.svm import SVR
print('SVR', evaluate_model(SVR()))

SVR (0.4322575404023068, 0.009660893957462835, 0.35008398823242787, 0.09109943450435609)


In [65]:
from sklearn.neural_network import MLPRegressor

def evaluate_NN(size, component_size = total_components_count):
  print(f'MLPRegressor with hidden layers of size {size} and {component_size} components: ', evaluate_model(MLPRegressor(hidden_layer_sizes=size, max_iter=100000, random_state=my_random_state), component_size))


This takes quite some time, so for demonstration purposes, I only take the winner (size = 2 & component_count = 14).

In [66]:
# for size in range(1, 8):
#   for component_count in range(3, min(23, 50 // size)):
#     evaluate_NN(size, component_count)
evaluate_NN(2, 14)

MLPRegressor with hidden layers of size 1 and 3 components:  (0.6083119922643515, 0.030394020947867327, 0.5568282822953341, 0.17130583528956908)
MLPRegressor with hidden layers of size 1 and 4 components:  (0.8927851404138696, 0.009310462583686237, 0.8773077275418102, 0.03992141702852902)
MLPRegressor with hidden layers of size 1 and 5 components:  (0.9331671802491848, 0.004359734920118528, 0.9215732149596582, 0.0204608459459247)
MLPRegressor with hidden layers of size 1 and 6 components:  (0.94809099171203, 0.003003802928733829, 0.9404693229077157, 0.013932831454565593)
MLPRegressor with hidden layers of size 1 and 7 components:  (0.9484949767144499, 0.0031403663853489076, 0.9404945251493257, 0.014696457474882994)
MLPRegressor with hidden layers of size 1 and 8 components:  (0.9509841339754164, 0.0026054653187758884, 0.9413289707134285, 0.011535471771624523)
MLPRegressor with hidden layers of size 1 and 9 components:  (0.9537605935354978, 0.0017470004368508002, 0.9449314567113397, 0.0

Final evaluation

In [67]:
best_model.fit(training_data[[f"principal_component_{i}" for i in range(1, best_component_count + 1)]], training_data['fat'])
print(best_model.score(validation_data[[f"principal_component_{i}" for i in range(1, best_component_count + 1)]], validation_data['fat']))

0.9907263091807177
