# **H2O AutoML Regression**

YPD medium | Target: median molecules per cell, log-transformed (`log1p`)

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
from math import sqrt
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [None]:
import h2o
from h2o.automl import H2OAutoML
# Initialise the H2O connection before any H2OFrame / AutoML call below.
# NOTE(review): presumably this starts a local cluster when none is already
# running — confirm against the H2O documentation.
h2o.init()

## Data loading and transformation

In [None]:
# Read the tab-separated training table and replace the abundance column by
# log(1 + value); every later cell works on this log1p scale until the
# values are mapped back with expm1.
abundance_column = 'Median molecules per cell'
data = pd.read_csv("./YPD_trainingdata.csv", sep='\t')
log_abundance = np.log1p(data[abundance_column])
data[abundance_column] = log_abundance

In [None]:
# Convert the pandas DataFrame into an H2OFrame. `data` is rebound, so the
# pandas version is no longer reachable under this name.
data = h2o.H2OFrame(data)

In [None]:
# Target column (log1p-transformed when the data was loaded).
y = 'Median molecules per cell'

# Remove the two identifier columns so they are not offered as predictors.
no_id = data.drop(['Protein(Ensembl)', 'Standard Name'])

# Predictors: every remaining column except the target.
x = no_id.columns
x.remove(y)

# Seeded split with ratio 0.75: the first frame is used for training, the
# second is held out.
split = no_id.split_frame(ratios = [0.75], seed = 9)
train, valid = split

## Model configuration and training

In [None]:
# AutoML search settings, gathered in one place.
# NOTE: no seed is passed (a `seed = 9` argument used to be commented out
# here), so the search itself is not seeded.
automl_settings = {
    "max_runtime_secs": 21600,  # 21600 s = 6 h time budget
    # NOTE(review): presumably None leaves the model count uncapped so only
    # the time budget stops the search — confirm against the H2O docs.
    "max_models": None,
    "nfolds": 10,
    "project_name": "H2O_AutoML_Regression",
}
model = H2OAutoML(**automl_settings)

In [None]:
# Run the AutoML search on the training split; the held-out `valid` split is
# passed as the leaderboard frame.
# NOTE(review): the evaluation cells further down score the leader on this
# same `valid` frame, so those numbers come from data that also took part in
# ranking the models — presumably optimistic; confirm whether a separate
# hold-out split is wanted.
model.train(x = x, y = y, training_frame = train, leaderboard_frame = valid)

In [None]:
# Show the AutoML leaderboard as the cell output.
model.leaderboard

## Model evaluation

In [None]:
# Known targets and leader-model predictions for the validation frame, both
# as (n, 1) NumPy arrays on the log1p scale used for training.
y_valid = valid[y]
y_valid = y_valid.as_data_frame()
y_valid = np.array(y_valid)

predict_valid = model.predict(valid)
predict_valid = predict_valid.as_data_frame()
predict_valid = np.array(predict_valid)

# Naive baseline: always predict the mean of the training targets.
# The baseline must have the same (n, 1) shape as y_valid; subtracting a
# 1-D array from an (n, 1) array would broadcast to an (n, n) matrix and
# make the averaged "baseline error" meaningless.
baseline_value = np.array(train[y].as_data_frame()).mean()
baseline_preds = np.full(y_valid.shape, baseline_value)
baseline_errors = abs(baseline_preds - y_valid)

# Absolute errors of the model on the log1p scale.
errors = abs(predict_valid - y_valid)

# Percentage error is undefined where the true value is 0 (log1p(0) == 0);
# those entries are left as NaN and skipped by nanmean instead of turning
# the whole average into inf.
mape = 100 * np.divide(errors, y_valid,
                       out=np.full(errors.shape, np.nan),
                       where=y_valid != 0)
accuracy = 100 - np.nanmean(mape)

In [None]:
# H2O's own performance report for the leader model on the validation frame.
print(model.leader.model_performance(valid))
print("----------", '\n')

# Summary figures computed in the previous cell (log1p scale).
for label, values in (("Average baseline error: ", baseline_errors),
                      ("Mean absolute error: ", errors)):
    print(label, round(np.mean(values),2))
print("Accuracy: ", round(accuracy, 2), "%", "\n")

# Goodness-of-fit scores from scikit-learn.
print("Explained variance regression score: ", explained_variance_score(y_valid, predict_valid))
print("R2 score: ", r2_score(y_valid, predict_valid), '\n')

# Error metrics, evaluated and printed one at a time in this order so that a
# failure in a later metric does not suppress the earlier lines.
error_metrics = [
    ("Maximum residual error: ", max_error),
    ("Median absolute error: ", median_absolute_error),
    ("Mean absolute error: ", mean_absolute_error),
    ("Mean squared error: ", mean_squared_error),
    ("Root mean squared error:", lambda truth, pred: sqrt(mean_squared_error(truth, pred))),
    ("Mean squared logarithmic error: ", mean_squared_log_error),
]
for label, metric in error_metrics:
    print(label, metric(y_valid, predict_valid))

## Correlation between experimental data and predicted values

In [None]:
# Map targets and predictions back from the log1p scale to molecule counts.
# NOTE: both arrays are overwritten in place, so running this cell a second
# time applies expm1 twice.
y_valid = np.expm1(y_valid)
predict_valid = np.expm1(predict_valid)

# Correlations are computed on the flattened 1-D views of both arrays.
known_flat, predicted_flat = y_valid.ravel(), predict_valid.ravel()
pearson = stats.pearsonr(known_flat, predicted_flat)
spearman = stats.spearmanr(known_flat, predicted_flat)

print("Pearson's r:", pearson[0], 'p-value:', pearson[1])
print("Spearman's r:", spearman[0], 'p-value:', spearman[1], '\n')

In [None]:
# Two-column frame of known vs. predicted abundance for plotting.
plot_data = pd.DataFrame({
    'Known abundance': y_valid.ravel(),
    'Predicted abundance': predict_valid.ravel(),
})

# Scatter plot with a fitted regression line.
sns.regplot(data=plot_data, x='Known abundance', y='Predicted abundance')

## Predicted values

In [None]:
# Fixed-width listing: row index (8 chars), known value (20 chars), prediction.
row_template = '{!s:<8}{!s:<20}{!s}'

print(row_template.format('', 'Eval data', 'Prediction'))
for i, (eval_row, pred_row) in enumerate(zip(y_valid, predict_valid)):
    print(row_template.format(i, eval_row, pred_row))

## ecYeast8 protein prediction

In [None]:
# Read the tab-separated prediction table and put its abundance column on the
# same log1p scale that was applied to the training data.
abundance_column = 'Median molecules per cell'
yeast8 = pd.read_csv("./YPD_predictiondata.csv", sep='\t')
log_abundance = np.log1p(yeast8[abundance_column])
yeast8[abundance_column] = log_abundance

In [None]:
# Convert the pandas DataFrame into an H2OFrame. `yeast8` is rebound, so the
# pandas version is no longer reachable under this name.
yeast8 = h2o.H2OFrame(yeast8)

In [None]:
# Target column, same name as in the training data.
y = 'Median molecules per cell'

# Remove the identifier columns; the result is used directly as the test frame.
test = yeast8_no_id = yeast8.drop(['Protein(Ensembl)', 'Standard Name'])

# Predictor names for this frame (every column except the target).
# NOTE(review): `x` is rebuilt here but nothing in the cells below reads it —
# confirm whether it is still needed.
x = yeast8_no_id.columns
x.remove(y)

In [None]:
# Known targets and leader-model predictions for the ecYeast8 frame, both as
# (n, 1) NumPy arrays on the log1p scale used for training.
# (The predictions keep the name `predict_valid` because the following cells
# read them under that name.)
y_test = test[y]
y_test = y_test.as_data_frame()
y_test = np.array(y_test)

predict_valid = model.predict(test)
predict_valid = predict_valid.as_data_frame()
predict_valid = np.array(predict_valid)

# Naive baseline: always predict the mean of the training targets.
# The baseline must have the same (n, 1) shape as y_test; subtracting a
# 1-D array from an (n, 1) array would broadcast to an (n, n) matrix and
# make the averaged "baseline error" meaningless.
baseline_value = np.array(train[y].as_data_frame()).mean()
baseline_preds = np.full(y_test.shape, baseline_value)
baseline_errors = abs(baseline_preds - y_test)

# Absolute errors of the model on the log1p scale.
errors = abs(predict_valid - y_test)

# Percentage error is undefined where the true value is 0 (log1p(0) == 0);
# those entries are left as NaN and skipped by nanmean instead of turning
# the whole average into inf.
mape = 100 * np.divide(errors, y_test,
                       out=np.full(errors.shape, np.nan),
                       where=y_test != 0)
accuracy = 100 - np.nanmean(mape)

In [None]:
# H2O's own performance report for the leader model on the test frame.
print(model.leader.model_performance(test))
print("----------", '\n')

# Summary figures computed in the previous cell (log1p scale).
for label, values in (("Average baseline error: ", baseline_errors),
                      ("Mean absolute error: ", errors)):
    print(label, round(np.mean(values),2))
print("Accuracy: ", round(accuracy, 2), "%", "\n")

# Goodness-of-fit scores from scikit-learn.
print("Explained variance regression score: ", explained_variance_score(y_test, predict_valid))
print("R2 score: ", r2_score(y_test, predict_valid), '\n')

# Error metrics, evaluated and printed one at a time in this order so that a
# failure in a later metric does not suppress the earlier lines.
error_metrics = [
    ("Maximum residual error: ", max_error),
    ("Median absolute error: ", median_absolute_error),
    ("Mean absolute error: ", mean_absolute_error),
    ("Mean squared error: ", mean_squared_error),
    ("Root mean squared error:", lambda truth, pred: sqrt(mean_squared_error(truth, pred))),
    ("Mean squared logarithmic error: ", mean_squared_log_error),
]
for label, metric in error_metrics:
    print(label, metric(y_test, predict_valid))

## Correlation between experimental data and predicted values

In [None]:
# Map targets and predictions back from the log1p scale to molecule counts.
# NOTE: both arrays are overwritten in place, so running this cell a second
# time applies expm1 twice.
predict_valid = np.expm1(predict_valid)
y_test = np.expm1(y_test)

# Correlations are computed on the flattened 1-D views of both arrays.
known_flat, predicted_flat = y_test.ravel(), predict_valid.ravel()
pearson = stats.pearsonr(known_flat, predicted_flat)
spearman = stats.spearmanr(known_flat, predicted_flat)

print("Pearson's r:", pearson[0], 'p-value:', pearson[1])
print("Spearman's r:", spearman[0], 'p-value:', spearman[1], '\n')

In [None]:
# Two-column frame of known vs. predicted abundance for plotting.
plot_data = pd.DataFrame({
    'Known abundance': y_test.ravel(),
    'Predicted abundance': predict_valid.ravel(),
})

# Scatter plot with a fitted regression line.
sns.regplot(data=plot_data, x='Known abundance', y='Predicted abundance')

## Predicted values

In [None]:
# Fixed-width listing: row index (8 chars), known value (20 chars), prediction.
row_template = '{!s:<8}{!s:<20}{!s}'

print(row_template.format('', 'Eval data', 'Prediction'))
for i, (eval_row, pred_row) in enumerate(zip(y_test, predict_valid)):
    print(row_template.format(i, eval_row, pred_row))