# **AdaBoost**

All media | Median molecules log transformed

## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import os
from joblib import dump
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from math import sqrt
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

## Data loading and transformation

In [None]:
# Load the tab-separated training table.
data = pd.read_csv("./ALL_trainingdata.csv", sep='\t')

# The layout is positional: the column at index 2 is the regression target
# and every column from index 3 onwards is used as a feature.
col = list(data.columns)
target_col = col[2]
features = col[3:]

# Feature matrix as a plain array; the target is log1p-transformed and kept
# as a single-column 2-D array. No feature/target scaling is applied here.
X = data[features].values
y = np.reshape(np.log1p(data[target_col].values), (-1, 1))

## Model configuration and training

In [None]:
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, RobustScaler, Normalizer

def _stacked(estimator):
    """Wrap *estimator* in a TPOT ``StackingEstimator`` pipeline step."""
    return StackingEstimator(estimator=estimator)


# Base learner handed to AdaBoost: a long chain of stacked estimators and
# scalers that ends in a RandomForestRegressor.
# NOTE(review): `LassoLarsCV(normalize=...)` and the `base_estimator=` keyword
# used further down are only accepted by older scikit-learn releases --
# confirm the pinned version before upgrading.
base = make_pipeline(
    _stacked(LassoLarsCV(normalize=True)),
    _stacked(LinearSVR(C=0.01, dual=True, epsilon=0.001, loss="epsilon_insensitive", tol=0.1)),
    MaxAbsScaler(),
    _stacked(RidgeCV()),
    Normalizer(norm="l2"),
    _stacked(LinearSVR(C=0.5, dual=False, epsilon=0.1, loss="squared_epsilon_insensitive", tol=0.1)),
    _stacked(
        ExtraTreesRegressor(
            bootstrap=False,
            max_features=0.4,
            min_samples_leaf=2,
            min_samples_split=4,
            n_estimators=100,
        )
    ),
    MinMaxScaler(),
    _stacked(RidgeCV()),
    _stacked(LinearSVR(C=5.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)),
    _stacked(RidgeCV()),
    _stacked(SGDRegressor()),
    RobustScaler(),
    _stacked(LinearSVR(C=15.0, dual=True, epsilon=0.01, loss="epsilon_insensitive", tol=0.1)),
    _stacked(ElasticNetCV(l1_ratio=0.75, tol=0.001)),
    _stacked(
        XGBRegressor(
            learning_rate=0.1,
            max_depth=1,
            min_child_weight=6,
            n_estimators=100,
            nthread=1,
            objective="reg:squarederror",
            subsample=0.6500000000000001,
        )
    ),
    MinMaxScaler(),
    _stacked(
        ExtraTreesRegressor(
            bootstrap=False,
            max_features=0.2,
            min_samples_leaf=2,
            min_samples_split=4,
            n_estimators=100,
        )
    ),
    _stacked(LinearSVR(C=5.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)),
    MaxAbsScaler(),
    RandomForestRegressor(
        bootstrap=False,
        max_features=0.05,
        min_samples_leaf=1,
        min_samples_split=4,
        n_estimators=100,
    ),
)

# Run configuration. 'test_size' drives the train/validation split; every
# other entry is forwarded to AdaBoostRegressor as a keyword argument.
parameters = dict(
    test_size=0.25,
    base_estimator=base,
    n_estimators=100,      # library default is 50
    learning_rate=0.3,     # library default is 1.0
    loss='linear',
    random_state=9,        # library default is None
)

# Hold out a validation subset; the split seed is fixed so it is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=parameters['test_size'],
    random_state=9,
)

# Build the booster from everything in `parameters` except the split size.
model = AdaBoostRegressor(
    **{name: value for name, value in parameters.items() if name != 'test_size'}
)

In [None]:
# Fit the booster; the (n, 1) target column is flattened to 1-D for fitting.
model.fit(X_train, np.ravel(y_train))

## Model evaluation

In [None]:
# The validation targets are used as-is (the inverse-scaling step is
# disabled), i.e. they stay on the log1p scale they were split on.
y_rescaled = y_valid

# Predictions reshaped to a column so they line up element-wise with
# y_rescaled.
predict_valid = np.reshape(model.predict(X_valid), (-1, 1))

# Naive baseline: predict the mean of the training targets for every row.
# All arrays are kept as (n, 1) columns so the subtractions below are
# element-wise and never broadcast a 1-D vector against a column into an
# n x n matrix.
baseline_preds = np.full_like(y_rescaled, np.mean(y_train))
baseline_errors = np.abs(baseline_preds - y_rescaled)
errors = np.abs(predict_valid - y_rescaled)

# Percentage error is undefined where the true value is exactly 0 (the
# division would yield inf/nan), so those rows are left out of the MAPE.
nonzero_target = y_rescaled != 0
mape = 100 * (errors[nonzero_target] / y_rescaled[nonzero_target])
accuracy = 100 - np.mean(mape)

In [None]:
# Summary of the hand-computed error figures for the validation split.
print("Average baseline error: ", round(np.mean(baseline_errors), 2))
print("Mean absolute error: ", round(np.mean(errors), 2))
print("Accuracy: ", round(accuracy, 2), "%", "\n")

# Goodness-of-fit scores.
print(
    "Explained variance regression score: ",
    explained_variance_score(y_rescaled, predict_valid),
)
print("R2 score: ", r2_score(y_rescaled, predict_valid), "\n")

# Error metrics that share the (y_true, y_pred) call shape are reported in a
# loop; each metric is evaluated right before it is printed.
for label, metric in (
    ("Maximum residual error: ", max_error),
    ("Median absolute error: ", median_absolute_error),
    ("Mean absolute error: ", mean_absolute_error),
    ("Mean squared error: ", mean_squared_error),
):
    print(label, metric(y_rescaled, predict_valid))
print("Root mean squared error:", sqrt(mean_squared_error(y_rescaled, predict_valid)))
print(
    "Mean squared logarithmic error: ",
    mean_squared_log_error(y_rescaled, predict_valid),
)

## Correlation between experimental data and predicted values

In [None]:
# Correlation between known and predicted validation values; both arrays are
# flattened to 1-D first.
known_flat = y_rescaled.ravel()
predicted_flat = predict_valid.ravel()

pearson = stats.pearsonr(known_flat, predicted_flat)
spearman = stats.spearmanr(known_flat, predicted_flat)

# Index 0 of each result is the coefficient, index 1 the p-value.
print("Pearson's r:", pearson[0], 'p-value:', pearson[1])
print("Spearman's r:", spearman[0], 'p-value:', spearman[1], '\n')

In [None]:
# Two-column frame (known vs. predicted) feeding the regression scatter plot.
plot_data = pd.DataFrame(
    {
        'Known abundance': y_rescaled.ravel(),
        'Predicted abundance': predict_valid.ravel(),
    }
)

sns.regplot(data=plot_data, x='Known abundance', y='Predicted abundance')

## Predicted values

In [None]:
# Undo the log1p transform so the table shows values on the original scale.
# NOTE: both names are overwritten in place, so running this cell twice
# applies expm1 twice.
y_rescaled = np.expm1(y_rescaled)
predict_valid = np.expm1(predict_valid)

# Fixed-width row layout: index, known value, predicted value.
fmt = '%-8s%-20s%s'

print(fmt % ('', 'Eval data', 'Prediction'))
for row_idx, (known_row, predicted_row) in enumerate(zip(y_rescaled, predict_valid)):
    print(fmt % (row_idx, known_row, predicted_row))

## Model testing with ecYeast7

In [None]:
# Load the tab-separated test table.
yeast7 = pd.read_csv("./testingdata.csv", sep='\t')

# Same positional layout as the training table: index 2 is the known target,
# features start at index 3.
col_test = list(yeast7.columns)
test_known = col_test[2]
# NOTE(review): the upper slice bound is the *training* column count
# (len(col)), not len(col_test) -- presumably to keep the feature count equal
# to what the model was fitted on; confirm this is intended.
test_features = col_test[3:len(col)]

# Feature matrix plus the log1p-transformed target as a single column.
# No scaling is applied.
X_test = yeast7[test_features].values
y_test = np.reshape(np.log1p(yeast7[test_known].values), (-1, 1))

In [None]:
# The test targets are used as-is (the inverse-scaling step is disabled).
test_rescaled = y_test

# Predict on the test features and reshape the result into a column.
predict_yeast7 = model.predict(X_test).reshape(-1, 1)

In [None]:
# Naive baseline: predict the mean of the training targets for every test
# row. All arrays stay (n, 1) columns so the subtractions are element-wise
# and never broadcast a 1-D vector against a column into an n x n matrix.
baseline_preds_test = np.full_like(test_rescaled, np.mean(y_train))
baseline_errors_test = np.abs(baseline_preds_test - test_rescaled)
errors_test = np.abs(predict_yeast7 - test_rescaled)

# Percentage error is undefined where the true value is exactly 0 (the
# division would yield inf/nan), so those rows are left out of the MAPE.
nonzero_target_test = test_rescaled != 0
mape_test = 100 * (errors_test[nonzero_target_test] / test_rescaled[nonzero_target_test])
accuracy_test = 100 - np.mean(mape_test)

# Summary of the hand-computed error figures for the test set.
print("Average baseline error: ", round(np.mean(baseline_errors_test), 2))
print("Mean absolute error: ", round(np.mean(errors_test), 2))
print("Accuracy: ", round(accuracy_test, 2), "%", "\n")

# Goodness-of-fit scores.
print("Explained variance regression score: ", explained_variance_score(test_rescaled, predict_yeast7))
print("R2 score: ", r2_score(test_rescaled, predict_yeast7), '\n')

# Error metrics, each evaluated right before it is printed.
print("Maximum residual error: ", max_error(test_rescaled, predict_yeast7))
print("Median absolute error: ", median_absolute_error(test_rescaled, predict_yeast7))
print("Mean absolute error: ", mean_absolute_error(test_rescaled, predict_yeast7))
print("Mean squared error: ", mean_squared_error(test_rescaled, predict_yeast7))
print("Root mean squared error:", sqrt(mean_squared_error(test_rescaled, predict_yeast7)))
print("Mean squared logarithmic error: ", mean_squared_log_error(test_rescaled, predict_yeast7))

In [None]:
# Correlation between known and predicted values for the *test* set.
# The test arrays (test_rescaled / predict_yeast7) are used here, not the
# validation arrays, so the figures describe the data evaluated in this
# section.
pearson = stats.pearsonr(test_rescaled.ravel(), predict_yeast7.ravel())
spearman = stats.spearmanr(test_rescaled.ravel(), predict_yeast7.ravel())

# Index 0 of each result is the coefficient, index 1 the p-value.
print('Pearson\'s r:', pearson[0], 'p-value:', pearson[1])
print('Spearman\'s r:', spearman[0], 'p-value:', spearman[1])

In [None]:
# Two-column frame (known vs. predicted) for the test-set regression plot.
plot_data = pd.DataFrame(
    {
        'Known abundance': test_rescaled.ravel(),
        'Predicted abundance': predict_yeast7.ravel(),
    }
)

sns.regplot(data=plot_data, x='Known abundance', y='Predicted abundance')

In [None]:
# Undo the log1p transform so values are on the original scale.
# NOTE: both names are overwritten in place, so running this cell twice
# applies expm1 twice.
test_rescaled = np.expm1(test_rescaled)
predict_yeast7 = np.expm1(predict_yeast7)

# Fixed-width row layout: index, known value, predicted value.
fmt = '%-8s%-20s%s'

print(fmt % ('', 'Known abundance', 'Prediction'))
# The known values are read straight from the loaded table's
# 'Median molecules per cell' column.
known_values = yeast7['Median molecules per cell']
for row_idx, (known_row, predicted_row) in enumerate(zip(known_values, predict_yeast7)):
    print(fmt % (row_idx, known_row, predicted_row))

## ecYeast8 protein prediction

In [None]:
# Load the tab-separated table of proteins to predict.
ecyeast8 = pd.read_csv("./ALL_predictiondata.csv", sep='\t')

# Same positional layout as the training table: index 2 holds the target
# column, features start at index 3.
ecy8_col_test = list(ecyeast8.columns)
ecy8_pred_unknown = ecy8_col_test[2]
# NOTE(review): the upper slice bound is the *training* column count
# (len(col)), not len(ecy8_col_test) -- confirm this is intended.
ecy8_pred_features = ecy8_col_test[3:len(col)]

# Feature matrix plus the log1p-transformed target column as a single column.
X_pred = ecyeast8[ecy8_pred_features].values
y_pred = np.reshape(np.log1p(ecyeast8[ecy8_pred_unknown].values), (-1, 1))

In [None]:
# Predict on the prediction features and reshape the result into a column.
predict_ecyeast8 = model.predict(X_pred).reshape(-1, 1)

In [None]:
# Undo the log1p transform so predictions are on the original scale.
# NOTE: the name is overwritten in place, so running this cell twice applies
# expm1 twice.
predict_ecyeast8 = np.expm1(predict_ecyeast8)

# Fixed-width row layout: index, predicted value.
fmt = '%-8s%-20s'

print(fmt % ('', 'Prediction'))
for row_idx, predicted_row in enumerate(predict_ecyeast8):
    print(fmt % (row_idx, predicted_row))

In [None]:
# Write one prediction per line to a text file.
# Each entry of prot_list is written via str(), so a row that is itself a
# list appears in list notation (e.g. "[12.3]").
prot_list = predict_ecyeast8.tolist()

# The context manager closes the file even if a write fails part-way.
with open("pred_ecYeast8_ALL.txt", "w") as output:
    output.writelines(str(prot) + '\n' for prot in prot_list)