In [1]:
from tpot import TPOTRegressor
from tpot.export_utils import generate_import_code, generate_export_pipeline_code
from tpot.export_utils import export_pipeline, expr_to_tree
from sklearn.model_selection import train_test_split, cross_val_score
# from tpot.config.classifier_nn import classifier_config_nn
from sklearn.pipeline import make_pipeline
from tpot.config import classifier_config_dict_light
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
import numpy as np
import os
import glob
from ipywidgets import IntProgress

In [2]:
# Search budget, passed to TPOTRegressor as population_size / generations.
n_pop = 100
n_gen = 100

# Source-code fragments that write_pipes stitches into the exported module.
# import_impute: extra import line needed only when imputation code is emitted.
import_impute = 'from sklearn.impute import SimpleImputer\n\n'
# make_func: header of the generated function.
make_func = 'def opt_pipe(training_features, testing_features):\n'
# impute_text: tab-indented body lines that fit a median imputer on the
# training features and apply it to both feature sets.
impute_text = ''.join(
    '\t' + statement + '\n'
    for statement in (
        'imputer = SimpleImputer(strategy="median")',
        'imputer.fit(training_features)',
        'training_features = imputer.transform(training_features)',
        'testing_features = imputer.transform(testing_features)',
    )
)

def write_pipes(name, tpot):
    """Write the optimized TPOT pipeline to ``<name>.py`` as an ``opt_pipe`` function.

    The generated module consists of:
      * the import block produced by ``generate_import_code``, with its
        ``import numpy as np`` / ``import pandas as pd`` lines swapped for
        ``FunctionTransformer`` and ``copy`` imports;
      * a function ``opt_pipe(training_features, testing_features)`` that
        (only when ``tpot._imputed`` is truthy) median-imputes both feature
        sets, builds the exported pipeline and returns a dict with the keys
        ``'train_feat'``, ``'test_feat'`` and ``'pipe'``.

    Parameters
    ----------
    name : str
        Output path without the ``.py`` suffix. Any missing parent directory
        is created.
    tpot : fitted TPOT estimator
        Must expose ``_optimized_pipeline``, ``operators``, ``_pset`` and
        ``_imputed``.
    """
    import os  # local import keeps this function self-contained

    import_codes = generate_import_code(tpot._optimized_pipeline, tpot.operators)
    pipeline_codes = generate_export_pipeline_code(
        expr_to_tree(tpot._optimized_pipeline, tpot._pset), tpot.operators
    )
    pipe_text = import_codes.replace('import numpy as np\nimport pandas as pd\n', 'from sklearn.preprocessing import FunctionTransformer\nfrom copy import copy\n')
    if tpot._imputed: # add impute code when there is missing data
        pipe_text += import_impute + make_func + impute_text
    else:
        pipe_text += make_func
    pipe_text += '\n\texported_pipeline = ' + pipeline_codes + "\n\treturn({'train_feat': training_features, 'test_feat': testing_features, 'pipe': exported_pipeline})"

    # Create the destination folder up front so a missing directory does not
    # throw away the result of a long optimization run.
    out_dir = os.path.dirname(name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The context manager closes the file even if the write fails.
    with open(name + '.py', 'w') as f:
        f.write(pipe_text)

In [3]:
# Work on a copy: assigning the imported dict directly would alias TPOT's
# module-level config, and adding a key below would then mutate it for every
# other user of classifier_config_dict_light in this kernel.
# NOTE(review): the base is the *classifier* light config although the search
# uses TPOTRegressor; confirm its selector/transformer entries are the ones
# intended for a regression target.
personal_config = dict(classifier_config_dict_light)

personal_config['sklearn.neural_network.MLPRegressor'] = {
    # MLPRegressor search space for neural networks
    # TODO: revisit/tweak: alpha, momentum, learning_rate_init
    # TODO: separate parameters based on activation
    'hidden_layer_sizes': [4],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
    'learning_rate_init': [1e-3, 1e-2, 1e-1, 0.5, 0.75, 0.9],
    'momentum': [0.1, 0.5, 0.75, 0.9]
}


In [4]:
# --- Experiment settings ---------------------------------------------------
# Where the data lives and which column is the regression target.
data_dir = 'qsar/'
label = 'Act'

# Optimization metric and seed handed to TPOT.
scoring_function = 'r2'
random_state = 1618

# Only referenced by the disabled data-set discovery snippet below.
path = ''
extension = 'csv'

# Disabled: enumerate every CSV in data_dir to loop over all data sets.
# os.chdir(path + data_dir)
# data_sets = [i.replace(".csv", "") for i in glob.glob('*.{}'.format(extension))]
# print(data_sets[0:10])
# os.chdir('..')


In [5]:
# Base name (no extension) of the data set to process; it is used to build the
# input CSV path and the output file names.
dat_name = 'METAB_training_preprocessed'

In [6]:
# for dat_name in data_sets:
dat_name = 'METAB_training_preprocessed'
accuracy_ls = []  # one [internal CV score, held-out score] row per run

# Load the data set; rows are indexed by the MOLECULE column.
tpot_data = pd.read_csv(data_dir + dat_name + '.csv', index_col = 'MOLECULE')
# Every column except the target is a feature.
Xdata = tpot_data.loc[:, tpot_data.columns != label]
Ydata = tpot_data[label]

# Seed the split as well: without random_state the held-out set changes on
# every run, so the test score is not reproducible even though TPOT is seeded.
# NOTE(review): train_test_split shuffles rows by default, so the
# TimeSeriesSplit folds below do not follow the file's row order; confirm
# that this is intended.
X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata,
                                                    train_size=0.75, test_size=0.25,
                                                    random_state=random_state)

# Restrict the search to a fixed three-step pipeline shape ending in the
# MLPRegressor entry of personal_config.
tpot = TPOTRegressor(generations = n_gen, 
                     population_size = n_pop, 
                     verbosity = 2,
                     config_dict = personal_config,
                     scoring = scoring_function,
                     random_state = random_state,
                     cv = TimeSeriesSplit(n_splits=5),
                     template = 'Selector-Transformer-MLPRegressor')

In [7]:
# Run the TPOT search on the training split only; X_test / y_test stay held
# out for the final score. With verbosity=2 the best internal CV score is
# reported after each generation.
tpot.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=10100), HTML(value='')))

Generation 1 - Current best internal CV score: 0.5317005880583912
Generation 2 - Current best internal CV score: 0.5317005880583912
Generation 3 - Current best internal CV score: 0.5396567340418444
Generation 4 - Current best internal CV score: 0.5396567340418444
Generation 5 - Current best internal CV score: 0.5547953217564304
Generation 6 - Current best internal CV score: 0.5607768309159804
Generation 7 - Current best internal CV score: 0.5607768309159804
Generation 8 - Current best internal CV score: 0.5633701922593184
Generation 9 - Current best internal CV score: 0.571365683923249
Generation 10 - Current best internal CV score: 0.5725664255969718
Generation 11 - Current best internal CV score: 0.5734255565537448
Generation 12 - Current best internal CV score: 0.5734255565537448
Generation 13 - Current best internal CV score: 0.5734255565537448
Generation 14 - Current best internal CV score: 0.5734255565537448
Generation 15 - Current best internal CV score: 0.5734255565537448
Gener

TPOTRegressor(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT...00.0], 'learning_rate_init': [0.001, 0.01, 0.1, 0.5, 0.75, 0.9], 'momentum': [0.1, 0.5, 0.75, 0.9]}},
       crossover_rate=0.1,
       cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
       disable_update_check=False, early_stop=None, generations=100,
       max_eval_time_mins=5, max_time_mins=None, memory=None,
       mutation_rate=0.9, n_jobs=1, offspring_size=None,
       periodic_checkpoint_folder=None, population_size=100,
       random_state=1618, scoring='r2', subsample=1.0,
       template='Selector-Transformer-MLPRegressor', use_dask=False,
       verbosity=2, warm_start=False)

In [8]:
# Record the best internal CV score next to the score on the held-out split.
# NOTE: re-running this cell appends another row to accuracy_ls.
accuracy_ls.append([tpot._optimized_pipeline_score, tpot.score(X_test, y_test)])

# Make sure both output folders exist so a missing directory cannot discard
# the result of the (long) fit above.
for out_dir in ('pipelines', 'accuracies'):
    os.makedirs(out_dir, exist_ok=True)

# Shared file stem: <data set>_<seed>
run_tag = dat_name + '_' + str(random_state)

# tpot.export('pipelines/' + run_tag + '.py')
write_pipes('pipelines/' + run_tag, tpot)

# Column headers say "Accuracy" for historical reasons; TPOT was configured
# with scoring_function, so that is the metric these values are expected to hold.
accuracy_mat = pd.DataFrame(accuracy_ls, columns = ['Training CV Accuracy', 'Testing Accuracy'])
# Tab-separated despite the .csv suffix.
accuracy_mat.to_csv("accuracies/" + run_tag + ".csv", sep = "\t")