In [1]:
import numpy as np
import pandas as pd
from tpot import TPOTRegressor
import importlib
from utils import *

In [2]:
# Notebook-wide settings used by the cells below.
random_state = 1618   # seed; also part of the pipeline module / output file names
label = 'Act'         # name of the target column in the preprocessed CSVs
data_dir = 'qsar/'    # path prefix of the preprocessed training/test CSVs

# Dataset names: first column of the header-less listing file.
my_datasets = pd.read_csv('qsar_datasets.txt', header=None).iloc[:, 0].tolist()

In [3]:
def get_predictions(dat_name, random_state):
    """Fit the exported pipeline for one dataset and score it on the test set.

    Reads ``<data_dir><dat_name>_training_preprocessed.csv`` and
    ``<data_dir><dat_name>_test_preprocessed.csv`` (indexed by ``MOLECULE``),
    imports the module ``pipelines.<dat_name>_<random_state>``, and calls its
    ``opt_pipe(training_features, testing_features)``. The returned mapping is
    read for the keys ``'pipe'``, ``'train_feat'`` and ``'test_feat'``.

    Side effect: writes the test-set predictions to
    ``predictions/<dat_name>_<random_state>_test.csv``.

    Parameters
    ----------
    dat_name : str
        Dataset name used to build the CSV file names and the pipeline
        module name.
    random_state : int
        Seed applied to the pipeline; also part of the pipeline module name
        and of the output file name.

    Returns
    -------
    dict
        ``{'Dataset': dat_name, 'Testing R^2': <correlation>}``.
    """
    file_name = dat_name + '_' + str(random_state)

    # Read in the data:
    train_data = pd.read_csv(data_dir + dat_name + '_training_preprocessed.csv', index_col = 'MOLECULE')
    test_data = pd.read_csv(data_dir + dat_name + '_test_preprocessed.csv', index_col = 'MOLECULE')
    training_features = train_data.drop([label], axis=1)
    testing_features = test_data.drop([label], axis=1)
    training_target = train_data[label]
    testing_target = test_data[label]

    # Load the TPOT recommended pipeline for this dataset/seed:
    pipe_file = 'pipelines.' + dat_name + '_' + str(random_state)
    pipe = importlib.import_module(pipe_file, package=None)

    tpot_results = pipe.opt_pipe(training_features, testing_features)
    exported_pipeline = tpot_results['pipe']
    training_features = tpot_results['train_feat'] # in case imputation was done
    testing_features = tpot_results['test_feat'] # in case imputation was done

    # Set random_state:
    if hasattr(exported_pipeline, 'steps'): # if it is a pipeline
        # A TPOTRegressor instance is only needed for its recursive setter.
        tpot_obj = TPOTRegressor()
        tpot_obj._set_param_recursive(exported_pipeline.steps, 'random_state', random_state)
    elif hasattr(exported_pipeline, 'random_state'): # if it is a single estimator
        # set_params takes keyword arguments; the previous `**random_state`
        # tried to unpack an int and raised TypeError.
        exported_pipeline.set_params(random_state=random_state)

    # Fit the exported pipeline to the training data:
    exported_pipeline.fit(training_features, training_target)

    # Get predictions:
    pred = exported_pipeline.predict(testing_features)
    # testing_target is a Series, so the frame takes its MOLECULE index; the
    # identifiers are additionally stored in an explicit 'MOLECULE' column.
    predictions = pd.DataFrame({'MOLECULE': list(test_data.index), 'Act_pred': pred, 'Act': testing_target})
    predictions.to_csv('predictions/' + file_name + '_test.csv')

    # NOTE(review): np.corrcoef(...)[0, 1] is the Pearson correlation r, not
    # its square -- confirm whether 'Testing R^2' is meant to hold r ** 2.
    return {'Dataset': dat_name, 'Testing R^2': np.corrcoef(pred, testing_target)[0,1]}


In [4]:
# Restrict this run to two datasets (rebinds the my_datasets list set earlier).
my_datasets = ['METAB', 'HIVINT']

# Parameter grid: each dataset paired with the single seed.
mtypes = dict(datasets=my_datasets, seed=[random_state])
mtype_grid = expand_grid(mtypes) # data type grid

# Score every grid row; result_type='expand' turns each returned dict into
# one row of the `results` frame (keys become columns).
results = mtype_grid.apply(
    lambda row: get_predictions(row['datasets'], row['seed']),
    axis=1,
    result_type='expand',
)


  f = msb / msw


In [5]:
# Display the per-dataset scores collected from get_predictions.
results

Unnamed: 0,Dataset,Testing R^2
0,METAB,0.69645
1,HIVINT,0.457931


In [9]:
# Write the scores to a CSV file named after the seed.
# Alternative kept for reference (grid columns joined to the scores):
# final_results = pd.concat([mtype_grid, results], axis = 1)
results.to_csv(path_or_buf='MLP_results/' + str(random_state) + ".csv")