In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor

In [2]:
def encodePair(s_parent, s_mutant):
    '''
    Encodes a single pair of sequences into a flat numpy array.

    Every position contributes 8 values: a one-hot block (order A, T, G, C)
    for the parent base, followed by a one-hot block for the mutant base.
    The mutant block is left all-zero when the mutant base equals the
    parent base, so only actual substitutions are marked.

    Parameters
    ----------
    s_parent, s_mutant : str
        Sequences over the alphabet A/T/G/C; they must have equal length.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length 8 * len(s_parent); empty for empty input.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    KeyError
        If a base that has to be encoded is not one of A, T, G, C.
    '''

    # zip() would silently truncate to the shorter sequence, which yields
    # feature vectors of inconsistent length; fail loudly instead.
    if len(s_parent) != len(s_mutant):
        raise ValueError(
            'sequences must have the same length, got %d and %d'
            % (len(s_parent), len(s_mutant))
        )

    # Position of the "hot" entry inside a 4-wide one-hot block.
    base_index = {'A': 0, 'T': 1, 'G': 2, 'C': 3}

    # Preallocating the output also covers the empty-sequence case, where
    # np.concatenate on an empty list would raise.
    pair_encoded = np.zeros(8 * len(s_parent), dtype=int)
    for pos, (b_parent, b_mutant) in enumerate(zip(s_parent, s_mutant)):
        offset = 8 * pos
        pair_encoded[offset + base_index[b_parent]] = 1
        if b_mutant != b_parent:
            # The mutant block starts 4 entries after the parent block.
            pair_encoded[offset + 4 + base_index[b_mutant]] = 1
    return pair_encoded

In [3]:
# Make a grid of MLP parameters

# Fractions used to select the train/validation/test split files.
fractions = [0.5, 0.33, 0.1, 0.01]

# Hidden-layer layouts, one tuple entry per hidden layer. The trailing comma
# is required for single-layer layouts: (100) is just the int 100, whereas
# (100,) is a one-element tuple like the multi-layer entries.
architectures = [(100,), (100, 100), (100, 100, 100), (50,), (50, 50), (50, 50, 50)]
activations = ['relu', 'logistic']

# Every architecture is combined with every activation; architectures vary
# slowest, so rows come out as (arch0, relu), (arch0, logistic), (arch1, relu), ...
grid = [[arch, act] for arch in architectures for act in activations]

# The row index of df_grid identifies a hyper-parameter combination; the grid
# is saved to disk as well.
df_grid = pd.DataFrame(grid, columns=['arch', 'act'])
df_grid.to_csv('hyperparams_grid.csv')

In [10]:
# Train and predict for each of the fractions and parameter combinations
#
# For every fraction the train/validation/test splits are read from 'splits/',
# the sequence pairs are encoded with encodePair, and one MLPRegressor per row
# of df_grid is fitted on the training split. The validation predictions are
# written to 'results/', one CSV per (fraction, grid row).

# DataFrame.to_csv does not create missing directories; make sure the output
# directory exists before any training time is spent.
os.makedirs('results', exist_ok=True)

for f in fractions:

    # Tag used in the split file names, e.g. 0.33 -> '0_33'.
    split_tag = str(f).replace('.', '_')
    train_data_file = 'splits/local_train_' + split_tag + '.csv'
    val_data_file = 'splits/local_val_' + split_tag + '.csv'
    test_data_file = 'splits/local_test_' + split_tag + '.csv'

    df_train = pd.read_csv(train_data_file, index_col=0)
    df_val = pd.read_csv(val_data_file, index_col=0)
    # The test split is loaded but not used in this cell.
    df_test = pd.read_csv(test_data_file, index_col=0)

    train_features = [encodePair(sb, sw) for sb, sw in zip(df_train['seq_b'], df_train['seq_w'])]
    train_labels = df_train['deltaC'].tolist()
    val_features = [encodePair(sb, sw) for sb, sw in zip(df_val['seq_b'], df_val['seq_w'])]
    val_labels = df_val['deltaC'].tolist()

    # NOTE: result file names drop the dot entirely (0.33 -> '033'), unlike
    # the split file names above which replace it with an underscore.
    results_tag = str(f).replace('.', '')

    for index, row in df_grid.iterrows():

        # The grid row index identifies the hyper-parameter combination.
        results_file = 'results/local_split_' + results_tag + '_' + str(index) + '.csv'

        model = MLPRegressor(
            random_state=1,
            max_iter=500,
            hidden_layer_sizes=row['arch'],
            activation=row['act'],
        )
        model.fit(train_features, train_labels)
        pred = model.predict(val_features)

        # Keep the validation frame's index so predictions can be matched
        # back to the original rows.
        df_res = pd.DataFrame({'obs': val_labels, 'pred': pred}, index=df_val.index)
        df_res.to_csv(results_file)