In [27]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
import pickle as pkl

In [28]:
def hot1Encode(S):
    """One-hot encode a nucleotide sequence into a flat list of 0/1 integers.

    Each character of ``S`` is replaced by a 4-element indicator vector in
    the order A, T, G, C, and the vectors are concatenated, so the result
    has ``4 * len(S)`` entries.

    Parameters
    ----------
    S : iterable of str
        Sequence of single-character bases; only 'A', 'T', 'G' and 'C'
        (upper case) are recognised.

    Returns
    -------
    list of int
        Concatenated one-hot vectors; empty list for an empty sequence.

    Raises
    ------
    KeyError
        If ``S`` contains a character other than 'A', 'T', 'G' or 'C'.
    """
    base_vectors = {
        'A': [1, 0, 0, 0],
        'T': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'C': [0, 0, 0, 1],
    }
    # Flatten the per-base vectors into a single list, preserving base order.
    return [bit for base in S for bit in base_vectors[base]]

In [29]:
# Fractions of the dataset rows to sample; the training loop runs once per
# value and uses the same fraction for both the train and the test subset.
fractions = [
    0.001, 0.003, 0.01,
    0.03, 0.1, 0.3,
]

# CSV file holding the dataset; the 'varseq' and 'cpm' columns are read from it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
dataset_file = '/home/jardic/Documents/projects/jk/datasets/datasets_prepped/strc_km.csv'

In [40]:
# Load only the two columns the experiment needs: the sequence and its target.
df = pd.read_csv(dataset_file, usecols=['varseq', 'cpm'])

# Seeded generator so the random splits are the same on every fresh run
# (the model below is already seeded through random_state).
split_rng = np.random.default_rng(1)

for sampled_fraction in fractions:

    # Tag used in the output file names, e.g. 0.001 -> '0_001'
    fraction_tag = str(sampled_fraction).replace('.', '_')

    # Create a random split of defined size.
    # A single permutation is sliced into two consecutive chunks, so the
    # train and test indexes are disjoint and contain no repeated rows
    # (np.random.choice samples WITH replacement by default, which
    # duplicated rows and shrank the effective sample).
    n_sampled = int(len(df) * sampled_fraction)
    shuffled_index = split_rng.permutation(df.index.to_numpy())
    trn_idxs = shuffled_index[:n_sampled]
    tst_idxs = shuffled_index[n_sampled:2 * n_sampled]

    # Save the train and test indexes
    splits = {'trn': trn_idxs, 'tst': tst_idxs}

    # Use this to load splits, for reproducibility
    with open('splits_' + fraction_tag + '.pkl', mode='wb') as sf:
        pkl.dump(splits, sf)

    # Use splits indexes to make X_trn, y_trn, X_tst, y_tst
    df_trn = df.loc[splits['trn']]
    df_tst = df.loc[splits['tst']]

    # One row per sequence, 4 one-hot columns per base
    # (assumes all sequences have the same length -- TODO confirm).
    X_trn = np.array([hot1Encode(s) for s in df_trn['varseq']])
    y_trn = df_trn['cpm']
    X_tst = np.array([hot1Encode(s) for s in df_tst['varseq']])
    y_tst = df_tst['cpm']

    # Train a simple MLP on the random split
    model = MLPRegressor(
        random_state=1,
        hidden_layer_sizes=(100, 100, 100, 100),
        activation='relu',
        batch_size=100,
        learning_rate_init=0.0001,
        early_stopping=True,
        validation_fraction=0.1,
        max_iter=40,
        learning_rate='constant',
        verbose=True,
    )

    model.fit(X_trn, y_trn)

    predictions = model.predict(X_tst)

    # Persist observed targets next to the predictions for later evaluation.
    # pickle.dump returns None, so its result must not be assigned to `splits`.
    with open('observed_and_predicted_' + fraction_tag + '.pkl', mode='wb') as rf:
        pkl.dump([y_tst, predictions], rf)

Iteration 1, loss = 0.49570653
Validation score: 0.003485
Iteration 2, loss = 0.48435429
Validation score: 0.008464
Iteration 3, loss = 0.48006321
Validation score: 0.011853
Iteration 4, loss = 0.47566463
Validation score: 0.015684
Iteration 5, loss = 0.47104573
Validation score: 0.023304
Iteration 6, loss = 0.46671326
Validation score: 0.028094
Iteration 7, loss = 0.46178371
Validation score: 0.035437
Iteration 8, loss = 0.45550774
Validation score: 0.043795
Iteration 9, loss = 0.44684744
Validation score: 0.057007
Iteration 10, loss = 0.43724324
Validation score: 0.069693
Iteration 11, loss = 0.42703060
Validation score: 0.086755
Iteration 12, loss = 0.41473367
Validation score: 0.100367
Iteration 13, loss = 0.40067357
Validation score: 0.115057
Iteration 14, loss = 0.38964843
Validation score: 0.123076
Iteration 15, loss = 0.37474503
Validation score: 0.147261
Iteration 16, loss = 0.35846840
Validation score: 0.168709
Iteration 17, loss = 0.34583542
Validation score: 0.164895
Iterat



Iteration 1, loss = 0.37912979
Validation score: 0.002733
Iteration 2, loss = 0.37280742
Validation score: 0.007684
Iteration 3, loss = 0.36915419
Validation score: 0.013345
Iteration 4, loss = 0.36437432
Validation score: 0.020474
Iteration 5, loss = 0.35809190
Validation score: 0.028702
Iteration 6, loss = 0.35011949
Validation score: 0.039789
Iteration 7, loss = 0.34129806
Validation score: 0.050745
Iteration 8, loss = 0.33166157
Validation score: 0.050500
Iteration 9, loss = 0.32362403
Validation score: 0.060377
Iteration 10, loss = 0.31505722
Validation score: 0.059767
Iteration 11, loss = 0.30418880
Validation score: 0.075083
Iteration 12, loss = 0.29352258
Validation score: 0.079990
Iteration 13, loss = 0.28347046
Validation score: 0.064613
Iteration 14, loss = 0.27548956
Validation score: 0.076624
Iteration 15, loss = 0.25830989
Validation score: 0.053488
Iteration 16, loss = 0.24918626
Validation score: 0.084039
Iteration 17, loss = 0.23242302
Validation score: 0.060745
Iterat



Iteration 1, loss = 0.70161836
Validation score: 0.282722
Iteration 2, loss = 0.53369057
Validation score: 0.459235
Iteration 3, loss = 0.43219214
Validation score: 0.573674
Iteration 4, loss = 0.37531558
Validation score: 0.655672
Iteration 5, loss = 0.32394043
Validation score: 0.195845
Iteration 6, loss = 0.28079553
Validation score: 0.706330
Iteration 7, loss = 0.23878943
Validation score: 0.783459
Iteration 8, loss = 0.22151180
Validation score: 0.791190
Iteration 9, loss = 0.18448114
Validation score: 0.702182
Iteration 10, loss = 0.17858753
Validation score: 0.815262
Iteration 11, loss = 0.15433560
Validation score: 0.829771
Iteration 12, loss = 0.15433215
Validation score: 0.864607
Iteration 13, loss = 0.13074988
Validation score: 0.834412
Iteration 14, loss = 0.13286680
Validation score: 0.801182
Iteration 15, loss = 0.11344646
Validation score: 0.849172
Iteration 16, loss = 0.11069027
Validation score: 0.847407
Iteration 17, loss = 0.09717716
Validation score: 0.854562
Iterat

