In [11]:
# Some imports. Not all of them are used in this workbook, but subsequent workbooks will use them.
import os
import time

import numpy as np
import pandas as pd

from toolz import pipe as p
import tensorflow as tf


In [12]:
import model_runs
import models as models
import utils as u

from importlib import reload
# Re-import the project modules so that edits made to their source files are
# picked up without restarting the kernel. The last call is the cell's final
# expression, so its return value (the reloaded module) is shown as output.
reload(u)
reload(model_runs)
reload(models)

<module 'models' from 'C:\\Users\\jod204\\projects\\jo\\samplesize_test\\models.py'>

# Settings and Helpers

In [26]:
# Global settings shared by data preparation and by every model built below.
max_features = 20000  # Maximum number of words we want to include in our dictionary
maxlen = 400  # Number of words per text we want to create a sequence with
embed_size = 50  # Size of the word-to-vec embedding we are using


def get_log_dir(iteration, samples_per_class):
    """Return the output directory for one run.

    The path has the form ``output/<samples_per_class>/<iteration>/`` and
    always ends with a forward slash. Nothing is created on disk here.
    """
    return "output/{}/{}/".format(samples_per_class, iteration)


def save_series_gen(log_dir, prefix):
    """Build a writer that saves series as ``<prefix>_<name>.csv`` in ``log_dir``.

    Args:
        log_dir: Directory the CSV files are written to. A trailing path
            separator is optional.
        prefix: Text placed before the underscore in every file name.

    Returns:
        A function ``save_series(series, name)`` that calls ``series.to_csv``
        on the target path with neither index nor header, and returns None.
    """
    def save_series(series, name):
        # os.path.join avoids the doubled separator that plain string
        # concatenation produces when log_dir already ends with one.
        target = os.path.join(log_dir, f'{prefix}_{name}.csv')
        series.to_csv(target, index = False, header = False)

    return save_series


def log_truths(log_dir, train_y, val_y, test_y):
    """Write the ground-truth labels of each split to ``log_dir``.

    The three series are saved, in the order train, val, test, through a
    writer obtained from ``save_series_gen(log_dir, 'true')``, so the files
    are named ``true_train.csv``, ``true_val.csv`` and ``true_test.csv``.
    """
    write_truth = save_series_gen(log_dir, 'true')

    for split_name, labels in (('train', train_y), ('val', val_y), ('test', test_y)):
        write_truth(labels, split_name)


def log_iteration_preds(log_dir, model_name,
                  pred_train_y, pred_val_y, pred_test_y,
                  one_train_time):
    """Write one model's predictions and its elapsed time to ``log_dir``.

    Four files are produced, each named ``<model_name>_<suffix>.csv`` with the
    suffixes ``pred_train``, ``pred_val``, ``pred_test`` and ``time``. Every
    file is written with ``tofile(..., sep=',')``, i.e. as text with the
    values separated by commas.

    Args:
        log_dir: Directory to write into. A trailing separator is optional.
        model_name: Prefix used for the file names.
        pred_train_y, pred_val_y, pred_test_y: Objects exposing
            ``tofile(path, sep=...)`` (assumed to be numpy arrays; confirm
            against ``model_runs.train_pred``).
        one_train_time: Elapsed time value. It is wrapped in ``np.array`` so
            it can be written the same way as the predictions.
    """
    def save_arr(arr, name):
        # os.path.join avoids the doubled separator that plain string
        # concatenation produces when log_dir already ends with one.
        arr.tofile(os.path.join(log_dir, f'{model_name}_{name}.csv'), sep = ',')

    save_arr(pred_train_y, 'pred_train')
    save_arr(pred_val_y, 'pred_val')
    save_arr(pred_test_y, 'pred_test')
    save_arr(np.array(one_train_time), 'time')


def run_and_log_model(model, model_name, train_X, train_y, val_X, val_y, test_X, iteration, samples_per_class,
                      epochs=50):
    """Train a model, predict on all three splits, and log the results.

    The wall-clock duration of the whole ``model_runs.train_pred`` call is
    measured with ``time.perf_counter``. Whatever that call does (presumably
    both fitting and predicting; confirm in ``model_runs``) is therefore
    included in the logged time. Predictions and the duration are written by
    ``log_iteration_preds`` into the directory given by
    ``get_log_dir(iteration, samples_per_class)``.

    Args:
        model: Model object handed to ``model_runs.train_pred``.
        model_name: Prefix for the log files of this model.
        train_X, train_y, val_X, val_y, test_X: Data forwarded unchanged to
            ``model_runs.train_pred``.
        iteration: Repetition index, used to select the log directory.
        samples_per_class: Sample-size setting, used to select the log directory.
        epochs: Forwarded as ``epochs`` to ``model_runs.train_pred``.
            Defaults to 50, the value that was previously hard-coded.
    """
    started = time.perf_counter()
    pred_train_y, pred_val_y, pred_test_y = model_runs.train_pred(
        model, train_X, train_y, val_X, val_y, test_X, epochs=epochs)
    one_train_time = time.perf_counter() - started

    log_iteration_preds(get_log_dir(iteration, samples_per_class), model_name,
                        pred_train_y, pred_val_y, pred_test_y,
                        one_train_time)


# Model Execution

In [None]:
# Experiment driver: for every sample size, load the data, log the true
# labels, then build, summarise, train and log three models (CNN, LSTM, HAN).
if __name__ == '__main__':
  # Run everything below under the first GPU device scope.
  with tf.device('/device:GPU:0'):
      # The full cleaned dataset is read once and reused for every sample size.
      data = pd.read_csv('./content/pubmed_cr_hep_ctl_abstracts_clean.csv')

      # Sample sizes per class: 500, 1000, 1500, 2000 (the stop value is exclusive).
      for samples_per_class in range(500, 2500, 500):
          # Repetition index; currently a single repetition (i == 0) per sample size.
          for i in range(0, 1):
              # word_index is returned by the loader but not used in this cell.
              train_X, val_X, test_X, train_y, val_y, test_y, word_index = u.load_and_prec(
                  samples_per_class, max_features, maxlen = maxlen, data = data)
            
              # Make sure the output directory for this run exists before writing to it.
              log_dir = get_log_dir(i, samples_per_class)
              os.makedirs(log_dir, exist_ok = True)
            
              log_truths(log_dir, train_y, val_y, test_y)
              
              # Second dimension of the test matrix, printed and then passed to the CNN builder.
              print(test_X.shape[1])
              sequence_len = test_X.shape[1]
                
              # Helper that binds the current data splits and run identifiers so each
              # model below only has to supply itself and a name. It is called within
              # the same loop iteration in which it is defined.
              def run_and_log_model_curry(model, model_name):
                run_and_log_model(model, model_name, 
                                  train_X, train_y, val_X, val_y, test_X, 
                                  i, samples_per_class)

              # CNN
              # [3, 4, 5] is presumably the list of convolution filter sizes - confirm in models.cnn_model.
              cnn_model_built = models.cnn_model(sequence_len, [3, 4, 5], maxlen, max_features, embed_size, 
                                                 num_filters=200, drop_rate=0.2)
              cnn_model_built.summary()
              run_and_log_model_curry(cnn_model_built, 'cnn')

              # LSTM
              lstm_model = models.model_lstm_du(maxlen, max_features, embed_size)
              lstm_model.summary()
              run_and_log_model_curry(lstm_model, 'lstm')

              # HAN (built by models.model_lstm_atten)
              han_model = models.model_lstm_atten(maxlen, max_features, embed_size)
              han_model.summary()
              run_and_log_model_curry(han_model, 'han')


1000
500
500
X shape :  (1000,)
y shape :  (1000,)
400
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 400, 50)      1000000     input_2[0][0]                    
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 400, 50, 1)   0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 398, 1, 200)  30200       reshape_3[0][0]                  
______________________________________________________

Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50

Epoch 00043: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 44/50

Epoch 00044: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 45/50

Epoch 00045: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 400, 50)      1000000     input_3[0][0]                    
__________________________________________________________________________________

Train on 603 samples, validate on 67 samples
Epoch 1/50
Epoch 2/50

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 3/50

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 4/50

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 5/50

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 6/50
Epoch 00006: early stopping
2000
1000
1000
X shape :  (2000,)
y shape :  (2000,)
400
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 400, 50)      1000000     input_5[0][0]                    


Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50

Epoch 00040: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 41/50

Epoch 00041: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 42/50

Epoch 00042: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Epoch 00050: ReduceLROnPlateau reducing learning rate to 0.0001.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embeddi


Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 15/50
Epoch 00015: early stopping
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 400)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 400, 50)           1000000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 400, 256)          183296    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 400, 128)          164352    
_________________________________________________________________
attention_with_context_2 (At (None, 128)               16640     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
____________________________________

Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 29/50
Epoch 30/50
Epoch 31/50

Epoch 00031: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 32/50

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 33/50
Epoch 34/50
Epoch 35/50

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Par

Train on 1809 samples, validate on 201 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 11/50

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 13/50
Epoch 00013: early stopping
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 400)               0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 400, 50)           1000000   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 400, 256)          183296    
_______