In [7]:
# ---- user-configurable settings ----

# Training-set size variants to process. The available variants are
# '1_tiny', '2_small', '3_standard' and '4_full'.
sizes = ['4_full']
size_folders = ['size_data_t1/{}'.format(name) for name in sizes]

# Datasets to run.
datasets = ['sst2']

# Number of output classes, looked up by a dataset's position in `datasets`.
num_classes_list = [2, 2, 2, 6, 2]

# Number of augmentations per original sentence, keyed by size folder and
# looked up by dataset position.
n_aug_list_dict = {
    'size_data_t1/1_tiny': [32] * 5,
    'size_data_t1/2_small': [32] * 5,
    'size_data_t1/3_standard': [16, 16, 16, 16, 4],
    'size_data_t1/4_full': [16, 16, 16, 16, 4],
}

# Number of input words, looked up by dataset position.
input_size_list = [50, 50, 40, 25, 25]

# Word-vector dictionary file and the length of each vector.
huge_word2vec = 'word2vec/glove.840B.300d.txt'
word2vec_len = 300

In [9]:
from methods import *


#create the augmented training file and the vocabulary pickle for every
#configured size folder
for size_folder in size_folders:

    dataset_folders = ['/'.join([size_folder, s]) for s in datasets]
    n_aug_list = n_aug_list_dict[size_folder]

    #walk the datasets by position so the matching augmentation count
    #can be looked up in n_aug_list
    for i in range(len(dataset_folders)):

        dataset_folder = dataset_folders[i]
        n_aug = n_aug_list[i]

        #NOTE: every path below is pinned to the sst2 folder;
        #dataset_folder is not used to build any of them
        train_orig = 'sst2/train_orig.txt'      #pre-existing input file
        train_aug_st = 'sst2/train_aug_st.txt'  #file to be created
        word2vec_pickle = 'sst2/word2vec.p'     #vocab pickle to be created

        #standard augmentation: reads train_orig, writes train_aug_st
        gen_standard_aug(train_orig, train_aug_st, n_aug)

        #generate the vocab dictionary for the sst2 folder from huge_word2vec
        gen_vocab_dicts("sst2", word2vec_pickle, huge_word2vec)

finished eda for sst2/train_orig.txt to sst2/train_aug_st.txt
['sst2/test.txt', 'sst2/train_aug_st.txt', 'sst2/train_orig.txt']
33717 unique words found
24731 matches between unique words and word2vec dictionary
dictionaries outputted to sst2/word2vec.p


In [17]:
from numpy.random import seed
#seed numpy's global random generator once up front; the experiment loop
#further down reseeds it before every run
seed(0)

###############################
#### run model and get acc ####
###############################

def run_cnn(train_file, test_file, num_classes, input_size, percent_dataset, word2vec):
    """Train a CNN on train_file and return its accuracy on test_file.

    The model is created by build_cnn and both splits are loaded through
    get_x_y. The training split is loaded with `percent_dataset` as its last
    argument, the test split with 1.

    Args:
        train_file: training data location, handed to get_x_y.
        test_file: test data location, handed to get_x_y.
        num_classes: number of output classes.
        input_size: number of input words per example.
        percent_dataset: last argument of get_x_y for the training split
            (presumably the fraction of the file to use; confirm in methods).
        word2vec: word-vector lookup handed to get_x_y.

    Returns:
        The result of accuracy_score on the test labels and the model's
        predictions, both converted with one_hot_to_categorical.
    """
    #the model is built before any data is loaded
    cnn = build_cnn(input_size, word2vec_len, num_classes)

    #training split honours percent_dataset; test split is always loaded with 1
    x_train, y_train = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset)
    x_test, y_test = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)

    #training options, including the early-stopping callback on val_loss
    fit_options = dict(
        epochs=3,
        callbacks=[EarlyStopping(monitor='val_loss', patience=3)],
        validation_split=0.1,
        batch_size=1024,
        shuffle=True,
        verbose=1,
    )
    cnn.fit(x_train, y_train, **fit_options)

    #score the predictions against the test labels
    predictions = cnn.predict(x_test)
    true_labels = one_hot_to_categorical(y_test)
    predicted_labels = one_hot_to_categorical(predictions)
    acc = accuracy_score(true_labels, predicted_labels)

    #drop the references to the training data and the model, then collect
    del x_train, y_train, cnn
    gc.collect()

    return acc

###############################
### get baseline accuracies ###
###############################

def compute_baselines_aug(writer):
    """Train and evaluate the CNN on the augmented training file.

    For each entry in size_folders, run_cnn is called once per entry in
    `datasets`; the accuracies of that pass are joined with commas, printed,
    and written to `writer` as one line.

    Args:
        writer: object with a write(str) method, e.g. an open text file.
    """
    #one output row per size folder
    for size_folder in size_folders:

        accuracies = []

        #per-dataset settings are looked up by the dataset's position
        for idx, dataset in enumerate(datasets):

            n_classes = num_classes_list[idx]
            seq_len = input_size_list[idx]

            #vocabulary and training file are pinned to the sst2 folder;
            #only the test file follows the dataset name
            embeddings = load_pickle('sst2/word2vec.p')
            accuracy = run_cnn('sst2/train_aug_st.txt', dataset + '/test.txt', n_classes, seq_len, 1, embeddings)
            accuracies.append(str(accuracy))

        row = ','.join(accuracies)
        print(row)
        writer.write(row + '\n')

###############################
############ main #############
###############################



#results file named after the current timestamp; the baseline_aug_cnn
#directory must already exist because open() does not create it.
#the with-block guarantees the file is flushed and closed even if a run
#raises or the kernel is interrupted
with open('baseline_aug_cnn/' + get_now_str() + '.csv', 'w') as writer:

    #ten repetitions, each with numpy's global generator reseeded to the run index
    for i in range(0, 10):

        seed(i)
        print(i)
        compute_baselines_aug(writer)

0
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8359062321326473
1
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8404802744425386
2
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8073184676958262
3
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8359062321326473
4
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8347627215551744
5
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.826186392224128
6
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8324757004002287
7
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8273299028016009
8
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8353344768439108
9
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.8370497427101201


In [None]:
def compute_baselines_orig(writer):
    """Train and evaluate the CNN on the original (un-augmented) training file.

    For each entry in size_folders, run_cnn is called once per entry in
    `datasets`; the accuracies of that pass are joined with commas, printed,
    and written to `writer` as one line.

    Args:
        writer: object with a write(str) method, e.g. an open text file.
    """
    #one output row per size folder
    for size_folder in size_folders:

        performances = []

        #for each configured dataset
        for i, dataset in enumerate(datasets):

            #per-dataset settings, looked up by position
            num_classes = num_classes_list[i]
            input_size = input_size_list[i]
            word2vec_pickle = 'sst2/word2vec.p'
            word2vec = load_pickle(word2vec_pickle)

            #train on 'train_orig.txt', the un-augmented file that the
            #augmentation step reads as its input; no 'train_orig_st.txt'
            #is produced anywhere in this notebook
            train_path = 'sst2/train_orig.txt'
            test_path = dataset + '/test.txt'
            acc = run_cnn(train_path, test_path, num_classes, input_size, 1, word2vec)
            performances.append(str(acc))

        line = ','.join(performances)
        print(line)
        writer.write(line + '\n')

###############################
############ main #############
###############################



#results file named after the current timestamp; the baseline_orig_cnn
#directory must already exist because open() does not create it.
#the with-block guarantees the file is flushed and closed even if a run
#raises or the kernel is interrupted
with open('baseline_orig_cnn/' + get_now_str() + '.csv', 'w') as writer:

    #ten repetitions, each with numpy's global generator reseeded to the run index
    for i in range(0, 10):

        seed(i)
        print(i)
        compute_baselines_orig(writer)