In [1]:
from methods import *
import os, shutil

In [2]:
# Experiment configuration.
# Expected layout on disk: percent_data/<percent>/<dataset>/train_orig.txt


def percent_folder(size):
    """Return the data folder for a fractional training-set size.

    `size` is a fraction such as 0.05; the folder name uses the whole
    percentage, e.g. 0.05 -> 'percent_data/5'.

    The product is rounded before conversion because float products such as
    0.29 * 100 evaluate to 28.999999999999996, which a bare int() would
    truncate to 28.
    """
    return 'percent_data/' + str(int(round(size * 100)))


# fractions of the training set to evaluate
sizes = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# one data folder per fraction
folders_list = [percent_folder(size) for size in sizes]

# datasets; num_classes_list and input_size_list below are indexed in
# parallel with this list
datasets = ['sst2', "trec"]

# number of output classes
num_classes_list = [2, 6]

# number of augmentations per original sentence
n_aug_list_dict = {
    'size_data_f1/1_tiny': [32, 32, 32, 32, 32],
    'size_data_f1/2_small': [32, 32, 32, 32, 32],
    'size_data_f1/3_standard': [16, 16, 16, 16, 4],
    'size_data_f1/4_full': [16, 16, 16, 16, 4],
}

# make sure the size_data_f1 directory exists (no-op when it is already there)
os.makedirs('size_data_f1', exist_ok=True)

# number of words for input
input_size_list = [50, 25]

# word2vec dictionary
huge_word2vec = 'word2vec/glove.840B.300d.txt'
word2vec_len = 300

In [5]:
def run_cnn(train_file, test_file, num_classes, percent_dataset):
    """Build a CNN, fit it on `train_file` and return its accuracy on `test_file`.

    Besides its arguments, this function reads `input_size`, `word2vec` and
    `word2vec_len` from the enclosing (notebook-level) scope, so all three
    must be assigned before it is called.

    Args:
        train_file: path handed to get_x_y for the training data.
        test_file: path handed to get_x_y for the test data.
        num_classes: number of output classes.
        percent_dataset: forwarded to get_x_y for the training file only;
            the test file is always loaded with 1.

    Returns:
        The accuracy_score of the model's predictions on the test data.
    """
    # build the network first, then load the data
    cnn = build_cnn(input_size, word2vec_len, num_classes)

    x_train, y_train = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset)
    x_test, y_test = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)

    # training setup: early stopping on val_loss, 10% of the training data
    # held out for validation
    fit_options = dict(
        epochs=10,
        callbacks=[EarlyStopping(monitor='val_loss', patience=3)],
        validation_split=0.1,
        batch_size=1024,
        shuffle=True,
        verbose=0,
    )
    cnn.fit(x_train, y_train, **fit_options)

    # compare predicted and true labels after converting both from one-hot
    predictions = cnn.predict(x_test)
    true_labels = one_hot_to_categorical(y_test)
    predicted_labels = one_hot_to_categorical(predictions)
    acc = accuracy_score(true_labels, predicted_labels)

    # drop the references to the data and the model, then force a collection
    del x_train, y_train, x_test, y_test, cnn
    gc.collect()

    return acc

In [7]:
# Percent-of-data experiment: for every training-set fraction and dataset,
# train the CNN N_TRIALS times on the WEDA-augmented, EDA-augmented and
# un-augmented training files, then log the per-run and mean accuracies.

N_TRIALS = 3
OUTPUT_DIR = 'outputs_percent'

# open() does not create missing directories, so make sure the target exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# a single timestamp so the three result files of one run share a suffix
run_stamp = get_now_str()

# accuracies per size folder; runs of all datasets and trials are pooled
performances_eda = {size_folder: [] for size_folder in folders_list}
performances_weda = {size_folder: [] for size_folder in folders_list}
performances_noaug = {size_folder: [] for size_folder in folders_list}


def eda_train_file(dataset):
    """Return the name of the EDA training file for `dataset`."""
    # NOTE(review): only sst2 uses the name without the tfidf0 suffix -
    # confirm this matches the files on disk.
    if dataset == "sst2":
        return 'train_eda_pct.txt'
    return 'train_eda_pct_tfidf0.txt'


def mean_accuracy(accs):
    """Return the arithmetic mean of `accs`, or nan when the list is empty."""
    if not accs:
        return float('nan')
    return sum(accs) / len(accs)


# the context manager closes all three files even if a training run raises
with open(OUTPUT_DIR + '/cnn_eda_' + run_stamp + '.txt', 'w') as writer_eda, \
        open(OUTPUT_DIR + '/cnn_weda_tfidf_0_' + run_stamp + '.txt', 'w') as writer_weda, \
        open(OUTPUT_DIR + '/cnn_noaug_' + run_stamp + '.txt', 'w') as writer_noaug:

    # (label, writer, results) in the order the methods are run and reported
    methods = [
        ('weda', writer_weda, performances_weda),
        ('eda', writer_eda, performances_eda),
        ('noaug', writer_noaug, performances_noaug),
    ]

    # for each percentage dataset
    for size_folder in folders_list:

        for _, writer, _ in methods:
            writer.write(size_folder + '\n')

        # for each dataset
        for i, dataset in enumerate(datasets):
            dataset_folder = size_folder + '/' + dataset
            num_classes = num_classes_list[i]

            # run_cnn reads input_size and word2vec from notebook scope, so
            # these two assignments must stay at module level
            input_size = input_size_list[i]
            word2vec_pickle = dataset_folder + '/word2vec.p'
            # depends only on the dataset folder, so load it once per dataset
            # instead of once per trial
            word2vec = load_pickle(word2vec_pickle)

            test_path = dataset + '/test.txt'
            train_files = {
                'weda': 'train_weda_pct_tfidf0.txt',
                'eda': eda_train_file(dataset),
                'noaug': 'train_orig.txt',
            }

            for trial in range(N_TRIALS):
                for label, _, results in methods:
                    train_path = dataset_folder + '/' + train_files[label]
                    acc = run_cnn(train_path, test_path, num_classes, percent_dataset=1)
                    print("cnn " + label + " ", acc, dataset_folder)
                    results[size_folder].append(acc)

    # raw accuracies first, then one mean per size folder
    for _, writer, results in methods:
        writer.write(str(results) + '\n')

    for _, writer, results in methods:
        for size_folder in results:
            line = str(size_folder) + ' : ' + str(mean_accuracy(results[size_folder]))
            writer.write(line + '\n')
            print(line)

print(performances_weda)
print(performances_eda)
print(performances_noaug)



cnn weda  0.6375071469411092 percent_data/1/sst2
cnn eda  0.577472841623785 percent_data/1/sst2
cnn noaug  0.5997712978845054 percent_data/1/sst2
cnn weda  0.6489422527158376 percent_data/1/sst2
cnn eda  0.6049170954831332 percent_data/1/sst2
cnn noaug  0.5706117781589479 percent_data/1/sst2
cnn weda  0.616923956546598 percent_data/1/sst2
cnn eda  0.6146369353916524 percent_data/1/sst2
cnn noaug  0.505431675242996 percent_data/1/sst2
cnn weda  0.348 percent_data/1/trec
cnn eda  0.272 percent_data/1/trec
cnn noaug  0.286 percent_data/1/trec
cnn weda  0.234 percent_data/1/trec
cnn eda  0.3 percent_data/1/trec
cnn noaug  0.18 percent_data/1/trec
cnn weda  0.404 percent_data/1/trec
cnn eda  0.364 percent_data/1/trec
cnn noaug  0.18 percent_data/1/trec
cnn weda  0.758147512864494 percent_data/5/sst2
cnn eda  0.7512864493996569 percent_data/5/sst2
cnn noaug  0.725557461406518 percent_data/5/sst2
cnn weda  0.7644368210405946 percent_data/5/sst2
cnn eda  0.7489994282447112 percent_data/5/sst2


cnn weda  0.918 percent_data/90/trec
cnn eda  0.924 percent_data/90/trec
cnn noaug  0.8 percent_data/90/trec
cnn weda  0.914 percent_data/90/trec
cnn eda  0.92 percent_data/90/trec
cnn noaug  0.792 percent_data/90/trec
cnn weda  0.92 percent_data/90/trec
cnn eda  0.926 percent_data/90/trec
cnn noaug  0.8 percent_data/90/trec
cnn weda  0.8433390508862207 percent_data/100/sst2
cnn eda  0.8450543167524299 percent_data/100/sst2
cnn noaug  0.8370497427101201 percent_data/100/sst2
cnn weda  0.8473413379073756 percent_data/100/sst2
cnn eda  0.8399085191538022 percent_data/100/sst2
cnn noaug  0.8319039451114922 percent_data/100/sst2
cnn weda  0.8330474556889651 percent_data/100/sst2
cnn eda  0.8376214979988564 percent_data/100/sst2
cnn noaug  0.8387650085763293 percent_data/100/sst2
cnn weda  0.914 percent_data/100/trec
cnn eda  0.912 percent_data/100/trec
cnn noaug  0.792 percent_data/100/trec
cnn weda  0.924 percent_data/100/trec
cnn eda  0.93 percent_data/100/trec
cnn noaug  0.764 percent_d