In [1]:
# Code taken and adapted from: https://github.com/jasonwei20/eda_nlp/tree/master/experiments

In [1]:
from methods import *

In [3]:
# Experiment configuration.
# The per-dataset lists below are index-aligned with `datasets`
# (entry i of each list belongs to datasets[i]).
datasets = ['sst2']
num_classes_list = [2]   # used as `num_classes` for the matching dataset
input_size_list = [50]   # used as `input_size` for the matching dataset
dataset_folders = ['increment_datasets_f2/{}'.format(name) for name in datasets]

# Word-vector settings: source text file and the length of each word vector.
word2vec_len = 300
huge_word2vec = 'word2vec/glove.840B.300d.txt'

# Values meant to be passed as `percent_dataset` when running the model.
increments = [0.7, 0.8, 0.9, 1]

In [5]:
# Folder that holds the input files for this step.
dataset_folder = "sst2"

# train_orig already exists on disk; train_aug_st and word2vec_pickle are
# created by the two calls below.
train_orig, train_aug_st, word2vec_pickle = [
    '/'.join((dataset_folder, file_name))
    for file_name in ('train_orig.txt', 'train_aug_st.txt', 'word2vec.p')
]

# Standard augmentation: takes train_orig as input and train_aug_st as output path.
gen_standard_aug(train_orig, train_aug_st)

# Generate the vocab dictionary. Loading the huge word-vector file every time is
# undesirable, so only the words that are actually used get saved into a
# smaller dictionary at word2vec_pickle.
gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)

finished eda for sst2/train_orig.txt to sst2/train_aug_st.txt
['sst2/test.txt', 'sst2/train_aug_st.txt', 'sst2/train_orig.txt']
30289 unique words found
23189 matches between unique words and word2vec dictionary
dictionaries outputted to sst2/word2vec.p


In [9]:
###############################
#### run model and get acc ####
###############################

def run_model(train_file, test_file, num_classes, percent_dataset):
    """Train a freshly built model on `train_file` and score it on `test_file`.

    Reads the notebook globals `input_size`, `word2vec_len` and `word2vec`,
    which must be defined before this function is called.

    Args:
        train_file: path passed to get_x_y to build the training matrices.
        test_file: path passed to get_x_y to build the test matrices.
        num_classes: forwarded to build_model and get_x_y.
        percent_dataset: forwarded to get_x_y for the training file only;
            the test file is always loaded with 1.

    Returns:
        The result of accuracy_score on the categorical test labels versus
        the categorical predictions.
    """
    # Fresh network for every call.
    net = build_model(input_size, word2vec_len, num_classes)

    # Training and test matrices.
    train_x, train_y = get_x_y(
        train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset
    )
    test_x, test_y = get_x_y(
        test_file, num_classes, word2vec_len, input_size, word2vec, 1
    )

    # Early stopping on the validation loss.
    # NOTE(review): verbose=3 is kept as found; confirm it is a value Keras supports.
    fit_options = dict(
        epochs=20,  # TODO they had 100000
        callbacks=[EarlyStopping(monitor='val_loss', patience=3)],
        validation_split=0.1,
        batch_size=1024,
        shuffle=True,
        verbose=3,
    )

    # Debug preview of the first three training samples and labels.
    for preview in (train_x[:3], train_y[:3]):
        print(preview)

    net.fit(train_x, train_y, **fit_options)

    # Evaluate on the test set (predictions and labels are dumped for debugging).
    y_pred = net.predict(test_x)
    print(y_pred)
    print(test_y)
    acc = accuracy_score(
        one_hot_to_categorical(test_y),
        one_hot_to_categorical(y_pred),
    )

    # Drop the references to the training matrices before forcing a collection.
    train_x = train_y = None
    gc.collect()

    return acc

In [11]:
# Get the accuracy at each increment and write one CSV row per dataset.

orig_accs = {dataset:{} for dataset in datasets}
aug_accs = {dataset:{} for dataset in datasets}

# The results file is opened in a `with` block so it is flushed and closed even
# when a training run raises part-way through (the handle used to stay open).
with open('outputs_f2/' + get_now_str() + '.csv', 'w') as writer:

    #for each dataset
    for i, dataset_folder in enumerate(dataset_folders):

        dataset = datasets[i]
        num_classes = num_classes_list[i]
        # run_model reads `input_size` and `word2vec` as globals, so both have
        # to be (re)assigned here before it is called.
        input_size = input_size_list[i]

        # NOTE(review): these paths are hard-coded to 'sst/' instead of being
        # derived from dataset_folder, while the generation cell writes its
        # files under 'sst2/'. Confirm which folder is intended.
        train_orig = 'sst/train_orig.txt' #dataset_folder + '/train_orig.txt'
        train_aug_st = 'sst/train_aug_st.txt' #dataset_folder + '/train_aug_st.txt'
        test_path = 'sst/test.txt'#dataset_folder + '/test.txt'
        word2vec_pickle = 'sst/word2vec.p' ## dataset_folder + '/word2vec.p' TODO: check this
        word2vec = load_pickle(word2vec_pickle)

        # Only a single fraction is evaluated for now; the loop over
        # `increments` is disabled.
        #for increment in increments:
        increment = 0.7

        #calculate augmented accuracy
        aug_acc = run_model(train_aug_st, test_path, num_classes, increment)
        aug_accs[dataset][increment] = aug_acc

        #calculate original accuracy
        orig_acc = run_model(train_orig, test_path, num_classes, increment)
        orig_accs[dataset][increment] = orig_acc

        print(dataset, increment, orig_acc, aug_acc)
        writer.write(dataset + ',' + str(increment) + ',' + str(orig_acc) + ',' + str(aug_acc) + '\n')

        gc.collect()

print(orig_accs, aug_accs)

[[[ 0.12671    -0.21656001 -0.025641   ...  0.52364999 -0.037669
   -0.43900999]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.014719    0.38536    -0.066643   ...  1.43869996 -0.38578999
    0.38552999]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.089187    0.25792     0.26282001 ...  0.14421    -0.169
    0.26501   ]
  [-0.087595    0.35501999  0.063868   ...  0.03446    -0.15027
    0.40673   ]
  [-0.24931     0.46448001 -0.31274    ... -0.12095     0.03795
    0.0020277 ]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.18732999  0.40595001 -0.51174003 ...  0.16495     0

In [8]:
# NOTE(review): y_pred is only assigned inside run_model(), so it is not defined
# at cell level and this cell raises NameError (see the recorded output below).
print(y_pred)

NameError: name 'y_pred' is not defined

# Custom test

In [None]:
def get_x_and_y(file, num_classes, word2vec_len, input_size, word2vec):
    """Load a `label<TAB>sentence` text file into an x matrix and a one-hot y matrix.

    Each non-blank line is split on a tab: the first field is parsed as an
    integer label and the second field is split on single spaces into words.

    Args:
        file: path of the text file to read.
        num_classes: width of the one-hot label matrix.
        word2vec_len: length of each word vector (last axis of the x matrix).
        input_size: maximum number of words kept per sentence; longer
            sentences are cut off, shorter ones leave trailing zero rows.
        word2vec: mapping from word to its vector; words that are not in
            the mapping leave their row at zero.

    Returns:
        (x_matrix, y_matrix) with shapes
        (num_lines, input_size, word2vec_len) and (num_lines, num_classes),
        where num_lines counts the non-blank lines of the file.

    Raises:
        Whatever np.zeros raises if the x matrix cannot be allocated; the
        requested dimensions are printed first.
    """
    # Read through a context manager so the file handle is always closed.
    with open(file, 'r') as handle:
        # Blank lines carry no label and would crash int(); drop them up front
        # so they do not produce empty rows either.
        train_lines = [line for line in handle if line.strip()]
    # shuffle's return value is not used, so it is expected to reorder in place.
    shuffle(train_lines)
    num_lines = len(train_lines)

    #initialize x and y matrix
    try:
        x_matrix = np.zeros((num_lines, input_size, word2vec_len))
    except Exception:
        # Report the requested size, then propagate instead of continuing with
        # x_matrix = None and failing later with an unrelated error.
        print("Error!", num_lines, input_size, word2vec_len)
        raise
    y_matrix = np.zeros((num_lines, num_classes))

    #insert values
    for i, line in enumerate(train_lines):

        # Strip only the newline: slicing off the last character would eat a
        # real character when the final line has no trailing newline.
        parts = line.rstrip('\n').split('\t')
        label = int(parts[0])
        sentence = parts[1]

        #insert x
        words = sentence.split(' ')[:input_size] #cut off if too long
        for j, word in enumerate(words):
            if word in word2vec:
                x_matrix[i, j, :] = word2vec[word]

        #insert y
        y_matrix[i][label] = 1.0

    return x_matrix, y_matrix

In [None]:
def run_model(train_file, test_file, num_classes, percent_dataset):
    """Train a freshly built model on `train_file` and score it on `test_file`.

    This redefines the run_model from the earlier cell: once this cell has
    run, later calls use this version, which loads data with get_x_and_y and
    trains for at most 2 epochs.

    Reads the notebook globals `input_size`, `word2vec_len` and `word2vec`,
    which must be defined before this function is called.

    Args:
        train_file: path passed to get_x_and_y to build the training matrices.
        test_file: path passed to get_x_and_y to build the test matrices.
        num_classes: forwarded to build_model and get_x_and_y.
        percent_dataset: fraction of the loaded training rows to train on;
            the test file is always used in full.

    Returns:
        The result of accuracy_score on the categorical test labels versus
        the categorical predictions.
    """
    #initialize model
    model = build_model(input_size, word2vec_len, num_classes)

    #load data
    train_x, train_y = get_x_and_y(train_file, num_classes, word2vec_len, input_size, word2vec)
    test_x, test_y = get_x_and_y(test_file, num_classes, word2vec_len, input_size, word2vec)

    # get_x_and_y takes no fraction argument, so percent_dataset used to be
    # silently ignored. It shuffles the lines before building the matrices, so
    # keeping the leading rows selects a shuffled subset of the training data.
    num_kept = int(percent_dataset * len(train_x))
    train_x, train_y = train_x[:num_kept], train_y[:num_kept]

    #implement early stopping
    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

    # Debug preview of the first three training samples and labels.
    print(train_x[:3])
    print(train_y[:3])

    #train model
    # NOTE(review): verbose=3 is kept as found; confirm it is a value Keras supports.
    model.fit(train_x,
              train_y,
              epochs=2, # TODO they had 100000
              callbacks=callbacks,
              validation_split=0.1,
              batch_size=1024,
              shuffle=True,
              verbose=3)

    #evaluate model (predictions and labels are dumped for debugging)
    y_pred = model.predict(test_x)
    print(y_pred)
    print(test_y)
    test_y_cat = one_hot_to_categorical(test_y)
    y_pred_cat = one_hot_to_categorical(y_pred)
    acc = accuracy_score(test_y_cat, y_pred_cat)

    # Drop the references to the training matrices before forcing a collection.
    train_x, train_y = None, None
    gc.collect()

    #return the accuracy
    return acc