# **Initialization**





In [None]:
# Seed used by set_seed() for the random, NumPy and PyTorch generators and exported as PYTHONHASHSEED.
SEED = 0
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm whether it is still needed.
READ_DATA_ONLINE = False

# 'COLAB' enables the Colab widget manager and the Google Drive mount; any other value uses a local folder.
RUNTIME_TYPE = 'COLAB'
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm whether it is still needed.
EXPERIMENT_NAME = 'test'
# Number of folds; dataset files are looked up with indices 0 .. K_FOLD-1.
K_FOLD = 5
# Upper bound on the number of synonym-substituted copies produced per sample.
REPEAT_TIME = 1 #4

In [None]:
import nltk

In [None]:
# Fetch the WordNet corpus so that nltk.corpus.wordnet lookups in add_synonym() can run.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import os
import random
import torch
import numpy as np
import json
from urllib.request import urlopen
from datetime import datetime
from itertools import chain
from nltk.corpus import wordnet
from os import chdir

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# To assure deterministic results
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here presumably only affects child processes, not the already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
# NOTE(review): presumably the cuBLAS workspace setting PyTorch requires for deterministic
# CUDA matrix operations — confirm against the PyTorch reproducibility notes.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"

In [None]:
# Support for third-party widgets
# The Colab-only modules are imported conditionally so the notebook can still run
# outside Colab; `drive` is used later to mount Google Drive.
if RUNTIME_TYPE == 'COLAB':
    from google.colab import output
    output.enable_custom_widget_manager()
    from google.colab import drive

# **Functions**

In [None]:
def set_seed(seed=None):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed to apply. When omitted, the notebook-level ``SEED``
            constant is used, which keeps the original ``set_seed()`` call
            working unchanged.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current device; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    # cuDNN auto-tuning may select different kernels from run to run, which
    # defeats the deterministic flag above, so it is switched off explicitly.
    torch.backends.cudnn.benchmark = False

In [None]:
def get_heads(text, annots):
    """Collect the elements of ``text`` covered by the given annotations.

    For every annotation, the values at index 2 and index 3 are used as the
    inclusive start and end positions of a slice of ``text``; the slices are
    concatenated in annotation order.

    Args:
        text: Sliceable sequence (the notebook passes a sample's 'sentences').
        annots: Iterable of indexable annotations (the notebook passes 'orl').

    Returns:
        A flat list with the elements of every annotated span.
    """
    return [
        element
        for annotation in annots
        for element in text[annotation[2]:annotation[3] + 1]
    ]

In [None]:
def add_synonym(split_text, split_heads):
    """Create synonym-substituted variants of a tokenized text.

    Every word that is not in ``split_heads`` and is longer than one character
    is looked up in WordNet. The ``rp``-th variant (``rp`` in
    ``range(REPEAT_TIME)``) replaces each such word by its ``rp``-th candidate
    synonym, when one exists. Variants equal to the input or to an earlier
    variant are dropped.

    Args:
        split_text: List of words; it is copied, never modified.
        split_heads: Collection of words that must be kept unchanged.

    Returns:
        A list of new word lists (possibly empty). Texts with one word or
        fewer are never augmented.
    """
    min_text_len = 1  # texts with at most this many words are not augmented
    min_word_len = 1  # words / synonyms with at most this many characters are ignored

    if len(split_text) <= min_text_len:
        return []

    # position in split_text -> ordered list of usable replacement words
    candidates = {}
    for position, word in enumerate(split_text):
        if word in split_heads or len(word) <= min_word_len:
            continue

        # De-duplicate with dict.fromkeys so the order in which WordNet returns
        # the lemmas is kept. A set iterates in string-hash order, which is
        # randomized per interpreter process, so the synonym picked for a given
        # `rp` could change from one kernel session to the next.
        lemmas = dict.fromkeys(
            chain.from_iterable(synset.lemma_names() for synset in wordnet.synsets(word))
        )
        if not lemmas:
            continue

        candidates[position] = [
            lemma
            for lemma in lemmas
            # Skip the word itself, multi-word lemmas (joined with '_'),
            # too-short lemmas and all-uppercase lemmas.
            if lemma != word
            and '_' not in lemma
            and len(lemma) > min_word_len
            and not lemma.isupper()
        ]

    arr_synonym = []
    for rp in range(REPEAT_TIME):
        variant = split_text.copy()
        for position, replacements in candidates.items():
            if rp < len(replacements):
                variant[position] = replacements[rp]

        if variant != split_text and variant not in arr_synonym:
            arr_synonym.append(variant)

    return arr_synonym

# **Read data**

In [None]:
# Seed the random number generators before any data is read or augmented.
set_seed()

In [None]:
# Set destination folder
# All output files are written relative to the working directory chosen here.
DATASET_DIR_NAME = 'augment-chinese-dataset-newV'

if RUNTIME_TYPE == 'COLAB':
    drive.mount('/content/drive')
    # Use an absolute path below the mount point. A path relative to the
    # working directory only resolves correctly the first time this cell runs,
    # because the cell itself changes the working directory.
    output_dir = os.path.join('/content/drive', 'MyDrive', DATASET_DIR_NAME)
else:
    current_dir = os.getcwd()
    if os.path.basename(current_dir) == DATASET_DIR_NAME:
        # The cell was already executed: stay here instead of nesting a second
        # folder with the same name inside the first one.
        output_dir = current_dir
    else:
        output_dir = os.path.join(current_dir, DATASET_DIR_NAME)

# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(output_dir, exist_ok=True)
chdir(output_dir)

Mounted at /content/drive


In [None]:
# Dataset public URL
# Maps each dataset file name ('<fold>.aaai19srl.<split><fold>.conll.json', with
# split in train/dev/test and fold in 0..4) to its Google Drive share link.
data_name_to_google_drive_url = {
    '0.aaai19srl.train0.conll.json': 'https://drive.google.com/file/d/1-_eoQ3uoOBYj8cUum8mNa5wZtl6QBgOi/view?usp=share_link',
    '0.aaai19srl.dev0.conll.json': 'https://drive.google.com/file/d/1-fsiHQGfpScZv6XMAmlDgnvWzAEqZGjv/view?usp=share_link',
    '0.aaai19srl.test0.conll.json': 'https://drive.google.com/file/d/1-gw0li7UF-LxW4XoqrHvjIFmh2-7IqPa/view?usp=share_link',

    '1.aaai19srl.train1.conll.json':'https://drive.google.com/file/d/1-XZOXPKhuhoG3LlSVMNp78aOmYeG9hNa/view?usp=share_link',
    '1.aaai19srl.dev1.conll.json':'https://drive.google.com/file/d/1-e_3D3wSKe5PKXQRimTa99V7S0uI-3cV/view?usp=share_link',
    '1.aaai19srl.test1.conll.json':'https://drive.google.com/file/d/1-dNGqiBPaMIEGP5ga08s2GRfWo99UX5C/view?usp=share_link',

    '2.aaai19srl.train2.conll.json': 'https://drive.google.com/file/d/1-nCWw4by4KmbFuQ51_4Cp7gtd2MOFwcj/view?usp=share_link',
    '2.aaai19srl.dev2.conll.json': 'https://drive.google.com/file/d/1-pKju4UG04NrjTV6sySPWXoNljUbrWcJ/view?usp=share_link',
    '2.aaai19srl.test2.conll.json': 'https://drive.google.com/file/d/1-lA8KFec-ZJVuEVN5Odp7mtC2_DAOy2W/view?usp=share_link',

    '3.aaai19srl.train3.conll.json': 'https://drive.google.com/file/d/1-jcqqZGYMZdUOcKdWqrqt8L0Jfq6Rb-T/view?usp=share_link',
    '3.aaai19srl.dev3.conll.json': 'https://drive.google.com/file/d/1-kKx2l42_AJ09ZMTsNH35Gvn-4cCivh0/view?usp=share_link',
    '3.aaai19srl.test3.conll.json': 'https://drive.google.com/file/d/1-pYnyOfepaq0vkxAA5gr13kgF_Gq-suV/view?usp=share_link',

    '4.aaai19srl.train4.conll.json': 'https://drive.google.com/file/d/1-dpskoeMgcz-s_F1eEiDEvsyfofaRtKK/view?usp=share_link',
    '4.aaai19srl.dev4.conll.json': 'https://drive.google.com/file/d/1-eh7dY7vNY34c_ZSTYNeWUrrI7-gLXbb/view?usp=share_link',
    '4.aaai19srl.test4.conll.json': 'https://drive.google.com/file/d/1-ZzlosdvROC-kqphi44q1loowRi69uSh/view?usp=share_link'
}

# Get direct download link
def get_download_url_from_google_drive_url(google_drive_url):
    """Convert a Google Drive share link into a direct-download URL.

    The file id is taken from the ``/d/<id>`` path segment, or from the ``id``
    query parameter when the path has no such segment. Reading the id by its
    position after splitting on ``/`` silently returns the wrong segment for
    links with extra path components (for example an ``/u/0/`` account prefix).

    Args:
        google_drive_url: Share link such as
            ``https://drive.google.com/file/d/<id>/view?usp=share_link``.

    Returns:
        ``https://drive.google.com/uc?id=<id>&export=download&confirm=t``.

    Raises:
        ValueError: If no file id can be found in ``google_drive_url``.
    """
    import re
    from urllib.parse import parse_qs, urlparse

    parsed_url = urlparse(google_drive_url)
    id_in_path = re.search(r'/d/([^/]+)', parsed_url.path)
    if id_in_path:
        file_id = id_in_path.group(1)
    else:
        ids_in_query = parse_qs(parsed_url.query).get('id')
        if not ids_in_query:
            raise ValueError(f'No Google Drive file id found in URL: {google_drive_url!r}')
        file_id = ids_in_query[0]

    return f'https://drive.google.com/uc?id={file_id}&export=download&confirm=t'

# **Add Synonym(s)**

In [None]:
def _download_json_lines(data_name):
    """Download one dataset file and parse it as one JSON document per line.

    Args:
        data_name: Key of ``data_name_to_google_drive_url``.

    Returns:
        List with the decoded JSON value of every non-blank line.
    """
    google_drive_url = data_name_to_google_drive_url[data_name]
    data_url = get_download_url_from_google_drive_url(google_drive_url)
    # The context manager closes the HTTP response (and its socket) even when
    # parsing fails; leaving it open leaks one connection per download.
    with urlopen(data_url) as response:
        # json.loads accepts the raw bytes directly; blank lines (for example a
        # trailing newline) are skipped instead of raising a decode error.
        return [json.loads(line) for line in response.readlines() if line.strip()]


# Unique training samples over all folds, keyed by str([sentences, orl]).
train_data_all_folds = {}

for k in range(1, K_FOLD+1):
    print('fold: ', k)
    train_data_name = f'{k-1}.aaai19srl.train{k-1}.conll.json'
    dev_data_name = f'{k-1}.aaai19srl.dev{k-1}.conll.json'
    test_data_name = f'{k-1}.aaai19srl.test{k-1}.conll.json'

    # Extract train data samples
    train_data = _download_json_lines(train_data_name)
    print(f'Size of train data: {len(train_data)}')

    # Keep the first occurrence of every distinct (sentences, orl) pair.
    for item in train_data:
        sample_key = str([item['sentences'], item['orl']])
        train_data_all_folds.setdefault(sample_key, item)

    # Extract dev data samples (only their count is reported)
    dev_data = _download_json_lines(dev_data_name)
    print(f'Size of dev data: {len(dev_data)}')

    # Extract test data samples (only their count is reported)
    test_data = _download_json_lines(test_data_name)

    print(f'Size of test data: {len(test_data)}')
    print(f'All: {len(train_data)+len(dev_data)+len(test_data)}\n')

fold:  1
Size of train data: 2449
Size of dev data: 1038
Size of test data: 625
All: 4112

fold:  2
Size of train data: 2455
Size of dev data: 1038
Size of test data: 628
All: 4121

fold:  3
Size of train data: 2462
Size of dev data: 1038
Size of test data: 623
All: 4123

fold:  4
Size of train data: 2553
Size of dev data: 1038
Size of test data: 532
All: 4123

fold:  5
Size of train data: 2396
Size of dev data: 1038
Size of test data: 687
All: 4121



In [None]:
# Number of distinct (sentences, orl) training samples collected across all folds.
print(len(train_data_all_folds))

3104


In [None]:
# For every unique training sample, keep the original record and append up to
# REPEAT_TIME synonym-substituted copies that reuse the original 'orl' annotations.
train_data_after_synonym = {}

for item, record in train_data_all_folds.items():
    text = record['sentences']
    annots = record['orl']
    # Annotated words are handed to add_synonym so that they are left untouched.
    heads = get_heads(text, annots)
    arr_synonym = add_synonym(text, heads)

    sample_key = str([text, annots])
    if sample_key not in train_data_after_synonym:
        train_data_after_synonym[sample_key] = [record]

    # zip with range() stops at whichever is shorter: REPEAT_TIME or the
    # number of variants that were actually produced.
    for _, augmented_text in zip(range(REPEAT_TIME), arr_synonym):
        train_data_after_synonym[sample_key].append({'sentences': augmented_text, 'orl': annots})

# **Save data**

In [None]:
for k in range(1, K_FOLD+1):
    print('\nfold: ', k)
    train_data_name = f'{k-1}.aaai19srl.train{k-1}.conll.json'

    # Download this fold's original train split again so the augmented file
    # keeps the fold's own samples in their original order.
    google_drive_url = data_name_to_google_drive_url[train_data_name]
    data_url = get_download_url_from_google_drive_url(google_drive_url)
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(data_url) as response:
        train_data = [json.loads(line) for line in response.readlines() if line.strip()]
    print(f'Size of trainset: {len(train_data)}')

    # Replace every sample by its original record followed by its augmented copies.
    train_data_new = []
    for item in train_data:
        sample_key = str([item['sentences'], item['orl']])
        train_data_new.extend(train_data_after_synonym[sample_key])
    print(f'Size of new trainset: {len(train_data_new)}')

    # One JSON document per line, written to the current working directory
    # under the same file name as the source split. An empty split yields an
    # empty file rather than a lone blank line.
    with open(train_data_name, 'w', encoding='utf-8') as fp:
        fp.writelines(json.dumps(sample) + '\n' for sample in train_data_new)


fold:  1
Size of trainset: 2449
Size of new trainset: 4707

fold:  2
Size of trainset: 2455
Size of new trainset: 4726

fold:  3
Size of trainset: 2462
Size of new trainset: 4733

fold:  4
Size of trainset: 2553
Size of new trainset: 4909

fold:  5
Size of trainset: 2396
Size of new trainset: 4619
