# **Initialization**





In [None]:
# Dataset identifiers: DATA is the lookup key of the input corpus,
# SAVE_NAME is the stem of the JSON file written at the end of the notebook.
VERSION = 'MPQA2.0_v221219_cleaned'
DATA = f'{VERSION}addedSpanToHead'
SAVE_NAME = f'{DATA}_addedSynonyms'

# Seed handed to every random number generator in set_seed().
SEED = 0

# 'COLAB' switches on the Google Colab specific setup (widgets, Drive mount).
RUNTIME_TYPE = 'COLAB'
EXPERIMENT_NAME = 'test'
# Number of augmentation rounds add_synonym() runs per text.
REPEAT_TIME = 1 #4

In [None]:
import nltk

In [None]:
# Fetch the WordNet corpus; add_synonym() queries it through nltk.corpus.wordnet.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import os
import random
import torch
import numpy as np
import json
from urllib.request import urlopen
from datetime import datetime
from itertools import chain
from nltk.corpus import wordnet
from os import chdir

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Reproducibility-related environment variables.
# NOTE(review): per the Python docs PYTHONHASHSEED is read at interpreter start-up, so
# assigning it here presumably only affects child processes, not the string hashing
# (and therefore set iteration order) of this already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
# NOTE(review): this looks like the cuBLAS workspace setting that PyTorch's reproducibility
# notes recommend for deterministic CUDA kernels — confirm it is set before CUDA initialises.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"

In [None]:
# Colab-only setup: turn on support for third-party widgets and bring the
# Drive helper into scope for the mount performed later in the notebook.
if RUNTIME_TYPE == 'COLAB':
    from google.colab import drive, output

    output.enable_custom_widget_manager()

# **Functions**

In [None]:
def set_seed(seed=None):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed to use. When omitted, the notebook-level ``SEED``
            constant is used, which keeps existing ``set_seed()`` calls working.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner, which
    # may otherwise pick different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
def _collect_clean_heads(annotations, should_stop=None):
    """Walk ``annotations`` from last to first and gather non-blank 'clean_head' values.

    The walk ends at the first empty dict, or as soon as ``should_stop(index)``
    (when given) returns True for a non-empty entry.
    """
    collected = []
    for index in range(len(annotations) - 1, -1, -1):
        annotation = annotations[index]
        if len(annotation.keys()) == 0:
            break
        if should_stop is not None and should_stop(index):
            break
        clean_head = annotation['clean_head']
        if clean_head.replace(' ', '') != '':
            collected.append(clean_head)
    return collected


def get_heads(csds_object, text, head):
    """Gather the head strings attached to one csds object.

    Args:
        csds_object: Dict with 'nested_source', 'target' and 'attitude' lists of
            dicts; non-empty entries carry a 'clean_head' string. The
            'nested_source_link' list is read only when the walk reaches the
            first nested source.
        text: Unused; kept so existing callers keep working.
        head: Head of the object itself; added first unless it is blank.

    Returns:
        List of heads in this order: ``head``, nested sources, targets,
        attitudes. Each group is listed from its last entry to its first, and
        entries whose head is empty or only spaces are skipped.
    """
    heads = []
    if head.replace(' ', '') != '':
        heads.append(head)

    def _first_link_is_agent_w(index):
        # Only the first nested source is checked against its link; a link whose
        # second '&&'-separated field is 'agent-w' ends the walk before that
        # source is added (presumably the implicit writer — TODO confirm).
        if index != 0:
            return False
        link = csds_object['nested_source_link'][index]
        return link.split('&&')[1] == 'agent-w'

    heads.extend(_collect_clean_heads(csds_object['nested_source'], _first_link_is_agent_w))
    heads.extend(_collect_clean_heads(csds_object['target']))
    heads.extend(_collect_clean_heads(csds_object['attitude']))

    return heads

In [None]:
def add_synonym(clean_text, heads, repeat_time=None):
    """Build augmented copies of ``clean_text`` by swapping non-head words for WordNet synonyms.

    Args:
        clean_text: Text to augment; it is tokenised by splitting on single spaces.
        heads: Head strings whose words must never be replaced.
        repeat_time: Number of augmentation rounds. Round ``r`` replaces every
            eligible word with its ``r``-th synonym candidate, when it has one.
            Defaults to the notebook-level ``REPEAT_TIME`` constant.

    Returns:
        List of distinct augmented texts, each different from ``clean_text``.
        Empty when the text has a single token or nothing could be replaced.
    """
    min_text_tokens = 1  # the text needs more tokens than this to be augmented
    min_word_chars = 1   # words and synonyms must be longer than this

    tokens = clean_text.split(' ')
    if len(tokens) <= min_text_tokens:
        return []

    if repeat_time is None:
        repeat_time = REPEAT_TIME

    # Every individual word that occurs in any head is protected from replacement.
    protected_words = set(' '.join(heads).split(' '))

    # token position -> ordered synonym candidates
    candidates = {}
    for position, token in enumerate(tokens):
        if token in protected_words or len(token) <= min_word_chars:
            continue
        # De-duplicate while keeping the order WordNet returns: collecting the
        # lemmas in a set made the chosen synonym depend on hash iteration
        # order, so the output was not reproducible between runs.
        lemma_names = dict.fromkeys(
            chain.from_iterable(synset.lemma_names() for synset in wordnet.synsets(token))
        )
        if not lemma_names:
            continue
        candidates[position] = [
            name for name in lemma_names
            if name != token
            and '_' not in name               # skip multi-word lemmas
            and len(name) > min_word_chars
            and not name.isupper()            # skip all-uppercase lemmas
        ]

    augmented_texts = []
    for round_index in range(repeat_time):
        variant = list(tokens)
        for position, options in candidates.items():
            if round_index < len(options):
                variant[position] = options[round_index]

        if variant != tokens:
            sentence = ' '.join(variant)
            if sentence not in augmented_texts:
                augmented_texts.append(sentence)

    return augmented_texts

# **Read data**

In [None]:
# Seed the Python, NumPy and PyTorch random number generators before the data is processed.
set_seed()

In [None]:
# Set destination folder: create it if needed and make it the working directory.
OUTPUT_DIR_NAME = 'new-csds-newV'

if RUNTIME_TYPE == 'COLAB':
    drive.mount('/content/drive')
    # Absolute path under the mount point above, so re-running this cell
    # cannot create and enter a nested relative 'drive/MyDrive/...' folder.
    output_dir = os.path.join('/content/drive/MyDrive', OUTPUT_DIR_NAME)
else:
    current_dir = os.getcwd()
    # Already inside the destination folder (cell re-run): stay where we are.
    if os.path.basename(current_dir) == OUTPUT_DIR_NAME:
        output_dir = current_dir
    else:
        output_dir = os.path.join(current_dir, OUTPUT_DIR_NAME)

os.makedirs(output_dir, exist_ok=True)
chdir(output_dir)

Mounted at /content/drive


In [None]:
# Maps a data set name (the value of DATA) to the Google Drive share link of its JSON file.
data_name_to_google_drive_data_url = {
    'MPQA2.0_v221219_cleanedaddedSpanToHead': 'https://drive.google.com/file/d/1cWzWDNScc1QOCH1ojJaY0wW4oPzVKSn1/view?usp=share_link'
}

# Get direct download link
def get_download_url_from_google_drive_url(google_drive_url):
    """Turn a Google Drive share link into a direct-download URL.

    The file id is taken as the sixth '/'-separated piece of the link, which is
    where it sits in links of the form
    ``https://drive.google.com/file/d/<id>/view?...``.
    """
    url_pieces = google_drive_url.split("/")
    file_id = url_pieces[5]
    return f'https://drive.google.com/uc?id={file_id}&export=download&confirm=t'

In [None]:
# Read MPQA data: resolve the direct-download URL for DATA and parse the JSON payload.
google_drive_data_url = data_name_to_google_drive_data_url[DATA]
data_url = get_download_url_from_google_drive_url(google_drive_data_url)
# The context manager closes the HTTP response once the payload has been read;
# previously the connection was left open.
with urlopen(data_url) as response:
    csds_collection = json.loads(response.read())
csds_objects = csds_collection['csds_objects']

In [None]:
# Start the output collection with the top-level fields copied over unchanged.
new_data = {
    field: csds_collection[field]
    for field in ('corpus_name', 'agent_objects', 'target_objects')
}

# **Add Synonym(s)**

In [None]:
# For every csds object, gather the heads that must stay untouched (its own clean
# head plus those of its nested sources, targets and attitudes) and store the
# synonym-augmented texts under the 'synonyms' key.
new_csds_objects = []
counter = 0  # objects whose clean text is empty or only spaces

for csds_object in csds_objects:
    text = csds_object['clean_text']
    head = csds_object['clean_head']

    if text.replace(' ', '') == '':
        # Nothing to augment.
        csds_object['synonyms'] = []
        counter += 1
    else:
        heads = get_heads(csds_object, text, head)
        csds_object['synonyms'] = add_synonym(text, heads)

    # Add to new dictionary
    new_csds_objects.append(csds_object)

print(counter)

52


In [None]:
# Save new csds objects
new_data['csds_objects'] = new_csds_objects

# The former `del new_csds_objects, csds_objects` was removed: the first list is
# still referenced by new_data (and the second by the loaded collection), so the
# del released no memory, while it made the augmentation cell fail with a
# NameError when re-run.

# **Save data**

In [None]:
# Write the augmented collection as indented UTF-8 JSON, keeping non-ASCII characters readable.
output_path = SAVE_NAME + '.json'
with open(output_path, mode='w', encoding='utf-8') as output_file:
    json.dump(new_data, output_file, ensure_ascii=False, indent=4)