## Import Libraries

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import nltk
import torch
from collections import Counter
from sklearn.utils import shuffle
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
import random
import time

sys.path.append('eda_nlp/')
from eda_nlp.augment import gen_eda
from eda_nlp.eda import get_only_chars

### Unimportant for now

In [2]:
# Seed Python's built-in RNG so anything drawing from `random` is repeatable.
random.seed(42)

# Entity label -> integer id; the id is simply the position in this list.
eval_ent_dict = {
    label: idx
    for idx, label in enumerate([
        "O",
        "Term",
        "Definition",
        "Alias-Term",
        "Referential-Definition",
        "Referential-Term",
        "Qualifier",
    ])
}

# Reverse lookup (integer id -> entity label), derived from the mapping above.
inv_eval_ent_dict = {idx: label for label, idx in eval_ent_dict.items()}


# NOTE(review): presumably keep-probabilities for definition / "O" items;
# they are not read anywhere in the visible cells — confirm before relying on them.
keep_def_prob = 1
keep_O_prob = 1

In [3]:
def decode_token(token, tokenizer, lang_model):
    """Decode one token with `tokenizer` and clean it up per model family.

    The family is picked by substring match on `lang_model`, checked in this
    order (the order matters because e.g. "roberta" also contains "bert"):

    * "roberta"               -> decoded text with all spaces removed
    * "scibert"               -> lower-cased text, leading "##" stripped
    * "xlnet" / "albert-base" -> decoded text unchanged
    * "bert"                  -> decoded text, leading "##" stripped

    Returns None when `lang_model` matches none of the above.
    """
    def _without_wordpiece_prefix(piece):
        # "##" marks a continuation piece; drop it if present.
        return piece[2:] if piece[:2] == "##" else piece

    if "roberta" in lang_model:
        return tokenizer.decode(token).replace(" ", "")
    if "scibert" in lang_model:
        return _without_wordpiece_prefix(tokenizer.decode(token).lower())
    if "xlnet" in lang_model or "albert-base" in lang_model:
        return tokenizer.decode(token)
    if "bert" in lang_model:
        return _without_wordpiece_prefix(tokenizer.decode(token))
    return None

## Preprocessing methods

In [4]:
def remove_weird_letters(word, lang_model):
    """Normalise accented letters / casing in `word` for the given model family.

    * "scibert": lower-case the word, then replace accented letters with
      their plain counterparts.
    * "albert":  lower-case only.
    * "xlnet":   replace accented letters in both lower and upper case,
      keeping the original casing.
    * anything else: the word is returned unchanged.
    """
    accent_pairs = (("ö", "o"), ("é", "e"), ("ê", "e"), ("ü", "u"),
                    ("ó", "o"), ("â", "a"), ("ä", "a"), ("à", "a"),
                    ("ç", "c"), ("ï", "i"), ("ô", "o"), ("û", "u"),
                    ("ÿ", "y"), ("á", "a"))

    def _plain(text, also_upper):
        # Apply each substitution in table order; optionally the upper-case twin too.
        for accented, plain in accent_pairs:
            text = text.replace(accented, plain)
            if also_upper:
                text = text.replace(accented.upper(), plain.upper())
        return text

    if "scibert" in lang_model:
        return _plain(word.lower(), also_upper=False)
    if "albert" in lang_model:
        return word.lower()
    if "xlnet" in lang_model:
        return _plain(word, also_upper=True)
    return word


def correct_punctuation(word, lang_model):
    """Replace typographic punctuation in `word` with plain equivalents.

    A shared set of substitutions is applied first (curly quotes, dot
    operator, em dash, backtick), followed by one model-specific rule:

    * "scibert": "º" -> "*"
    * "roberta": "º" removed
    * "xlnet":   "…" -> "..."
    """
    # NOTE(review): the ("," -> ",") pair is a no-op as written; possibly a
    # look-alike comma character was intended — confirm against the raw notebook.
    common_swaps = (
        ("“", "\""), ("”", "\""), ("’", "'"), ("‘", "'"),
        (",", ","), ("⋅", "*"), ("—", "-"), ("`", "'"),
    )
    for fancy, plain in common_swaps:
        word = word.replace(fancy, plain)

    if "scibert" in lang_model:
        return word.replace("º", "*")
    if "roberta" in lang_model:
        return word.replace("º", "")
    if "xlnet" in lang_model:
        return word.replace("…", "...")
    return word


def remove_greek(word, lang_model, tokenizer):
    """Return `tokenizer.unk_token` if `word` contains a Greek letter, else `word`.

    The check covers the lower-case Greek alphabet, its upper-cased form and
    the "∆" symbol. `lang_model` is accepted but not used.
    """
    lower_greek = 'αβγδεζηθικλμνξοπρςστυφχψω'
    greek_alphabet = lower_greek + lower_greek.upper() + "∆"

    for letter in greek_alphabet:
        if letter in word:
            # Swap the whole word for the unknown token; scanning continues on the new value.
            word = tokenizer.unk_token

    return word

## Main processing method

In [5]:
def process_data(path, data_kind="train"):
    """Read every file in `path` into one token-level DataFrame.

    Each non-blank line is split on whitespace; field 0 is taken as the word
    and field 4 as its tag. Blank (or whitespace-only) lines separate
    sentences. The frame with columns SentenceNumber / Word / Tag is written
    to ``data/<data_kind>.csv`` and then returned with an additional
    ``EntityNumber`` column.

    Args:
        path: directory whose entries are all opened as UTF-8 text files.
        data_kind: name used for the CSV file written under ``data/``.

    Returns:
        DataFrame with columns SentenceNumber, Word, Tag, EntityNumber.

    Raises:
        IndexError: if a non-blank line has fewer than five fields.
    """
    cols = ["SentenceNumber", "Word", "Tag"]
    rows = []
    sentence_counter = 1

    # List the directory once and sort it: os.listdir order is arbitrary, and
    # sentence numbers depend on the order in which files are read.
    filenames = sorted(os.listdir(path))
    number_of_files = len(filenames)

    print("Tracking Progress: Current File Number / Total Files")
    for file_ct, filename in enumerate(filenames, start=1):

        print(file_ct, "/", number_of_files, end=", ")

        sentence_open = False  # True while the current sentence has tokens but no closing blank line
        with open(os.path.join(path, filename), "r", encoding="utf-8") as file:

            for line in file:
                tokens = line.split()
                if tokens:
                    word, entity = tokens[0], tokens[4]

                    rows.append({"SentenceNumber": sentence_counter,
                                 "Word": word,
                                 "Tag": entity})
                    sentence_open = True
                else:
                    # Whitespace-only lines count as separators too (they used to crash on tokens[0]).
                    sentence_counter += 1
                    sentence_open = False

        # A file that does not end with a blank line must not share its last
        # sentence number with the first sentence of the next file.
        if sentence_open:
            sentence_counter += 1

    # creating dataframe
    df = pd.DataFrame(rows, columns=cols)
    # Writing dataframe to csv (the EntityNumber column is intentionally added afterwards)
    os.makedirs("data", exist_ok=True)
    df.to_csv("data/{kind}.csv".format(kind=data_kind))

    # Adding EntityNumber field - needed for augmentation.
    # A row starts a new entity when its tag differs from the previous row's
    # tag and does not begin with 'I'; every other row inherits the running id.
    # NOTE(review): consecutive rows with the same tag (e.g. 'O' runs that
    # cross a sentence boundary) therefore share one entity number.
    starts_entity = (df['Tag'] != df['Tag'].shift(1)) & (df['Tag'].str[0] != 'I')
    entity_ids = starts_entity.cumsum() - 1
    # Rows before the first entity start (a leading 'I' tag) stay missing, as a forward fill would leave them.
    df['EntityNumber'] = entity_ids.where(entity_ids >= 0)

    return df

## Execution

In [146]:
kind = "test" # train / validation / test

# Machine-specific corpus locations, tried in order.
# NOTE(review): these absolute paths only exist on the authors' machines; adjust for your checkout.
base_path = r"C:\Users\pkozminski\Documents\Studia\deft_corpus\data"
if not os.path.exists(base_path):
    base_path = "C:/Users/Hasan/Desktop/Barcelona_Data/FIB_Courses/HLE/Project/DeftEval2020/code/deft_corpus/data"
if not os.path.exists(base_path):
    raise ValueError("Provide an existing path to the data directory")

# Sub-directory of each split, given as separate components so that
# os.path.join inserts the right separator on every operating system.
kind_to_subdir = {
    "train": ("deft_files", "train"),
    "validation": ("deft_files", "dev"),
    "test": ("test_files", "labeled", "subtask_2"),
}
if kind not in kind_to_subdir:
    # Fail here instead of printing and then hitting a NameError on `path` below.
    raise ValueError("For the data kind to process, please specify one of train/validation/test")
path = os.path.join(base_path, *kind_to_subdir[kind])

df = process_data(path, data_kind=kind)

Tracking Progress: Current File Number / Total Files
1 / 67, 2 / 67, 3 / 67, 4 / 67, 5 / 67, 6 / 67, 7 / 67, 8 / 67, 9 / 67, 10 / 67, 11 / 67, 12 / 67, 13 / 67, 14 / 67, 15 / 67, 16 / 67, 17 / 67, 18 / 67, 19 / 67, 20 / 67, 21 / 67, 22 / 67, 23 / 67, 24 / 67, 25 / 67, 26 / 67, 27 / 67, 28 / 67, 29 / 67, 30 / 67, 31 / 67, 32 / 67, 33 / 67, 34 / 67, 35 / 67, 36 / 67, 37 / 67, 38 / 67, 39 / 67, 40 / 67, 41 / 67, 42 / 67, 43 / 67, 44 / 67, 45 / 67, 46 / 67, 47 / 67, 48 / 67, 49 / 67, 50 / 67, 51 / 67, 52 / 67, 53 / 67, 54 / 67, 55 / 67, 56 / 67, 57 / 67, 58 / 67, 59 / 67, 60 / 67, 61 / 67, 62 / 67, 63 / 67, 64 / 67, 65 / 67, 66 / 67, 67 / 67, 

Term
Definition
Alias-Term
Referential-Definition
Referential-Term
Qualifier

# Data augmentation
The augmentation is performed with the EDA ([Easy Data Augmentation](https://github.com/jasonwei20/eda_nlp)) procedure, using only its synonym-replacement method.

Data transformations are as follows:
- The tokens corresponding to the entities are pasted into one sequence
- Sequences are passed into the augmenting function and replicated several times
- All the replicated versions are later ordered in the same order as at the beginning
- The sequences are tokenized with `word_tokenize` function

A drawback is that the sentences lose their punctuation and capitalization.

#### Main part

In [148]:
# The training split is always loaded here, independently of `kind` above.
# Path components are passed separately so the separator is correct on any OS.
df_train = process_data(os.path.join(base_path, "deft_files", "train"), data_kind='train')

Tracking Progress: Current File Number / Total Files
1 / 80, 2 / 80, 3 / 80, 4 / 80, 5 / 80, 6 / 80, 7 / 80, 8 / 80, 9 / 80, 10 / 80, 11 / 80, 12 / 80, 13 / 80, 14 / 80, 15 / 80, 16 / 80, 17 / 80, 18 / 80, 19 / 80, 20 / 80, 21 / 80, 22 / 80, 23 / 80, 24 / 80, 25 / 80, 26 / 80, 27 / 80, 28 / 80, 29 / 80, 30 / 80, 31 / 80, 32 / 80, 33 / 80, 34 / 80, 35 / 80, 36 / 80, 37 / 80, 38 / 80, 39 / 80, 40 / 80, 41 / 80, 42 / 80, 43 / 80, 44 / 80, 45 / 80, 46 / 80, 47 / 80, 48 / 80, 49 / 80, 50 / 80, 51 / 80, 52 / 80, 53 / 80, 54 / 80, 55 / 80, 56 / 80, 57 / 80, 58 / 80, 59 / 80, 60 / 80, 61 / 80, 62 / 80, 63 / 80, 64 / 80, 65 / 80, 66 / 80, 67 / 80, 68 / 80, 69 / 80, 70 / 80, 71 / 80, 72 / 80, 73 / 80, 74 / 80, 75 / 80, 76 / 80, 77 / 80, 78 / 80, 79 / 80, 80 / 80, 

In [149]:
def set_sentence(s):
    return ''.join(list(map(lambda x: ' ' + x if x[0].isalpha() else x, s)))

def tokenize_sentences_to_df(df):
    """Expand one-row-per-sequence data back into one row per token.

    `df` must provide the columns Sentence, Label, EntityNumber,
    SentenceNumber and Version. Each Sentence is split with `word_tokenize`;
    the first token keeps the row's Label and the remaining tokens get the
    matching inside tag ('B-X' -> 'I-X'). Sequences labelled 'O' stay 'O'
    for every token.

    Returns:
        DataFrame with columns SentenceNumber, Version, Tag, Word,
        EntityNumber and a fresh 0..n-1 index. A sentence that yields no
        tokens contributes no rows.
    """
    columns = ['SentenceNumber', 'Version', 'Tag', 'Word', 'EntityNumber']
    records = []
    for _, row in df.iterrows():
        first_label = row['Label']
        # 'I' + 'O'[1:] would give the invalid tag 'I', so 'O' is handled explicitly.
        inside_label = first_label if first_label == 'O' else 'I' + first_label[1:]
        for position, token in enumerate(word_tokenize(row['Sentence'])):
            records.append({"SentenceNumber": row['SentenceNumber'],
                            "Version": row['Version'],
                            "Tag": first_label if position == 0 else inside_label,
                            "Word": token,
                            "EntityNumber": row['EntityNumber']})
    # Build the frame once instead of concatenating inside the loop (which is quadratic).
    return pd.DataFrame(records, columns=columns)

def prepare_original_dataset_to_add_augmentation(df, version, condition):
    """Return the rows of `df` selected by `condition`, tagged with `version`.

    Args:
        df: source frame; it is left untouched.
        version: value written to the ``Version`` column of the result.
        condition: boolean mask (aligned with `df`) of the rows to keep.

    Returns:
        A new DataFrame holding the selected rows plus a ``Version`` column.
    """
    # .assign works on the selected copy, so the caller's frame no longer
    # gains or overwrites a 'Version' column as a side effect.
    return df.loc[condition].assign(Version=version)

def oversample(input_df, n):
    """
    Duplicate every row of `input_df` `n` times.

    input_df must contain columns EntityNumber, SentenceNumber, Label, and Sentence
    n - number of each sentence duplications

    Returns a DataFrame with columns EntityNumber, SentenceNumber, Label,
    Sentence and Version, where Version runs from 0 to n-1 for the copies of
    each input row. The copies of one row are adjacent and rows keep their
    input order. With n <= 0 or an empty input the result has no rows.
    """
    columns = ['EntityNumber', 'SentenceNumber', 'Label', 'Sentence', 'Version']
    # Collect plain records and build the frame once; concatenating a
    # one-row frame per copy is quadratic in the output size.
    records = [
        {"EntityNumber": row['EntityNumber'],
         "SentenceNumber": row['SentenceNumber'],
         "Label": row['Label'],
         "Sentence": row['Sentence'],
         "Version": copy_idx}
        for _, row in input_df.iterrows()
        for copy_idx in range(n)
    ]
    return pd.DataFrame(records, columns=columns)

def add_data(df,
             label,
             mode = 'augmentation',
             augment_only_this_label = False,
             alpha_sr = 0.5,
             num_adds = 10):
    """
    Build additional training rows for every sentence containing `label`.

    df - token-level frame with columns SentenceNumber, Word, Tag, EntityNumber
    label - entity label without the B-/I- prefix (compared against Tag[2:])
    mode - 'augmentation' (sequences go through gen_eda) or 'oversampling'
           (sequences are duplicated by oversample; this forces
           augment_only_this_label to True)
    augment_only_this_label - if True only sequences of `label` are
           processed; otherwise every sequence whose label is not 'O'
    alpha_sr - forwarded to gen_eda as alpha_sr (augmentation mode only)
    num_adds - forwarded as num_aug (augmentation) or n (oversampling)

    Returns a token-level frame with columns SentenceNumber, Word, Tag,
    EntityNumber, Version, where SentenceNumber is rewritten to
    "<original number>-<version>".

    Raises ValueError for an unknown mode.
    """

    if mode == 'augmentation':
        gen_fun = gen_eda
        gen_fun_args = dict(alpha_sr=alpha_sr, num_aug=num_adds, alpha_ri=0, alpha_rs=0, alpha_rd=0)
    elif mode == 'oversampling':
        gen_fun = oversample
        gen_fun_args = dict(n=num_adds)
        augment_only_this_label = True
    else:
        raise ValueError('Wrong mode')

    # All tokens marked with the label
    df_label = df.loc[df['Tag'].str[2:] == label]

    # All sentences containing those entities
    sentences_with_entity = df.loc[df['SentenceNumber'].isin(df_label['SentenceNumber'])]

    # Sequences squeezed into one row each - preparation for augmenting operations
    all_sentences_entities = sentences_with_entity\
        .groupby('EntityNumber')\
        .agg(Sentence=('Word', set_sentence), Label=('Tag', 'first'), SentenceNumber=('SentenceNumber', 'first'))\
        .reset_index()

    # `condition` marks the sequences of no interest; they will not be augmented
    if augment_only_this_label:
        condition = all_sentences_entities['Label'].str[2:] != label
    else:
        condition = all_sentences_entities['Label'] == 'O'
    sentences_with_entity_to_aug = all_sentences_entities.loc[~condition]

    if mode == 'augmentation':
        # Sentences holding a to-be-augmented sequence that is empty after get_only_chars are skipped.
        sentences_to_drop = sentences_with_entity_to_aug.loc[
            sentences_with_entity_to_aug['Sentence'].apply(get_only_chars).str.len() == 0,
            'SentenceNumber'
        ]
        # Drop those sentences as a whole: filtering only the augmented part
        # would leave their untouched sequences in the output as partial sentences.
        keep = ~all_sentences_entities['SentenceNumber'].isin(sentences_to_drop)
        all_sentences_entities = all_sentences_entities.loc[keep].copy()
        condition = condition.loc[keep]
        sentences_with_entity_to_aug = all_sentences_entities.loc[~condition]

    # Augmentation (the result is expected to carry a 'Version' column, read below)
    all_sentences_augmented = gen_fun(sentences_with_entity_to_aug, **gen_fun_args)

    # List of augmentation indices
    aug_versions = all_sentences_augmented['Version'].unique()

    # The untouched sequences are replicated once per version, then everything
    # is sorted so each version lists its sequences in entity order
    ordered_augmented_sequences = pd.concat(
        [prepare_original_dataset_to_add_augmentation(all_sentences_entities, i, condition) for i in aug_versions]+
        [all_sentences_augmented]
    ).sort_values(['Version', 'EntityNumber'])

    # Coming back to the original (one row per token) shape
    all_sequences_augmented_expanded = tokenize_sentences_to_df(ordered_augmented_sequences)
    all_sequences_augmented_expanded['SentenceNumber'] = (
        all_sequences_augmented_expanded['SentenceNumber'].astype(str)
        + '-'
        + all_sequences_augmented_expanded['Version'].astype(str)
    )

    return all_sequences_augmented_expanded[['SentenceNumber', 'Word', 'Tag', 'EntityNumber', 'Version']]

#### Original data:

In [150]:
# First 15 token rows of original sentence 242.
df_train[df_train['SentenceNumber'].eq(242)].head(15)

Unnamed: 0,SentenceNumber,Word,Tag,EntityNumber
3292,242,They,B-Referential-Term,268
3293,242,are,O,269
3294,242,short,B-Definition,270
3295,242,",",I-Definition,270
3296,242,hair,I-Definition,270
3297,242,-,I-Definition,270
3298,242,like,I-Definition,270
3299,242,structures,I-Definition,270
3300,242,that,I-Definition,270
3301,242,are,I-Definition,270


In [151]:
# Number of entities per type in the original training data (one 'B-' tag per entity).
df_train.loc[df_train['Tag'].str.startswith('B'), 'Tag'].value_counts()

B-Term                      6611
B-Definition                6062
B-Alias-Term                 726
B-Secondary-Definition       479
B-Referential-Definition     308
B-Qualifier                  162
B-Referential-Term           140
B-Definition-frag             85
B-Term-frag                    8
B-Ordered-Term                 5
B-Ordered-Definition           5
B-Alias-Term-frag              3
Name: Tag, dtype: int64

#### Augmented data:

In [152]:
# Default settings: augmentation mode, every non-'O' sequence of the selected sentences is processed.
only_augmented_data = add_data(df=df_train, label='Referential-Term')

In [153]:
# First 15 token rows of version 0 of sentence 477.
only_augmented_data[only_augmented_data['SentenceNumber'].eq("477-0")].head(15)

Unnamed: 0,SentenceNumber,Word,Tag,EntityNumber,Version
69,477-0,these,B-Referential-Term,570.0,0
70,477-0,speck,I-Referential-Term,570.0,0
71,477-0,supporter,B-Secondary-Definition,571.0,0
72,477-0,to,I-Secondary-Definition,571.0,0
73,477-0,diffuse,I-Secondary-Definition,571.0,0
74,477-0,a,I-Secondary-Definition,571.0,0
75,477-0,betoken,I-Secondary-Definition,571.0,0
76,477-0,through,I-Secondary-Definition,571.0,0
77,477-0,the,I-Secondary-Definition,571.0,0
78,477-0,cytol,I-Secondary-Definition,571.0,0


In [154]:
# Entities per type among the generated rows only.
only_augmented_data.loc[only_augmented_data['Tag'].str.startswith('B'), 'Tag'].value_counts()

B-Referential-Term        560
B-Definition              520
B-Term                     64
B-Secondary-Definition     40
B-Alias-Term               16
B-Qualifier                 8
Name: Tag, dtype: int64

####  Augmented data v2 
Only the sequences of interest are augmented. The number of entities of each type does not change compared with the previous approach, but the sequences that are not augmented are left as they are.

In [155]:
# Same call as before, but only sequences labelled 'Referential-Term' are processed.
only_augmented_data_v2 = add_data(
    df=df_train,
    label='Referential-Term',
    augment_only_this_label=True,
)

In [156]:
# First 15 token rows of version 1 of sentence 477.
only_augmented_data_v2[only_augmented_data_v2['SentenceNumber'].eq("477-1")].head(15)

Unnamed: 0,SentenceNumber,Word,Tag,EntityNumber,Version
4123,477-1,these,B-Referential-Term,570.0,1
4124,477-1,atom,I-Referential-Term,570.0,1
4125,477-1,help,B-Secondary-Definition,571.0,1
4126,477-1,to,I-Secondary-Definition,571.0,1
4127,477-1,spread,I-Secondary-Definition,571.0,1
4128,477-1,a,I-Secondary-Definition,571.0,1
4129,477-1,signal,I-Secondary-Definition,571.0,1
4130,477-1,through,I-Secondary-Definition,571.0,1
4131,477-1,the,I-Secondary-Definition,571.0,1
4132,477-1,cytoplasm,I-Secondary-Definition,571.0,1


In [157]:
# Entities per type among the rows generated by the v2 call.
only_augmented_data_v2.loc[only_augmented_data_v2['Tag'].str.startswith('B'), 'Tag'].value_counts()

B-Referential-Term        560
B-Definition              520
B-Term                     64
B-Secondary-Definition     40
B-Alias-Term               16
B-Qualifier                 8
Name: Tag, dtype: int64

### Proposal of the final augmentation

In [158]:
# Baseline entity counts of the original training data, repeated here for comparison.
df_train.loc[df_train['Tag'].str.startswith('B'), 'Tag'].value_counts()

B-Term                      6611
B-Definition                6062
B-Alias-Term                 726
B-Secondary-Definition       479
B-Referential-Definition     308
B-Qualifier                  162
B-Referential-Term           140
B-Definition-frag             85
B-Term-frag                    8
B-Ordered-Term                 5
B-Ordered-Definition           5
B-Alias-Term-frag              3
Name: Tag, dtype: int64

In [159]:
# One add_data call per under-represented label, each with its own settings.
# The calls run in the listed order, exactly as when written out one by one.
augmented_data = [
    add_data(df_train, target_label, augment_only_this_label=True, **extra_kwargs)
    for target_label, extra_kwargs in (
        ('Referential-Term', dict(num_adds=12)),
        ('Qualifier', dict(num_adds=8)),
        ('Referential-Definition', dict(num_adds=2, alpha_sr=0.2)),
    )
]

In [160]:
# Original rows first, followed by the generated rows of each label (row indices are kept as they are).
data_with_augmentation = pd.concat([df_train, *augmented_data])

In [161]:
# Entities per type after adding the augmented rows.
data_with_augmentation.loc[data_with_augmentation['Tag'].str.startswith('B'), 'Tag'].value_counts()

B-Term                      7943
B-Definition                7244
B-Referential-Definition     964
B-Alias-Term                 870
B-Referential-Term           848
B-Qualifier                  840
B-Secondary-Definition       593
B-Definition-frag             89
B-Term-frag                   24
B-Alias-Term-frag             15
B-Ordered-Term                 5
B-Ordered-Definition           5
Name: Tag, dtype: int64

In [162]:
# Persist only the three columns of the training format.
data_with_augmentation.loc[:, ['SentenceNumber', 'Word', 'Tag']].to_csv('data/train_augmentation.csv')

### Oversampling

In [163]:
# (label, number of duplicates) for each label that gets oversampled; calls run in the listed order.
oversampled_data = [
    add_data(df_train, target_label, mode='oversampling', num_adds=n_copies)
    for target_label, n_copies in (
        ('Referential-Term', 9),
        ('Qualifier', 8),
        ('Referential-Definition', 3),
        ('Alias-Term', 1),
    )
]

In [164]:
# Original rows first, followed by the duplicated rows of each label (row indices are kept as they are).
data_with_oversampling = pd.concat([df_train, *oversampled_data])

In [165]:
# Entities per type after adding the oversampled rows.
data_with_oversampling.loc[data_with_oversampling['Tag'].str.startswith('B'), 'Tag'].value_counts()

B-Term                      9651
B-Definition                8665
B-Alias-Term                1700
B-Qualifier                 1534
B-Referential-Term          1420
B-Referential-Definition    1344
B-Secondary-Definition       694
B-Definition-frag             98
B-Term-frag                   41
B-Alias-Term-frag             28
B-Ordered-Term                 5
B-Ordered-Definition           5
Name: Tag, dtype: int64

In [166]:
# Persist only the three columns of the training format.
data_with_oversampling.loc[:, ['SentenceNumber', 'Word', 'Tag']].to_csv('data/train_oversampling.csv')