## Import Libraries

In [1]:
import os
import torch
from sklearn.utils import shuffle
from torch.nn.utils.rnn import pad_sequence
import random
from collections import Counter
import time
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Global settings: random seed and entity-label mappings (not important for now)

In [2]:
# Seed Python's RNG so any use of `random` in this notebook is repeatable.
random.seed(42)

# Entity label -> integer id; ids follow the order of the labels listed here.
eval_ent_dict = {
    label: idx
    for idx, label in enumerate(
        ("O",
         "Term",
         "Definition",
         "Alias-Term",
         "Referential-Definition",
         "Referential-Term",
         "Qualifier")
    )
}

# Integer id -> entity label, derived from the forward mapping so the two
# dictionaries cannot drift apart.
inv_eval_ent_dict = {idx: label for label, idx in eval_ent_dict.items()}


# NOTE(review): presumably the probabilities of keeping definition / "O"
# examples when sampling; neither is used in the cells visible here — confirm.
keep_def_prob = 1
keep_O_prob = 1

In [None]:
def decode_token(token, tokenizer, lang_model):
    """Decode a single token and normalise it according to the model family.

    The family is detected by substring match on ``lang_model``; the checks
    run in a fixed order because several names contain "bert".

    Args:
        token: Value passed straight to ``tokenizer.decode``.
        tokenizer: Object exposing ``decode(token)`` that returns a string
            (presumably a Hugging Face tokenizer — confirm against caller).
        lang_model: Name of the language model, e.g. a checkpoint identifier.

    Returns:
        The decoded string with model-specific clean-up applied, or ``None``
        when ``lang_model`` matches none of the known families.
    """
    def _drop_wordpiece_prefix(piece):
        # Strip a leading "##" continuation marker if present.
        return piece[2:] if piece[:2] == "##" else piece

    if "roberta" in lang_model:
        decoded = tokenizer.decode(token)
        return decoded.replace(" ", "")

    if "scibert" in lang_model:
        # Lower-cased before the prefix is removed.
        return _drop_wordpiece_prefix(tokenizer.decode(token).lower())

    # Must be tested before the generic "bert" check below, since
    # "albert-base" also contains "bert".
    if "xlnet" in lang_model or "albert-base" in lang_model:
        return tokenizer.decode(token)

    if "bert" in lang_model:
        return _drop_wordpiece_prefix(tokenizer.decode(token))

    # Unknown model family: nothing is decoded.
    return None

## Preprocessing methods

In [3]:
def remove_weird_letters(word, lang_model):
    """Normalise accented letters and/or case depending on the model family.

    * "scibert": lower-case the word, then replace accented letters.
    * "albert":  lower-case the word only.
    * "xlnet":   replace accented letters in both lower and upper case,
                 leaving the case of the word untouched.
    * anything else: the word is returned unchanged.

    Args:
        word: The string to normalise.
        lang_model: Name of the language model; matched by substring.

    Returns:
        The normalised string.
    """
    accent_to_plain = {
        "ö": "o", "é": "e", "ê": "e", "ü": "u",
        "ó": "o", "â": "a", "ä": "a", "à": "a",
        "ç": "c", "ï": "i", "ô": "o", "û": "u",
        "ÿ": "y", "á": "a",
    }

    def _fold_accents(text, include_uppercase):
        # Apply the replacements one after another, in table order.
        for accented, plain in accent_to_plain.items():
            text = text.replace(accented, plain)
            if include_uppercase:
                text = text.replace(accented.upper(), plain.upper())
        return text

    if "scibert" in lang_model:
        # After lower-casing only the lower-case accents can occur.
        return _fold_accents(word.lower(), include_uppercase=False)

    if "albert" in lang_model:
        return word.lower()

    if "xlnet" in lang_model:
        return _fold_accents(word, include_uppercase=True)

    return word


def correct_punctuation(word, lang_model):
    """Replace typographic punctuation with plain equivalents.

    A common set of replacements is applied for every model, followed by at
    most one model-specific replacement chosen by substring match on
    ``lang_model``.

    Args:
        word: The string to clean up.
        lang_model: Name of the language model.

    Returns:
        The string with the replacements applied.
    """
    # Applied in this order for every model.
    # NOTE(review): the (",", ",") pair looks like a no-op as written; the
    # first element may originally have been a look-alike comma — confirm.
    common_replacements = (
        ("“", "\""),
        ("”", "\""),
        ("’", "'"),
        ("‘", "'"),
        (",", ","),
        ("⋅", "*"),
        ("—", "-"),
        ("`", "'"),
    )
    for fancy, plain in common_replacements:
        word = word.replace(fancy, plain)

    # Pick the single model-specific replacement, if any.
    if "scibert" in lang_model:
        model_fix = ("º", "*")
    elif "roberta" in lang_model:
        model_fix = ("º", "")
    elif "xlnet" in lang_model:
        model_fix = ("…", "...")
    else:
        model_fix = None

    if model_fix is not None:
        word = word.replace(*model_fix)

    return word


def remove_greek(word, lang_model, tokenizer):
    """Swap a word for the tokenizer's unknown token if it contains Greek.

    Args:
        word: The string to inspect.
        lang_model: Not used by this function; kept in the signature.
        tokenizer: Object exposing an ``unk_token`` attribute. It is only
            read when a Greek character is actually found.

    Returns:
        ``tokenizer.unk_token`` when ``word`` contains any lower- or
        upper-case letter of the Greek alphabet listed below (or the "∆"
        symbol); otherwise ``word`` unchanged.
    """
    lowercase_greek = 'αβγδεζηθικλμνξοπρςστυφχψω'
    greek_symbols = lowercase_greek + lowercase_greek.upper() + "∆"

    if any(symbol in word for symbol in greek_symbols):
        return tokenizer.unk_token

    return word

## Main processing method

In [6]:
def process_data(path, data_kind="train"):
    """Flatten a directory of token-per-line annotation files into one CSV.

    Each non-blank line is split on whitespace; column 0 is taken as the word
    and column 4 as its tag. Blank (or whitespace-only) lines separate
    sentences. The result is written to ``data/<data_kind>.csv`` with the
    columns ``Sentence #``, ``Word`` and ``Tag`` (plus the default pandas
    index column).

    Sentence numbers are grouping identifiers only: they start at 1, never
    decrease, and may contain gaps because every blank line advances the
    counter.

    Args:
        path: Directory containing the input files. Sub-directories are
            ignored.
        data_kind: Label used to build the output file name.

    Raises:
        IndexError: If a non-blank line has fewer than five columns.
    """
    cols = ["Sentence #", "Word", "Tag"]
    rows = []
    sentence_counter = 1

    # List the directory once, keep regular files only, and sort the names:
    # os.listdir returns entries in arbitrary order, which would make the
    # sentence numbering differ between machines.
    filenames = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )
    number_of_files = len(filenames)

    print("Tracking Progress: Current File Number / Total Files")
    for file_ct, filename in enumerate(filenames):

        print(file_ct+1, "/", number_of_files, end=", ")

        # True while the current sentence has received at least one token
        # since the last boundary.
        sentence_open = False

        with open(os.path.join(path, filename), "r", encoding="utf-8") as file:

            for line_no, line in enumerate(file, start=1):
                tokens = line.split()

                if not tokens:
                    # Blank or whitespace-only line: sentence boundary.
                    sentence_counter += 1
                    sentence_open = False
                    continue

                if len(tokens) < 5:
                    raise IndexError(
                        "{name}, line {no}: expected at least 5 columns, found {n}".format(
                            name=filename, no=line_no, n=len(tokens)))

                word, entity = tokens[0], tokens[4]
                rows.append({"Sentence #": sentence_counter,
                             "Word": word,
                             "Tag": entity})
                sentence_open = True

        # A file that does not end with a blank line must still close its
        # last sentence; otherwise it would share a number with the first
        # sentence of the next file.
        if sentence_open:
            sentence_counter += 1

    # creating dataframe
    df = pd.DataFrame(rows, columns=cols)

    # Writing dataframe to csv (create the output directory on first use)
    output_path = "data/{kind}.csv".format(kind=data_kind)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path)

## Execution

In [8]:
kind = "test" # train / validation / test

# Root of the local DEFT corpus checkout; edit this one line to relocate the data.
DEFT_DATA_ROOT = "C:/Users/Hasan/Desktop/Barcelona_Data/FIB_Courses/HLE/Project/DeftEval2020/code/deft_corpus/data"

# Data kind -> directory holding the files for that split.
DEFT_SPLIT_DIRS = {
    "train": DEFT_DATA_ROOT + "/deft_files/train",
    "validation": DEFT_DATA_ROOT + "/deft_files/dev",
    "test": DEFT_DATA_ROOT + "/test_files/labeled/subtask_2",
}

# Stop here on an unknown kind: continuing would either fail with a NameError
# on `path` or silently reuse a `path` left over from an earlier run.
if kind not in DEFT_SPLIT_DIRS:
    raise ValueError("For the data kind to process, please specify one of train/validation/test")

path = DEFT_SPLIT_DIRS[kind]

process_data(path, data_kind=kind)

Tracking Progress: Current File Number / Total Files
1 / 67, 2 / 67, 3 / 67, 4 / 67, 5 / 67, 6 / 67, 7 / 67, 8 / 67, 9 / 67, 10 / 67, 11 / 67, 12 / 67, 13 / 67, 14 / 67, 15 / 67, 16 / 67, 17 / 67, 18 / 67, 19 / 67, 20 / 67, 21 / 67, 22 / 67, 23 / 67, 24 / 67, 25 / 67, 26 / 67, 27 / 67, 28 / 67, 29 / 67, 30 / 67, 31 / 67, 32 / 67, 33 / 67, 34 / 67, 35 / 67, 36 / 67, 37 / 67, 38 / 67, 39 / 67, 40 / 67, 41 / 67, 42 / 67, 43 / 67, 44 / 67, 45 / 67, 46 / 67, 47 / 67, 48 / 67, 49 / 67, 50 / 67, 51 / 67, 52 / 67, 53 / 67, 54 / 67, 55 / 67, 56 / 67, 57 / 67, 58 / 67, 59 / 67, 60 / 67, 61 / 67, 62 / 67, 63 / 67, 64 / 67, 65 / 67, 66 / 67, 67 / 67, 