In [None]:
# Mounting drive

# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Local run: every input and output file lives in the notebook's working directory
folder_path = "."

In [None]:
# Libraries used
import spacy
import re

In [None]:
# Downloading the Spacy model
# Only the es_core_news_lg package is downloaded; the es_core_news_sm line is kept
# as a commented-out alternative.
# !python -m spacy download es_core_news_sm;
!python -m spacy download es_core_news_lg;

In [None]:
# Loading model
# `nlp` is the pipeline object used by the cells below; the commented-out line
# loads the es_core_news_sm package instead.
# nlp = spacy.load('es_core_news_sm')
nlp = spacy.load('es_core_news_lg')

In [None]:
# Read the three dataset files into memory, one list entry per file line
# (line endings are kept)
with open(f"{folder_path}/esp.train.txt", "r") as train_file:
  train_lines = list(train_file)
with open(f"{folder_path}/esp.testa.txt", "r") as test_a_file:
  test_a_lines = list(test_a_file)
with open(f"{folder_path}/esp.testb.txt", "r") as test_b_file:
  test_b_lines = list(test_b_file)

In [None]:
# Strip out tags from datasets
# See: https://regex101.com/

# Regex pattern: the token is the first run of non-whitespace characters (\S+);
# everything after the single whitespace separator, up to the end of the line, is the tag.
def unTagger(taggedText):
  """Split "<token> <tag>" lines into two parallel lists.

  Args:
    taggedText: iterable of strings, one line each.

  Returns:
    A tuple ``(untaggedText, tag_Text)``: the tokens and their tags, in input
    order and always of equal length. Lines that do not start with a token
    followed by a whitespace character (for example blank lines) are skipped.
  """
  # \S+ rather than \w+ so that tokens mixing letters, digits and punctuation
  # (e.g. "S.A." or "1.000") are kept together with their tags instead of
  # failing to match and being dropped.
  pattern = re.compile(r"^(\S+)\s(.*)")
  untaggedText = []
  tag_Text = []
  for line in taggedText:
    result = pattern.search(line)
    if result: # Avoids non-matches, such as blank lines
      untaggedText.append(result.group(1))
      # "." does not match the newline, so the tag excludes the line ending
      tag_Text.append(result.group(2))

  return untaggedText, tag_Text

# Split every dataset into two parallel lists: the tokens and their gold tags
train_words_unTagged, gold_tags_train = unTagger(train_lines)
test_a_words_unTagged, gold_tags_test_a = unTagger(test_a_lines)
test_b_words_unTagged, gold_tags_test_b = unTagger(test_b_lines)

In [None]:
# Function to convert list of strings into plain text
def convert_strList_to_plain_text(list_of_strings):
    """Join strings into a single space-separated text.

    Args:
        list_of_strings: iterable of ``str`` items.

    Returns:
        The items separated by one space each, with leading and trailing
        whitespace removed; an empty string for empty input.
    """
    # str.join builds the result in one pass instead of growing a string
    # item by item; strip() keeps the original trimming behaviour.
    return " ".join(list_of_strings).strip()

# Join each token list into one space-separated string
plain_text_train = convert_strList_to_plain_text(train_words_unTagged)
plain_text_test_a = convert_strList_to_plain_text(test_a_words_unTagged)
plain_text_test_b = convert_strList_to_plain_text(test_b_words_unTagged)

In [None]:
# Write each plain text to <folder_path>/<name>.txt so it can be inspected by hand
for dump_name, dump_text in (
    ("plain_text_train", plain_text_train),
    ("plain_text_test_a", plain_text_test_a),
    ("plain_text_test_b", plain_text_test_b),
):
  with open(f'{folder_path}/{dump_name}.txt', 'w') as f:
    f.write(dump_text)

In [None]:
# Raise the pipeline's text-length limit to 30 million so the long plain texts are accepted
nlp.max_length = 30_000_000

In [None]:
# Using loaded Spacy model to tag texts
# Each Doc is built from the dataset's own tokens instead of letting spaCy re-tokenize
# the joined plain text: the tokenizer may split a token differently, and since the
# predicted tags are later paired with the gold tags by position, a single extra token
# would shift every following prediction.
def tag_pretokenized(words):
  """Run the loaded pipeline over an already tokenized text.

  Args:
    words: list of token strings, in order.

  Returns:
    A Doc with exactly one token per entry of ``words``, processed by every
    component of ``nlp.pipeline`` in order.
  """
  doc = spacy.tokens.Doc(nlp.vocab, words=words)
  # Apply the components one by one; the tokenizer is deliberately bypassed
  for _, component in nlp.pipeline:
    doc = component(doc)
  return doc

tagged_document_train = tag_pretokenized(train_words_unTagged)
tagged_document_test_a = tag_pretokenized(test_a_words_unTagged)
tagged_document_test_b = tag_pretokenized(test_b_words_unTagged)

In [None]:
# Extracting labels from documents tagged by loaded Spacy model
def spacyTagger(document):
  """Return one IOB label per token of a tagged document.

  Args:
    document: iterable of tokens exposing ``ent_iob_`` and ``ent_type_``.

  Returns:
    A list with exactly one entry per token: ``"B-<type>"`` or ``"I-<type>"``
    when ``ent_iob_`` is "B" or "I", and ``"O"`` for every other token.
  """
  tags = []
  for token in document:
    iob_tag = token.ent_iob_
    if iob_tag in ("B", "I"):
      tags.append(f"{iob_tag}-{token.ent_type_}")
    else:
      # Covers "O" and also an empty ent_iob_ (no entity annotation set).
      # Skipping such a token would make the result shorter than the document
      # and shift every later label.
      tags.append("O")
  return tags

In [None]:
# Collect the model's IOB labels from each tagged document
spacy_tags_train = spacyTagger(tagged_document_train)
spacy_tags_test_a = spacyTagger(tagged_document_test_a)
spacy_tags_test_b = spacyTagger(tagged_document_test_b)

In [None]:
# Combining 3 lists into a single file to feed Conlleval.py
def combine_in_doc(words_unTagged, gold_tags, spacy_tags, doc_name):
  """Write tokens, gold tags and predicted tags side by side to a text file.

  The file ``{folder_path}/{doc_name}.txt`` is (over)written with one line per
  token in the form ``<word> <gold tag> <predicted tag>``.

  Args:
    words_unTagged: list of tokens.
    gold_tags: list of gold tags, parallel to ``words_unTagged``.
    spacy_tags: list of predicted tags, parallel to ``words_unTagged``.
    doc_name: output file name without the ``.txt`` extension.

  Raises:
    ValueError: if the three lists differ in length. Entries are paired by
      position, so unequal lengths mean the columns would not line up.
  """
  # Checked before the file is opened so a mismatch never leaves a partial file
  if not (len(words_unTagged) == len(gold_tags) == len(spacy_tags)):
    raise ValueError(
        f"Cannot combine lists of different lengths for '{doc_name}': "
        f"{len(words_unTagged)} words, {len(gold_tags)} gold tags, "
        f"{len(spacy_tags)} predicted tags"
    )
  with open(f"{folder_path}/{doc_name}.txt", "w") as f:
    for word, gold, pred in zip(words_unTagged, gold_tags, spacy_tags):
      f.write(f"{word} {gold} {pred}\n")

In [None]:
# The .txt files generated are fed into the conlleval.py script which computes the evaluation metrics
# Each file is written to folder_path and holds one "<word> <gold tag> <predicted tag>" line per token
combine_in_doc(train_words_unTagged, gold_tags_train, spacy_tags_train, 'trio_conlleval_train')
combine_in_doc(test_a_words_unTagged, gold_tags_test_a, spacy_tags_test_a, 'trio_conlleval_test_a')
combine_in_doc(test_b_words_unTagged, gold_tags_test_b, spacy_tags_test_b, 'trio_conlleval_test_b')