# **I. Dataset Preparation**

These procedures are implemented to prepare the COPIOUS Corpus dataset for relation labeling. The current format of `.txt` and `.ann` files does not support relation annotations.

## **I-A. Mounting the COPIOUS Dataset**


In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem so the dataset paths resolve.
drive.mount('/content/drive')

# Names of the split directories, and the corpus root that contains them.
subfolders = ["train","dev","test"]
folder = "/content/drive/My Drive/CS_198/copious_published/copious_published/"

Mounted at /content/drive


## **I-B. Extracting the Data**

The code below first splits the full raw text of each file into sentences; only those sentences containing entities labeled in the `.ann` file are preserved in a dataframe and exported to an `.xlsx` spreadsheet on Google Drive.

In [None]:
## Single sentence
import os
import pandas as pd
import nltk


# Running list of entity records gathered from every annotation file.
all_entities = list()

# Fetch NLTK's 'punkt_tab' resource before any sentence tokenization runs.
nltk.download('punkt_tab')

# Function to find the sentence containing the entity
def find_sentence(text, start, end):
    """Return the sentence of ``text`` that covers the span ``[start, end)``.

    The sentence is located by character offset rather than by searching for
    the entity string, so an entity whose surface form also occurs earlier in
    the document is attributed to the sentence it was annotated in.

    Args:
        text: Full raw document text.
        start: Character offset where the entity span begins.
        end: Character offset where the entity span ends (exclusive).

    Returns:
        The stripped sentence containing offset ``start``. If no sentence
        covers that offset, the first sentence containing ``text[start:end]``
        is returned instead; if there is none, the string ``"No Sentence"``.
    """
    sentences = nltk.sent_tokenize(text)

    # Re-locate each sentence in the original text, scanning left to right,
    # to recover the character range it occupies.
    cursor = 0
    for sentence in sentences:
        sentence_start = text.find(sentence, cursor)
        if sentence_start == -1:
            # Sentence is not a verbatim slice of the text; offsets cannot be
            # trusted from here on, so fall back to string matching below.
            break
        sentence_end = sentence_start + len(sentence)
        if sentence_start <= start < sentence_end:
            return sentence.strip()
        if sentence_start > start:
            # Already past the requested offset (it lies between sentences).
            break
        cursor = sentence_end

    # Fallback: the original text-based lookup.
    entity_text = text[start:end]
    for sentence in sentences:
        if entity_text in sentence:
            return sentence.strip()
    return "No Sentence"

folder_path = "/content/drive/My Drive/CS_198/copious_published/copious_published/train"

# Walk every .ann file in the folder and pair it with its same-named .txt file.
for file_name in os.listdir(folder_path):
    if not file_name.endswith(".ann"):
        continue

    base_name = file_name[:-len(".ann")]
    ann_file_path = os.path.join(folder_path, file_name)
    txt_file_path = os.path.join(folder_path, base_name + ".txt")

    with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
        text = txt_file.read()

    with open(ann_file_path, 'r', encoding='utf-8') as ann_file:
        annotations = list(ann_file)

    # Records for the entities declared in this one file.
    entities = []
    for annotation in annotations:
        # Only lines whose ID starts with 'T' are parsed as entities.
        if not annotation.startswith('T'):
            continue

        # Line layout: <id> TAB <type> <spans> TAB <text>
        entity_id, entity_info, entity_text = annotation.split('\t')
        entity_type, spans = entity_info.split(' ', 1)

        # An entity may list several "start end" ranges separated by ';'.
        span_ranges = spans.split(';')
        offsets = [tuple(map(int, span_range.split())) for span_range in span_ranges]
        combined_text = " ".join(text[start:end] for start, end in offsets)

        # The sentence is looked up from the first range only.
        first_span_start, first_span_end = offsets[0]

        entities.append({
            "File": base_name,
            "Entity ID": entity_id,
            "Entity Type": entity_type,
            "Spans": spans,
            "Text": combined_text.strip(),
            "Sentence": find_sentence(text, first_span_start, first_span_end),
        })

    # Fold this file's records into the corpus-wide list.
    all_entities.extend(entities)

# One row per entity.
entity_df = pd.DataFrame(all_entities)

# Write the table to Google Drive as an Excel workbook.
output_path = os.path.join("/content/drive/My Drive/CS_198/copious_published/", "sentence_entities_summary.xlsx")
entity_df.to_excel(output_path, index=False)

# Not the last expression of the cell, so this preview is not displayed.
entity_df.head(10)

print(f"Done: {output_path}!")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Done: /content/drive/My Drive/CS_198/copious_published/sentence_entities_summary.xlsx!


## **I-C. Multi-Sentence Chunking**

Given the observations from the `.xlsx` file earlier, we decided to redo the preprocessing: the preceding and following sentences are now included, together with the specific sentence of interest, in one longer sequence so that two entities can be captured within a single sequence.

Note that identical rows are dropped to avoid redundancy.

In [None]:
# Multi Sentence (#CHUNK SIZE 3)
import os
import pandas as pd
import nltk

# Names of the split directories, and the corpus root that contains them.
subfolders = ["train","dev","test"]
folder = "/content/drive/My Drive/CS_198/copious_published/copious_published/"

# Fetch NLTK's 'punkt_tab' resource before any sentence tokenization runs.
nltk.download('punkt_tab')

# Function to split the text into overlapping chunks of consecutive sentences
def chunk_sentences(text, chunk_size=3):
    """Split ``text`` into overlapping chunks of consecutive sentences.

    A window of ``chunk_size`` sentences slides over the tokenized text one
    sentence at a time; each window is joined with single spaces.

    Args:
        text: Raw document text to tokenize into sentences.
        chunk_size: Number of sentences per chunk. Must be at least 1.

    Returns:
        A list of chunk strings. A text with fewer sentences than
        ``chunk_size`` yields one chunk holding all of its sentences, so short
        documents are not lost. A text with no sentences yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return []

    # Too short for a full window: keep the whole text as one chunk rather
    # than returning nothing.
    if len(sentences) <= chunk_size:
        return [" ".join(sentences)]

    last_window_start = len(sentences) - chunk_size
    return [
        " ".join(sentences[i:i + chunk_size])
        for i in range(last_window_start + 1)
    ]


for subfolder in subfolders:
    folder_path = os.path.join(folder, subfolder)

    # Entity records for the current split only; reset on every split.
    all_entities = []

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".ann"):
            continue

        base_name = file_name[:-len(".ann")]
        ann_file_path = os.path.join(folder_path, file_name)
        txt_file_path = os.path.join(folder_path, base_name + ".txt")

        with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
            text = txt_file.read()

        with open(ann_file_path, 'r', encoding='utf-8') as ann_file:
            annotations = list(ann_file)

        # Overlapping windows of 3 sentences over the whole document.
        sentence_chunks = chunk_sentences(text, chunk_size=3)

        entities_per_chunk = []
        for annotation in annotations:
            # Only lines whose ID starts with 'T' are parsed as entities.
            if not annotation.startswith('T'):
                continue

            # Line layout: <id> TAB <type> <spans> TAB <text>
            entity_id, entity_info, entity_text = annotation.split('\t')
            entity_type, spans = entity_info.split(' ', 1)

            # Rebuild the entity text from its ';'-separated "start end" ranges.
            combined_text = " ".join(
                text[start:end]
                for start, end in (map(int, span_range.split()) for span_range in spans.split(';'))
            )

            # The entity is filed under the first chunk whose text contains it;
            # an entity found in no chunk is left out of the table.
            matching_chunk = next(
                (candidate for candidate in sentence_chunks if combined_text in candidate),
                None,
            )
            if matching_chunk is None:
                continue

            entities_per_chunk.append({
                "File": base_name,
                "Chunk": matching_chunk,
                "Entity ID": entity_id,
                "Entity Type": entity_type,
                "Entity Text": combined_text.strip(),
                "Spans": spans,
            })

        # Fold this file's records into the split-wide list.
        all_entities.extend(entities_per_chunk)

    # Identical rows are removed before export.
    entity_df = pd.DataFrame(all_entities).drop_duplicates()

    # One workbook per split, saved to Drive.
    output_path = os.path.join("/content/drive/My Drive/CS_198/copious_published/", f"{subfolder}_entities_summary.xlsx")
    entity_df.to_excel(output_path, index=False)

    print(f"Chunked entity annotations exported to {output_path}!")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Chunked entity annotations exported to /content/drive/My Drive/CS_198/copious_published/train_entities_summary.xlsx!
Chunked entity annotations exported to /content/drive/My Drive/CS_198/copious_published/dev_entities_summary.xlsx!
Chunked entity annotations exported to /content/drive/My Drive/CS_198/copious_published/test_entities_summary.xlsx!


## **I-D. Assessing the number of samples available**

These summaries reveal that the entity types themselves are imbalanced, which might further affect the results later.

In [None]:
import os
import pandas as pd

# Folder holding the per-split entity workbooks, and the split names.
folder = "/content/drive/My Drive/CS_198/copious_published/"
subfolders = ["train", "dev", "test"]

# Per-split count tables are collected here and concatenated once after the
# loop, instead of re-copying a growing DataFrame on every iteration.
per_split_counts = []

for subfolder in subfolders:
    # Load the entity table exported for this split.
    entity_file_path = os.path.join(folder, f"{subfolder}_entities_summary.xlsx")
    entity_df = pd.read_excel(entity_file_path)

    # Count how many rows carry each entity type.
    entity_type_counts = entity_df["Entity Type"].value_counts().reset_index()
    entity_type_counts.columns = ["Entity Type", "Count"]
    entity_type_counts["Subfolder"] = subfolder

    per_split_counts.append(entity_type_counts)

# Long-format table: one row per (entity type, split).
final_summary = pd.concat(per_split_counts, ignore_index=True)

# Wide-format table: entity types as rows, splits as columns; a type that is
# absent from a split gets 0.
pivot_summary = final_summary.pivot(index="Entity Type", columns="Subfolder", values="Count").fillna(0)

pivot_output_path = os.path.join(folder, "entity_type_occurrences_summary.xlsx")
pivot_summary.to_excel(pivot_output_path)

print(pivot_summary)

print(f"Entity type occurrences summary exported to {pivot_output_path}!")

Subfolder              dev  test  train
Entity Type                            
GeographicalLocation   992   871   7883
Habitat                 89   153   1258
Person                 180   230   2413
Taxon                 1546  1320   9319
TemporalExpression     157   251   1737
Entity type occurrences summary exported to /content/drive/My Drive/CS_198/copious_published/entity_type_occurrences_summary.xlsx!


## **I-E. Relation Pairings**

For this section, the whole dataset is further processed, where rows with the same chunks but varying entity text and types are merged into one row, increasing the total columns of the dataset while decreasing the rows in the process.

This format makes it easier for annotators to read each chunk of text, as the pair of entities occurring in the chunk is easily identifiable, which should make the annotation process more convenient.

The final resulting dataframes are then exported to their corresponding `.xlsx` files.

In [None]:
folder = "/content/drive/My Drive/CS_198/copious_published/"
subfolders = ["train","dev","test"]

# Desired pairings: a GeographicalLocation as "Entity 1" combined with a Taxon
# or a Habitat as "Entity 2".
source_type = "GeographicalLocation"
target_types = ["Taxon", "Habitat"]

# Column layout of the exported sheet. Passing it explicitly keeps the headers
# in place even when a split yields no qualifying pair.
relation_columns = ["File", "Entity 1", "Entity 1 Type", "Entity 2", "Entity 2 Type", "Chunk"]

for subfolder in subfolders:
    # Load the chunked entity table exported for this split.
    annotations_path = os.path.join(folder, f"{subfolder}_entities_summary.xlsx")
    entity_df = pd.read_excel(annotations_path)

    relation_rows = []

    # Entities are paired only within the same file and the same chunk.
    for file_name in entity_df["File"].unique():
        file_entities = entity_df[entity_df["File"] == file_name]

        for chunk, entities_in_chunk in file_entities.groupby("Chunk"):
            entity_types = entities_in_chunk["Entity Type"]
            locations = entities_in_chunk[entity_types == source_type]
            targets = entities_in_chunk[entity_types.isin(target_types)]

            # Pair every location with every target in the chunk, whatever
            # their row order. The two sides have different types, so a row
            # is never paired with itself and no pair is produced twice.
            for _, entity_1 in locations.iterrows():
                for _, entity_2 in targets.iterrows():
                    relation_rows.append({
                        "File": file_name,
                        "Entity 1": entity_1["Entity Text"],
                        "Entity 1 Type": entity_1["Entity Type"],
                        "Entity 2": entity_2["Entity Text"],
                        "Entity 2 Type": entity_2["Entity Type"],
                        "Chunk": chunk
                    })

    # Build the relations table and drop identical rows.
    relations_df = pd.DataFrame(relation_rows, columns=relation_columns)
    relations_df = relations_df.drop_duplicates()

    # One workbook per split, saved to Drive.
    output_path_relations = os.path.join("/content/drive/My Drive/CS_198/copious_published/", f"{subfolder}_relations_summary.xlsx")
    relations_df.to_excel(output_path_relations, index=False)

    print(f"Relations exported to {output_path_relations}!")

Relations exported to /content/drive/My Drive/CS_198/copious_published/train_relations_summary.xlsx!
Relations exported to /content/drive/My Drive/CS_198/copious_published/dev_relations_summary.xlsx!
Relations exported to /content/drive/My Drive/CS_198/copious_published/test_relations_summary.xlsx!


## **I-F. Summary of Entity-Pairs**

The code below simply counts the number of desired entity pairs occurring within each division of the dataset. As one can see, the number of samples seems to be quite small, which might prove problematic during model training and fine-tuning.

In [None]:
import pandas as pd
import os

# Folder holding the per-split relation workbooks, and the split names.
folder = "/content/drive/My Drive/CS_198/copious_published/"
subfolders = ["train", "dev", "test"]

# One summary record per split.
summary_counts = []

for subfolder in subfolders:
    # Load the relation table exported for this split.
    relations_path = os.path.join(folder, f"{subfolder}_relations_summary.xlsx")
    relations_df = pd.read_excel(relations_path)

    # Row masks shared by both counts.
    from_location = relations_df["Entity 1 Type"] == "GeographicalLocation"
    to_taxon = relations_df["Entity 2 Type"] == "Taxon"
    to_habitat = relations_df["Entity 2 Type"] == "Habitat"

    # Number of rows for each desired pairing.
    geolocation_taxon_count = len(relations_df[from_location & to_taxon])
    geolocation_habitat_count = len(relations_df[from_location & to_habitat])

    summary_counts.append({
        "Subfolder": subfolder,
        "Geolocation-Taxon": geolocation_taxon_count,
        "Geolocation-Habitat": geolocation_habitat_count
    })

# Turn the records into a table and save it to Drive as a workbook.
summary_df = pd.DataFrame(summary_counts)
relations_path = os.path.join("/content/drive/My Drive/CS_198/copious_published/", "desired_relations_summary.xlsx")
summary_df.to_excel(relations_path, index=False)

print(summary_df)

  Subfolder  Geolocation-Taxon  Geolocation-Habitat
0     train               2437                  691
1       dev                106                   30
2      test                313                   64


Further discussion of these results is in the Conference Paper Draft; afterwards, the researcher proceeded with relation labeling, since the dataset had been preprocessed into a format that allows annotations.