# Convert LLMs4Subjects dataset to Annif TSV (short-text) document corpus

In [1]:
import json
import os
from glob import glob

# Base directories of the source dataset and of the converted output.
# Both are assumed to be git checkouts located in ~/git under the user's home directory.
# The trailing slash is kept on purpose: the paths are used below as plain string prefixes
# (glob pattern building and source -> destination prefix replacement).
source_base_dir = os.path.expanduser("~/git/llms4subjects/shared-task-datasets/TIBKAT/")
destination_base_dir = os.path.expanduser("~/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/")


def clean_and_combine(input):
    """Normalise whitespace in a text field and merge multi-valued fields.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is dropped, so the result can be
    written safely into one column of one TSV row.

    Args:
        input: Either a single string or a list of strings.

    Returns:
        For a list, the cleaned items joined with " ¤ ". For a string, the
        cleaned string. Any other value is returned unchanged.
    """
    if isinstance(input, list):
        return " ¤ ".join(" ".join(item.split()) for item in input)
    if isinstance(input, str):
        # A plain string needs the same cleaning as list items: an embedded
        # tab or newline would otherwise break the TSV row it ends up in.
        return " ".join(input.split())
    return input


URI_PREFIX = "https://d-nb.info/gnd/"


def format_uris_field(uris):
    """Build the subject column of a TSV row.

    Args:
        uris: Iterable of subject identifier strings such as "gnd:4123456-7".

    Returns:
        The identifiers with every "gnd:" occurrence removed, prefixed with
        URI_PREFIX, wrapped in angle brackets and separated by single spaces.
        An empty iterable yields an empty string.
    """
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is only valid syntax from Python 3.12 onwards.
    return " ".join(f"<{URI_PREFIX}{uri.replace('gnd:', '')}>" for uri in uris)


# Find every .jsonld file below the source base directory and keep the
# distinct directories that contain them, in sorted order.
source_file_paths = glob(f"{source_base_dir}**/*.jsonld", recursive=True)
source_dirs = sorted({os.path.dirname(jsonld_path) for jsonld_path in source_file_paths})

In [2]:
!find ../shared-task-datasets/TIBKAT/ -name "*.tsv" -type f -delete

In [3]:
# Convert every source directory into rows of a TSV file: "<text>\t<subject URIs>".
home_dir = os.path.expanduser("~")
for source_dir in source_dirs:
    # The destination is the mirrored *parent* of the source directory, so sibling
    # source directories end up in the same output file.
    dest_dir = os.path.dirname(source_dir.replace(source_base_dir, destination_base_dir))
    os.makedirs(dest_dir, exist_ok=True)

    dest_filepath = os.path.join(dest_dir, "all.tsv")  # One file for both en and de documents
    print(f"Saving data to file {dest_filepath.replace(home_dir, '<myhome>')}")

    # Append mode: several source directories contribute to the same all.tsv.
    # Old TSV files must therefore be deleted before this cell is run.
    with open(dest_filepath, 'a', encoding='utf-8') as output_file:
        # Only the files directly inside this source directory; sorted so that the
        # output order is reproducible between runs.
        jsonld_paths = sorted(glob(os.path.join(source_dir, "*.jsonld")))
        for source_file_path in jsonld_paths:
            try:
                with open(source_file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)["@graph"]

                # Reset per document so that a record lacking a field cannot
                # silently inherit the value of the previously processed record.
                uris = []
                title = ""
                abstract = ""

                for field in data:
                    # Subject ids come from the 'dcterms:subject' field, which holds
                    # either a single object or a list of objects with an "@id".
                    if "dcterms:subject" in field:
                        subjects = field["dcterms:subject"]
                        if isinstance(subjects, list):
                            uris = [subj["@id"] for subj in subjects]
                        else:
                            uris = [subjects["@id"]]
                    if "title" in field:
                        title = clean_and_combine(field["title"])
                    if "abstract" in field:
                        abstract = clean_and_combine(field["abstract"])

                # Report incomplete records; a row is still written so that the
                # number of output rows matches the number of input files.
                missing = [
                    name
                    for name, value in (("subjects", uris), ("title", title), ("abstract", abstract))
                    if not value
                ]
                if missing:
                    print(f"Warning: {source_file_path} has no {', '.join(missing)}")

                text = title + " ¤ " + abstract
                # TODO Filter by language

                output = text + '\t' + format_uris_field(uris)
                print(output, file=output_file)
            except TypeError as e:
                # Name the offending file before propagating the error.
                print(f"Error processing {source_file_path}: {e}")
                raise

Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Article/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Article/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Book/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Book/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Conference/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Conference/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Report/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/all-subjects/data/dev/Report/all.tsv
Saving data to file <myhome>/git/Annif-LLMs4Subjec

In [4]:
# Show the generated directory tree, limited to 5 levels.
# The path is relative to the current working directory of the notebook.
!tree -L 5 ../shared-task-datasets/TIBKAT/

[01;34m../shared-task-datasets/TIBKAT/[00m
├── [01;34mall-subjects[00m
│   └── [01;34mdata[00m
│       ├── [01;34mdev[00m
│       │   ├── [01;34mArticle[00m
│       │   │   └── all.tsv
│       │   ├── [01;34mBook[00m
│       │   │   └── all.tsv
│       │   ├── [01;34mConference[00m
│       │   │   └── all.tsv
│       │   ├── [01;34mReport[00m
│       │   │   └── all.tsv
│       │   └── [01;34mThesis[00m
│       │       └── all.tsv
│       └── [01;34mtrain[00m
│           ├── [01;34mArticle[00m
│           │   └── all.tsv
│           ├── [01;34mBook[00m
│           │   └── all.tsv
│           ├── [01;34mConference[00m
│           │   └── all.tsv
│           ├── [01;34mReport[00m
│           │   └── all.tsv
│           └── [01;34mThesis[00m
│               └── all.tsv
└── [01;34mtib-core-subjects[00m
    └── [01;34mdata[00m
        ├── [01;34mdev[00m
        │   ├── [01;34mArticle[00m
        │   │   └── all.tsv
        │   ├── [01;34mBook[00m
    

## Check the number of input and output files/documents

In [5]:
# Count the .jsonld input files below the source dataset directory.
!find ~/git/llms4subjects/shared-task-datasets/TIBKAT/ -type f -name "*.jsonld" | wc -l

144485


In [6]:
# Count the rows of all generated all.tsv files; `tail -n1` keeps only the grand total.
# One row is written per input file, so the total should equal the input file count above.
!wc -l ~/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/*/*/*/*/all.tsv | tail -n1

   144485 total
