# Convert LLMs4Subjects dataset to Annif fulltext corpus

In [1]:
import json
import os
from glob import glob

# Assume the repositories are checked out under ~/git in the home directory of the user
source_base_dir = os.path.expanduser("~/git/llms4subjects/shared-task-datasets/TIBKAT/")
destination_base_dir = os.path.expanduser("~/git/Annif-LLMs4Subjects/shared-task-datasets/TIBKAT/")


def find_record_node(graph):
    """Return the node of a JSON-LD ``@graph`` list that describes the document.

    The last node carrying ``dcterms:subject`` wins; if no node has subjects,
    the last node carrying ``title`` is used instead. Returns ``None`` when
    neither kind of node exists.
    """
    nodes = [node for node in graph if isinstance(node, dict)]
    for key in ("dcterms:subject", "title"):
        for node in reversed(nodes):
            if key in node:
                return node
    return None


def extract_subject_ids(record):
    """Return the ``@id`` of every ``dcterms:subject`` entry of ``record``.

    The field may hold a single object or a list of objects; a missing field
    yields an empty list.
    """
    raw_subjects = record.get("dcterms:subject")
    if raw_subjects is None:
        return []
    entries = raw_subjects if isinstance(raw_subjects, list) else [raw_subjects]
    return [entry["@id"] for entry in entries]


def build_fulltext(record):
    """Join title and abstract of ``record`` into newline-separated text.

    Each field may be a string or a list of strings; missing fields are
    skipped. Raises ``TypeError`` for any other field type.
    """
    lines = []
    for key in ("title", "abstract"):
        value = record.get(key)
        if value is None:
            continue
        if isinstance(value, str):
            lines.append(value)
        elif isinstance(value, (list, tuple)):
            lines.extend(value)  # TODO Detect language of abstract
        else:
            raise TypeError(f"unexpected type for {key!r}: {type(value).__name__}")
    return "\n".join(lines)


def convert_record(source_file_path, source_dir, destination_dir):
    """Convert one ``.jsonld`` file into a ``.txt`` / ``.tsv`` pair.

    The output files mirror the relative location of the source file below
    ``source_dir``. Both outputs are fully built in memory before anything is
    written, so a malformed record does not leave partial files behind.
    """
    with open(source_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    record = find_record_node(data["@graph"])
    if record is None:
        raise ValueError("no node with 'dcterms:subject' or 'title' in @graph")

    subjects = extract_subject_ids(record)
    if not subjects:
        print(f"Warning: no subject ids in {source_file_path}")
    tsv_content = "".join("<" + subject + ">\n" for subject in subjects)
    txt_content = build_fulltext(record) + "\n"

    # Swap only the real extension and only the leading base directory
    relative_path = os.path.relpath(source_file_path, source_dir)
    dest_stem = os.path.splitext(os.path.join(destination_dir, relative_path))[0]
    os.makedirs(os.path.dirname(dest_stem), exist_ok=True)

    # Save the extracted subject ids to a TSV file
    with open(dest_stem + ".tsv", "wt", encoding="utf-8") as f:
        f.write(tsv_content)

    # Save the title and abstract to a TXT file
    with open(dest_stem + ".txt", "wt", encoding="utf-8") as f:
        f.write(txt_content)


def convert_corpus(source_dir, destination_dir, stop_on_error=True):
    """Convert every ``.jsonld`` file below ``source_dir`` (recursively).

    Args:
        source_dir: directory tree containing the ``.jsonld`` records.
        destination_dir: root of the mirrored output tree.
        stop_on_error: when true (the default), stop at the first file that
            cannot be converted; otherwise report it and continue.

    Returns:
        The list of source paths that could not be converted.
    """
    pattern = os.path.join(source_dir, "**", "*.jsonld")
    failed_paths = []
    # Sorted so that repeated runs process the files in the same order
    for source_file_path in sorted(glob(pattern, recursive=True)):
        try:
            convert_record(source_file_path, source_dir, destination_dir)
        except (OSError, ValueError, KeyError, TypeError) as e:
            print(f"Error processing {source_file_path}: {e}")
            failed_paths.append(source_file_path)
            if stop_on_error:
                break
    return failed_paths


failed_file_paths = convert_corpus(source_base_dir, destination_base_dir)


In [7]:
# Show the generated directory layout, at most six levels deep
!tree -L 6 ~/git/Annif-LLMs4Subjects/shared-task-datasets

[01;34m/home/jmminkin/git/Annif-LLMs4Subjects/shared-task-datasets[00m
└── [01;34mTIBKAT[00m
    ├── [01;34mall-subjects[00m
    │   └── [01;34mdata[00m
    │       ├── [01;34mdev[00m
    │       │   ├── [01;34mArticle[00m
    │       │   │   ├── [01;34mde[00m
    │       │   │   └── [01;34men[00m
    │       │   ├── [01;34mBook[00m
    │       │   │   ├── [01;34mde[00m
    │       │   │   └── [01;34men[00m
    │       │   ├── [01;34mConference[00m
    │       │   │   ├── [01;34mde[00m
    │       │   │   └── [01;34men[00m
    │       │   ├── [01;34mReport[00m
    │       │   │   ├── [01;34mde[00m
    │       │   │   └── [01;34men[00m
    │       │   └── [01;34mThesis[00m
    │       │       ├── [01;34mde[00m
    │       │       └── [01;34men[00m
    │       └── [01;34mtrain[00m
    │           ├── [01;34mArticle[00m
    │           │   ├── [01;34mde[00m
    │           │   └── [01;34men[00m
    │           ├── [01;34mBook[00m
    │    

## Check the number of input and output files

In [15]:
# Count the source .jsonld files (the input records of the conversion)
!find ~/git/llms4subjects/shared-task-datasets/ -type f -name "*.jsonld" | wc -l

144485


In [13]:
# Count the generated .txt files; should equal the number of .jsonld inputs
!find ~/git/Annif-LLMs4Subjects/shared-task-datasets/ -type f -name "*.txt" | wc -l

144485


In [14]:
# Count the generated .tsv files; should equal the number of .jsonld inputs
!find ~/git/Annif-LLMs4Subjects/shared-task-datasets/ -type f -name "*.tsv" | wc -l

144485
