[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RonPlusSign/llms4subjects/blob/main/embedding_similarity_tagging.ipynb)

# Embedding Similarity Tagging

The goal of this notebook is to run the `embedding_similarity_tagging.py` script with different parameters (e.g. different embedding models).

The script uses a SentenceTransformer model to embed both the document texts and the GND tags,
and then computes the similarity between the two sets of embeddings to tag each document with its most similar GND tags.

The quality of the tagging results is evaluated using the `shared-task-eval-script/llms4subjects-evaluation.py` script.

In [None]:
# If you run this notebook in Google Colab, run this

# Clone repository and move its content in the current directory
!git clone https://github.com/RonPlusSign/llms4subjects.git
!mv llms4subjects/* .
!rm -r llms4subjects

# Install required packages
!pip install -r requirements.txt

#### Tagging with Different Embedding Models

In [None]:
# Identifiers of the embedding models to compare, in the order they are run.
models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "distiluse-base-multilingual-cased-v1",
    # Loading the next model prints the warning
    # "No sentence-transformers model found with name ..."; this is expected and ok.
    "T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
    "intfloat/multilingual-e5-large",
]

In [None]:
for model_name in models:

    model_name_folder = model_name.split("/")[-1]
    tag_embeddings_file = f"results/{model_name_folder}/tag_embeddings.json" # Where to save the tag embeddings
    results_dir = f"results/{model_name_folder}" # Where to save the tagging results
    docs_path = "shared-task-datasets/TIBKAT/tib-core-subjects/data/dev" # Documents to tag
    tag_file = "shared-task-datasets/GND/dataset/GND-Subjects-tib-core.json" # Tag list definition

    print(f"\n------Running tagging with model: {model_name} ------")
    %run embedding_similarity_tagging.py \
            --model_name {model_name} \
            --tags_file {tag_file} \
            --tag_embeddings_file {tag_embeddings_file} \
            --results_dir {results_dir} \
            --docs_path {docs_path}

#### Evaluation

In [None]:
# Evaluate the tagging results using the evaluation script.
for model_name in models:
    print(f"\n------Evaluating tagging results for model: {model_name} ------")

    model_name_folder = model_name.split("/")[-1]
    true_labels_dir = "shared-task-datasets/TIBKAT/tib-core-subjects/data/dev"
    pred_labels_dir = f"results/{model_name_folder}/dev"
    results_dir = f"results/{model_name_folder}"

    %run "shared-task-eval-script/llms4subjects-evaluation.py" \
            --team_name {model_name_folder} \
            --true_labels_dir {true_labels_dir} \
            --pred_labels_dir {pred_labels_dir} \
            --results_dir {results_dir}

## SentenceTransformer fine-tuning

The `finetune_sentence_transformer.py` script fine-tunes a SentenceTransformer model on training data for subject tagging.

In [None]:
# Finetune all SentenceTransformer models on the training data
for model_name in models:
    
    print(f"\n------Fine-tuning model: {model_name} ------")

    model_name_clean = model_name.split("/")[-1]
    training_data_dir = "shared-task-datasets/TIBKAT/tib-core-subjects/data/train"
    eval_data_dir = "shared-task-datasets/TIBKAT/tib-core-subjects/data/dev"
    gnd_tags_file = "shared-task-datasets/GND/dataset/GND-Subjects-tib-core.json"

    for loss in losses:
        # Modify the output path to include the loss function name
        output_model_path = f"models/finetuned/{model_name_clean}_{loss}"
        print(f"Using loss: {loss}, saving to: {output_model_path}")

        # Run the fine-tuning script with the specified loss
        %run finetune_sentence_transformer.py \
                --training_path { training_data_dir } \
                --eval_path { eval_data_dir } \
                --gnd_tags_file { gnd_tags_file } \
                --model_name { model_name } \
                --output_model_path { output_model_path } \
                --batch_size 16 \
                --num_epochs 1 \
                --loss { loss }

model_names = [model_name.split("/")[-1] for model_name in models]
finetuned_models_path = [f"models/finetuned/{model_name}" for model_name in model_names]

for model_name in finetuned_models_path:
    model_name_folder = model_name.split("/")[-1]
    tag_embeddings_file = f"results/finetuned_{model_name_folder}/tag_embeddings.json"  # Where to save the tag embeddings
    results_dir = f"results/finetuned_{model_name_folder}"  # Where to save the tagging results
    docs_path = "shared-task-datasets/TIBKAT/tib-core-subjects/data/dev"  # Documents to tag
    tag_file = "shared-task-datasets/GND/dataset/GND-Subjects-tib-core.json"  # Tag list definition

    print(f"\n------Running tagging with model: {model_name} ------")
    %run embedding_similarity_tagging.py \
            --model_name { model_name } \
            --tags_file { tag_file } \
            --tag_embeddings_file { tag_embeddings_file } \
            --results_dir { results_dir } \
            --docs_path { docs_path }

#### Evaluate the fine-tuned models

In [None]:
model_name = "all-MiniLM-L6-v2"

model_name_clean = model_name.split("/")[-1]
model_path = f"{model_name_clean}"
training_data_dir = "shared-task-datasets/TIBKAT/tib-core-subjects/data/train"
eval_data_dir = "shared-task-datasets/TIBKAT/tib-core-subjects/data/dev"
gnd_tags_file = "shared-task-datasets/GND/dataset/GND-Subjects-tib-core.json"

%run "binary_mlp.py" \
        --training_path {training_data_dir} \
        --eval_path {eval_data_dir} \
        --gnd_tags_file {gnd_tags_file} \
        --model_name {model_path} \
        --batch_size 16 \
        --num_epochs 5 