In [None]:
from datasets import disable_caching, load_dataset
import nltk
from utils import get_user_system_assistant_format_messages, convert_qa_to_prompt_answer_format, convert_conversation_to_completion_format
import pandas as pd
from pathlib import Path
nltk.download('punkt')

In [None]:
# Turn off the `datasets` library cache: this saves a lot of disk space when working with
# the datasets below, at the cost of having no dataset caching between runs.
disable_caching()

## Prompts to train on

In [None]:
# Generic German system prompt ("You are a helpful medical AI assistant for the emergency department.").
# It is the default `system_prompt` of `bring_prompt_into_final_train_format`.
default_system_prompt = """Du bist ein hilfsbereiter medizinischer KI-Assistent für die Notaufnahme."""
# German system prompt for the guideline-adherence task: the assistant receives guideline passages and
# the clinician's procedure and has to judge only the given treatment step. Per the prompt text it must
# answer exclusively with 'Ja' when the procedure is appropriate (or when a Rote-Hand-Brief exists for
# the guideline), and otherwise start with 'Nein' followed by a brief explanation.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
distillation_system_prompt = """Du bist ein hilfsbereiter medizinischer KI-Assistent für die Notaufnahme. Im Folgenden findest du Leitlinienpassagen und mein Vorgehen bei der Behandlung.
Deine Aufgabe ist es, zu prüfen, ob mein Vorgehen den Leitlinien gerecht wird.
Beachte, dass die Behandlung in der Notaufnahme in zeitlich begrenztem Umfang stattfand. Vergleiche mein Vorgehen mit der Leitlinienpassage. Beurteile dabei ausschließlich den angegeben Behandlungsschritt.
Falls mein Vorgehen im Kontext der Leitlinienpassage angemessen ist, antworte ausschließlich mit 'Ja'.
Falls mein Vorgehen der Leitlinienpassage widerspricht, beginne deine Antwort mit 'Nein' und erkläre anschließend kurz und bündig, was ich hätte besser machen können.
Falls zu der Leitlinie ein Rote-Hand-Brief vorliegt, antworte ausschließlich mit 'Ja'."""

## Dataset Names

In [None]:
# We don't use the English datasets, since we were not happy with the translation quality of
# free applications/models. The list is kept for reference only.
english_dataset_names = ['FreedomIntelligence/Medical-R1-Distill-Data', 'FreedomIntelligence/medical-o1-reasoning-SFT', 'Laurent1/MedQuad-MedicalQnADataset_128tokens_max', 'lurosenb/medqa', 'qiaojin/PubMedQA', 'openlifescienceai/medmcqa', 'TIGER-Lab/MMLU-Pro']
# Order matters: entries 0-3 are used by index further down to fill the `source` column of the
# first four German data frames.
german_dataset_names = ['CausalLM/GPT-4-Self-Instruct-German', 'BioMistral/BioInstructQA', 'avemio/German-RAG-ORPO-Alpaca-HESSIAN-AI', 'amphora/Open-R1-Mulitlingual-SFT']

## German Dataset Unification

In [None]:
# Dataset 1: load 'CausalLM/GPT-4-Self-Instruct-German' via `datasets.load_dataset`.
# The result is indexed by split name; the `train` split is selected in the following cell.
ger_data = load_dataset('CausalLM/GPT-4-Self-Instruct-German')

In [None]:
# Keep only the `train` split.
# NOTE(review): `ger_data` is rebound to its own split here, so this cell should not be re-run
# without re-running the load cell first.
ger_data = ger_data['train']

In [None]:
def bring_prompt_into_final_train_format(
    instruct: str,
    expected_response: str,
    system_prompt: str = default_system_prompt,
):
    """
    Build one training conversation from a prompt and its reference answer.

    Thin wrapper around ``get_user_system_assistant_format_messages`` from ``utils``; note that
    the helper takes the system prompt as its second and the expected response as its third argument.

    :param instruct: user instruction / prompt text
    :param expected_response: answer the assistant is expected to give
    :param system_prompt: system prompt of the conversation, defaults to ``default_system_prompt``
    :return: the helper's result, passed through unchanged (presumably a list of chat
        messages -- confirm in ``utils``)
    """
    return get_user_system_assistant_format_messages(instruct, system_prompt, expected_response)

In [None]:
def apply_to_self_instruct_german(x):
    """
    Add a ``conversation`` entry to a single GPT-4-Self-Instruct-German example.

    The conversation is built from the example's ``instruction`` and ``output`` fields using the
    default system prompt of ``bring_prompt_into_final_train_format``.

    :param x: one example supporting item access by key; it is modified in place
    :return: the same example object, now also carrying ``conversation``
    """
    conversation = bring_prompt_into_final_train_format(x['instruction'], x['output'])
    x['conversation'] = conversation
    return x

In [None]:
# Apply the conversion to every example, which adds the `conversation` field to the dataset.
ger_data = ger_data.map(apply_to_self_instruct_german)

In [None]:
# Persist dataset 1 (including the new `conversation` column) as a feather file.
# The path uses forward slashes only: the previous mix of '\' and '/' resolved correctly on
# Windows alone, whereas forward slashes work on Windows, Linux and macOS.
ger_data.to_pandas().to_feather('data/distillation_training/auxiliary_datasets/gpt_4_self_instruct_german.feather')

### Load & Preprocess Dataset 2

In [None]:
import json
# You have to download and extract this json file from
# https://huggingface.co/datasets/BioMistral/BioInstructQA/blob/main/data.zip
# Forward slashes keep the path usable on Windows, Linux and macOS (the backslash form only
# resolved on Windows).
with open('data/distillation_training/auxiliary_datasets/German-full.json', encoding='utf-8') as f:
    # json.load parses straight from the file handle; no intermediate f.read() string needed.
    ger_data2 = json.load(f)

In [None]:
# Turn the parsed JSON into a DataFrame. The cells below read the columns `question_translated`,
# `options_translated`, `correct_answer_letter` and `correct_answer_text_translated` from it.
df_ger2 = pd.DataFrame(ger_data2)

In [None]:
def _medqa_row_to_conversations(row):
    """Convert one translated MedQA row via ``convert_qa_to_prompt_answer_format`` from ``utils``."""
    return convert_qa_to_prompt_answer_format(
        question=row['question_translated'],
        answers=row['options_translated'],
        correct_answer_letter=row['correct_answer_letter'],
    )


# Row-wise conversion; the result of the helper is stored as-is in `conversation`.
df_ger2['conversation'] = df_ger2.apply(_medqa_row_to_conversations, axis=1)


In [None]:
# One row per element of `conversation` (presumably the helper returns a list of conversations
# per question -- confirm in `utils`).
# explode() repeats the original index label for every produced row; the index is reset so the
# frame has a default RangeIndex again, which `to_feather` (used right below) documents as a
# requirement.
df_ger2 = df_ger2.explode('conversation').reset_index(drop=True)

In [None]:
# Persist the conversations together with the translated question and the correct answer text.
# Forward slashes only: the previous mix of '\' and '/' resolved correctly on Windows alone.
df_ger2[['conversation', 'question_translated', 'correct_answer_text_translated']].to_feather('data/distillation_training/auxiliary_datasets/german_medqa.feather')

### Load & Preprocess Dataset 3

In [None]:
# Names passed as the second argument to `load_dataset` for 'avemio/German-RAG-ORPO-Alpaca-HESSIAN-AI';
# one sub-dataset is loaded per entry in the next cell.
topics = ['hard-reasoning-de', 'SauerkrautLM-Fermented-GER-DPO', 'SauerkrautLM-Fermented-Irrelevance-GER-DPO', 'qa-meeting-attendee-topic', 'qa-meeting-topic', 'hard-qa-with-multiple-references']

In [None]:
# Load the `train` split of every selected topic and stack them into one frame.
avemio_frames = [
    load_dataset('avemio/German-RAG-ORPO-Alpaca-HESSIAN-AI', topic)['train'].to_pandas()
    for topic in topics
]
# Drop incomplete rows FIRST and reset the index afterwards. The previous order
# (reset_index, then dropna) left gaps in the index whenever a row was dropped, i.e. a
# non-default index, which `to_feather` (used below for this frame) documents as unsupported.
df_ger_3 = pd.concat(avemio_frames).dropna().reset_index(drop=True)

In [None]:
def _avemio_row_to_conversation(row):
    """Build the conversation for one avemio row, using the row's own ``System`` text as system prompt."""
    return bring_prompt_into_final_train_format(row['Instruction'], row['Chosen'], row['System'])


df_ger_3['conversation'] = df_ger_3.apply(_avemio_row_to_conversation, axis=1)

In [None]:
# Persist dataset 3. Forward slashes only: the previous mix of '\' and '/' resolved correctly
# on Windows alone.
df_ger_3.to_feather('data/distillation_training/auxiliary_datasets/avemio.feather')

### Load & Preprocess Dataset 4

In [None]:
# Dataset 4 is read from a local feather file; the cells below read its `prompt` and `response`
# columns.
# NOTE(review): this notebook does not show how the file is produced; it is labelled with
# german_dataset_names[3] further down -- confirm its origin.
# Forward slashes only: the previous mix of '\' and '/' resolved correctly on Windows alone.
df_ger_4 = pd.read_feather('data/distillation_training/auxiliary_datasets/german_sft_r1.feather')

In [None]:
def _sft_r1_row_to_conversation(row):
    """Build the conversation for one row from ``prompt``/``response`` with the default system prompt."""
    return bring_prompt_into_final_train_format(row['prompt'], row['response'])


df_ger_4['conversation'] = df_ger_4.apply(_sft_r1_row_to_conversation, axis=1)

### Load & Preprocess Dataset 5

In [None]:
# Dataset 5: every parquet file in the local `mmlu_de_medical` folder, stacked into one frame.
# - The path is written with forward slashes: with backslashes, `Path` treats the whole string
#   as a single file name on Linux/macOS, the glob finds nothing and `pd.concat` fails.
# - The files are sorted because `glob` yields them in arbitrary order; sorting makes the row
#   order reproducible between runs and machines.
mmlu_medical_dir = Path('data/distillation_training/auxiliary_datasets/mmlu_de_medical')
df_ger_5 = pd.concat([pd.read_parquet(path) for path in sorted(mmlu_medical_dir.glob('*.parquet'))])

In [None]:
def _mmlu_medical_row_to_conversations(row):
    """Convert one MMLU row; the answer options are keyed by their position in ``choices_de``."""
    indexed_choices = dict(enumerate(row['choices_de']))
    return convert_qa_to_prompt_answer_format(
        question=row['question_de'],
        answers=indexed_choices,
        correct_answer_letter=row['answer'],
    )


df_ger_5['conversation'] = df_ger_5.apply(_mmlu_medical_row_to_conversations, axis=1)


In [None]:
# One row per element of `conversation`.
# Assumes convert_qa_to_prompt_answer_format returns a list-like per question -- TODO confirm in `utils`.
df_ger_5 = df_ger_5.explode('conversation')

### Load & Preprocess Dataset 6

In [None]:
# Dataset 6: every parquet file in the local `mmlu_de_other` folder, stacked into one frame.
# - The path is written with forward slashes: with backslashes, `Path` treats the whole string
#   as a single file name on Linux/macOS, the glob finds nothing and `pd.concat` fails.
# - The files are sorted because `glob` yields them in arbitrary order; sorting makes the row
#   order reproducible between runs and machines.
mmlu_other_dir = Path('data/distillation_training/auxiliary_datasets/mmlu_de_other')
df_ger_6 = pd.concat([pd.read_parquet(path) for path in sorted(mmlu_other_dir.glob('*.parquet'))])

In [None]:
def _mmlu_other_row_to_conversations(row):
    """Convert one MMLU row; the answer options are keyed by their position in ``choices_de``."""
    indexed_choices = dict(enumerate(row['choices_de']))
    return convert_qa_to_prompt_answer_format(
        question=row['question_de'],
        answers=indexed_choices,
        correct_answer_letter=row['answer'],
    )


df_ger_6['conversation'] = df_ger_6.apply(_mmlu_other_row_to_conversations, axis=1)

# One row per element of `conversation`.
df_ger_6 = df_ger_6.explode('conversation')

In [None]:
# Dataset 1 again: convert the mapped `datasets` object (with its `conversation` field) into a
# DataFrame so it can be combined with the other frames below.
df_ger = ger_data.to_pandas()

### Add metadata & Combine Datasets

In [None]:
# Tag every frame with the name of the dataset it came from. The first four labels are taken
# from `german_dataset_names` by position; the two MMLU frames get fixed labels.
source_label_by_frame = [
    (df_ger, german_dataset_names[0]),
    (df_ger2, german_dataset_names[1]),
    (df_ger_3, german_dataset_names[2]),
    (df_ger_4, german_dataset_names[3]),
    (df_ger_5, 'MMLU_de_Medical'),
    (df_ger_6, 'MMLU_de_Other'),
]
for frame, source_label in source_label_by_frame:
    frame['source'] = source_label

In [None]:
# Combine all sources, keeping only the two shared columns.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every frame keeps its own
# labels, so the combined frame carries many duplicate index labels (also produced by the
# explode() calls above). That makes label-based operations ambiguous and is a non-default
# index, which `to_feather` documents as unsupported.
frames_to_combine = [df_ger, df_ger2, df_ger_3, df_ger_4, df_ger_5, df_ger_6]
df_final = pd.concat(
    [frame[['source', 'conversation']] for frame in frames_to_combine],
    ignore_index=True,
)

### Analysis of final dataset

In [None]:
# Sanity check: (number of rows, number of columns) of the combined frame.
df_final.shape

In [None]:
# Number of rows contributed by each source dataset.
df_final['source'].value_counts()

In [None]:
# Sources whose rows count as medical training data; every other source is labelled 'other'.
medical_sources = ['BioMistral/BioInstructQA', 'MMLU_de_Medical']
df_final['training_data_type'] = df_final['source'].apply(
    lambda source_name: 'medical' if source_name in medical_sources else 'other'
)

In [None]:
# Split of the combined data into 'medical' vs. 'other' rows.
df_final['training_data_type'].value_counts()

## Convert Dataset to Completion format

In [None]:
# Convert the combined frame with `convert_conversation_to_completion_format` from `utils`.
# NOTE(review): the resulting schema is defined by that helper and is not visible here.
df_final = convert_conversation_to_completion_format(df_final)

In [None]:
# Persist the final combined dataset in completion format.
# Forward slashes keep the path usable on Windows, Linux and macOS (the backslash form only
# resolved on Windows).
df_final.to_feather('data/distillation_training/auxiliary_datasets/all_german_data_combined_completion_format.feather')