<a href="https://colab.research.google.com/github/Adria100/clin_IQ/blob/main/1_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup: install dependencies, import libraries and load the CodeLlama model

In [None]:
# Install the third-party packages imported by this notebook.
!pip install spacy
!pip install datasets
!pip install torch
!pip install transformers
# Fetch the small English spaCy model that a later cell loads with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm
# NOTE(review): transformers is installed a second time here; accelerate/bitsandbytes are presumably
# needed for the device_map="auto" / load_in_4bit=True model load below — confirm.
!pip install transformers accelerate bitsandbytes

In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets
import json
from requests.exceptions import RequestException
import time
import re
import pandas as pd
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Load the CodeLlama instruct checkpoint and its tokenizer once; every generation cell reuses
# these `model` / `tokenizer` objects.
model_id = "codellama/CodeLlama-7b-Instruct-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Two text-generation pipelines are built from the same model and tokenizer.
# NOTE(review): `pipe` and `llama_pipeline` are constructed identically and later cells build
# their own pipelines too, so one of these looks redundant — confirm before removing.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Check dataset structure

In [None]:
def check_dataset_structure(x):
    """Print the splits, split sizes and column names of a Hugging Face dataset.

    Args:
        x: Dataset identifier forwarded to ``load_dataset``.

    Returns:
        None. All information is written to stdout. Any exception raised while
        loading or inspecting the dataset is printed instead of propagated.
    """
    try:
        dataset = load_dataset(x)

        # Print the names of the splits
        print("Dataset splits:", dataset.keys())

        # Print number of samples in each split
        split_names = list(dataset.keys())
        for split in split_names:
            print(f"{split} size: {len(dataset[split])}")

        # Print column names (structure). Not every dataset ships a "train"
        # split, so fall back to the first available split instead of failing.
        if split_names:
            reference_split = "train" if "train" in split_names else split_names[0]
            print("Columns:", dataset[reference_split].column_names)

    except Exception as e:
        print(f"Unexpected error: {e}")



# DATA from GITHUB and mostly HUGGINGFACE

In [None]:
def transform_MC1_dataset():
    """Convert the ``bigbio/med_qa`` dataset into the shared multiple-choice format.

    The train, validation and test splits are concatenated. Each item becomes a
    dict with ``correct_answer``, ``options``, ``question``, a placeholder
    ``source`` and ``type`` set to ``"multiple_choice"``.

    Returns:
        list[dict]: The transformed items. An empty list is returned when
        loading or transforming fails (the error is printed), so callers can
        slice or concatenate the result safely.
    """
    # Options are labelled by position; this supports any number of options
    # instead of assuming exactly four.
    option_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    try:
        # Load dataset from Hugging Face
        dataset = load_dataset("bigbio/med_qa")
        transformed_data_MC1 = []
        for item in concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]]):
            transformed_item = {
                # NOTE(review): answer_idx is copied as-is; presumably it already is a letter — confirm.
                "correct_answer": item["answer_idx"],
                "options": {  # Keep only the option texts, keyed by letter
                    letter: option["value"]
                    for letter, option in zip(option_letters, item["options"])
                },
                "question": item["question"],
                "source": {
                    "isbn": "000-0000000000",
                    "page": 0,
                    "paragraph_id": "000-0000000000-p00-para00"
                },
                "type": "multiple_choice"
            }
            transformed_data_MC1.append(transformed_item)
        return transformed_data_MC1
    except Exception as e:
        print(f"Unexpected error: {e}")
        return []
transformed_MC1_data = transform_MC1_dataset()
print(json.dumps(transformed_MC1_data[:3], indent=4))

In [None]:
def transform_short_answer1_dataset():
    """Reshape the ``HPAI-BSC/OpenMedQA`` train split into short-answer records.

    Returns:
        list[dict]: One dict per row with ``answer``, ``question``, a
        placeholder ``source`` and ``type`` set to ``"short_answer"``.
    """
    rows = load_dataset("HPAI-BSC/OpenMedQA")['train']
    return [
        {
            "answer": row["answer"],
            "question": row["question"],
            # Source metadata is unknown for this dataset; placeholders are used.
            "source": {
                "isbn": "000-0000000000",
                "page": 0,
                "paragraph_id": "000-0000000000-p00-para26"
            },
            "type": "short_answer"
        }
        for row in rows
    ]

transformed_short_answer1_data = transform_short_answer1_dataset()
print(json.dumps(transformed_short_answer1_data[:5], indent=4))

In [None]:
# Turn PubMedQA (pqa_artificial) yes/no decisions into True/False questions.
dataset = load_dataset("qiaojin/PubMedQA", "pqa_artificial")
# Only explicit yes/no decisions have a truth value; anything else is skipped
# rather than being silently labelled "False".
DECISION_TO_LABEL = {"yes": "True", "no": "False"}
transformed_data_TF2 = []  # Collected True/False entries
for entry in dataset["train"]:
    question = entry['question'].strip()
    answer = entry['final_decision'].strip()
    # Convert final_decision to True/False
    transformed_answer = DECISION_TO_LABEL.get(answer.lower())
    if transformed_answer is None:
        continue
    # Create the formatted True/False entry
    formatted_entry = {
        "answer": transformed_answer,
        "question": question,
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para31"
        },
        "type": "true_false"
    }

    transformed_data_TF2.append(formatted_entry)
transformed_TF2_data = transformed_data_TF2
# Print the first 3 formatted entries
print(json.dumps(transformed_TF2_data[:3], indent=4))

Reasoning generated with CodeLlama for the `lavita/MedQuAD` dataset

In [None]:
from joblib import Memory
from tqdm.auto import tqdm
import nltk

# Download the NLTK sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases look up "punkt_tab" instead of "punkt", so fetch both;
# otherwise every sent_tokenize call fails and is swallowed by the except
# branch of the generation code.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Set pad_token_id to eos_token_id for open-end generation
tokenizer.pad_token_id = tokenizer.eos_token_id

# Initialize on-disk caching (joblib) in the local ".cache" directory
memory = Memory(location=".cache", verbose=0)

# Function to generate prompt
def generate_prompt(example):
    """Build the reasoning prompt for one example.

    Args:
        example: Mapping with ``'question'`` and ``'answer'`` entries.

    Returns:
        str: The prompt text asking for a numbered step-by-step explanation.
    """
    question, answer = example['question'], example['answer']
    prompt = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    return prompt

# Function to extract reasoning and answer
def extract_reasoning(response):
    """Split a text-generation response into numbered steps and a final answer.

    Args:
        response: Pipeline output; ``response[0]["generated_text"]`` is read.

    Returns:
        tuple[list[str], str]: ``(reasoning, answer)``. ``answer`` is the last
        sentence of the generated text; ``reasoning`` holds every preceding
        sentence, stripped and prefixed with ``"Step N: "``.

    Raises:
        ValueError: If the generated text contains no sentence at all.
    """
    generated_text = response[0]["generated_text"]
    sentences = nltk.sent_tokenize(generated_text)
    if not sentences:
        # Fail with a clear message instead of an IndexError on sentences[-1].
        raise ValueError("generated text is empty; no reasoning to extract")
    answer = sentences[-1]
    reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]
    return reasoning, answer

# Function to generate reasoning steps (with caching)
@memory.cache
def generate_reasoning_steps(examples):
    """Generate reasoning steps and a final answer for every example in a batch.

    Written for ``Dataset.map(..., batched=True)``, which passes a mapping of
    column name to list of values. A plain iterable of per-example dicts is
    accepted as well. Results are cached through ``memory.cache``.

    Args:
        examples: Batch with ``"question"`` and ``"answer"`` columns, or an
            iterable of dicts with those keys.

    Returns:
        dict: ``{"reasoning": [...], "answer": [...]}`` with one entry per
        example. Examples whose generation fails get error placeholders.
    """
    # A batched map call hands over columns, not rows; iterating the mapping
    # directly would yield column names, so rebuild per-example dicts first.
    if hasattr(examples, "keys"):
        rows = [
            {"question": question, "answer": answer}
            for question, answer in zip(examples["question"], examples["answer"])
        ]
    else:
        rows = list(examples)

    prompts = [generate_prompt(example) for example in rows]
    reasoning_steps = []
    answers = []
    generator = None  # built on first use and shared by all prompts of the batch
    for prompt in prompts:
        try:
            if generator is None:
                generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
            # return_full_text=False keeps the prompt out of the parsed reasoning.
            response = generator(
                prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
            )
            reasoning, answer = extract_reasoning(response)
            reasoning_steps.append(reasoning)
            answers.append(answer)
        except Exception as e:
            print(f"Error during llama_pipeline call: {e}")
            reasoning_steps.append(["Error: Could not generate reasoning."])
            answers.append("Error: Could not generate answer.")
    return {"reasoning": reasoning_steps, "answer": answers}

# Fetch the MedQuAD train split and annotate it in batches of four.
dataset = load_dataset("lavita/MedQuAD", split="train")
dataset = dataset.map(generate_reasoning_steps, batched=True, batch_size=4)

# Reshape every annotated row into the shared multi-hop record layout.
transformed_data_R1 = [
    {
        "answer": item["answer"],
        "question": item["question"],
        "reasoning": item["reasoning"],
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para00"
        },
        "type": "multi_hop"
    }
    for item in tqdm(dataset, desc="Transforming data")
]

# Show the first 3 records, then expose the list under its final name.
print(json.dumps(transformed_data_R1[:3], indent=4))
transformed_R1_data = transformed_data_R1

In [None]:
def transform_MC2_dataset():
    """Convert the ``stellalisy/mediQ`` test and validation splits to multiple-choice records.

    The item's ``context`` is appended to its question. A list/tuple context is
    joined with spaces so apostrophes and parentheses inside the sentences
    survive; any other value is stringified and stripped of brackets and
    quotes as before.

    Returns:
        list[dict]: The transformed items (the first three are also printed).
        An empty list is returned when loading or transforming fails; the
        error is printed.
    """
    try:
        # Load both test and validation splits
        dataset_test = load_dataset("stellalisy/mediQ", split="test")
        dataset_validation = load_dataset("stellalisy/mediQ", split="validation")

        transformed_data_MC2 = []

        # Process both splits
        for dataset in [dataset_test, dataset_validation]:
            for item in dataset:
                context = item.get("context", "")
                if isinstance(context, (list, tuple)):
                    # Join the parts directly instead of cleaning up the list's repr.
                    context = " ".join(str(part).strip() for part in context)
                else:
                    context = re.sub(r"[\[\]\{\}\(\)\'\"]", "", str(context))  # Remove brackets and quotes
                transformed_item = {
                    "correct_answer": item["answer_idx"],
                    "options": item["options"],
                    "question": item["question"] + " " + context,
                    "source": {
                        "isbn": "000-0000000000",
                        "page": 0,
                        "paragraph_id": "000-0000000000-p00-para00"
                    },
                    "type": "multiple_choice"
                }
                transformed_data_MC2.append(transformed_item)

        # Preview the first entries and return the combined transformed data
        print(json.dumps(transformed_data_MC2[:3], indent=4))
        return transformed_data_MC2

    except Exception as e:
        print(f"Unexpected error: {e}")
        return []

# Call the function and keep the combined result
transformed_MC2_data = transform_MC2_dataset()

In [None]:
def transform_MC3_dataset():
    """Convert ``openlifescienceai/medmcqa`` into the shared multiple-choice format.

    The train, validation and test splits are concatenated. The numeric
    ``cop`` field is mapped to a letter (0 -> "A" ... 3 -> "D"); any other
    value yields ``None`` as ``correct_answer``.

    Returns:
        list[dict]: The transformed items. An empty list is returned when
        loading or transforming fails (the error is printed), so callers can
        slice or concatenate the result safely.
    """
    # Map numerical index to letter option (built once, not per item)
    answer_mapping = {0: "A", 1: "B", 2: "C", 3: "D"}
    try:
        # Load dataset from Hugging Face (all splits)
        dataset = load_dataset("openlifescienceai/medmcqa")

        transformed_data_MC3 = []

        for item in concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]]):
            # NOTE(review): items whose `cop` is outside 0-3 keep correct_answer=None — confirm that
            # unlabeled items should stay in the output.
            correct_answer = answer_mapping.get(item["cop"], None)

            transformed_item = {
                "correct_answer": correct_answer,  # Use mapped answer
                "options": {
                    "A": item["opa"],
                    "B": item["opb"],
                    "C": item["opc"],
                    "D": item["opd"]
                },
                "question": item["question"],
                "source": {
                    "isbn": "000-0000000000",
                    "page": 0,
                    "paragraph_id": "000-0000000000-p00-para00"
                },
                "type": "multiple_choice"
            }
            transformed_data_MC3.append(transformed_item)
        return transformed_data_MC3

    except RequestException as e:
        print(f"Error loading dataset: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    # Reached only after an error was reported above.
    return []

# Build the MC3 records and preview the first 10 entries
transformed_MC3_data = transform_MC3_dataset()
print(json.dumps(transformed_MC3_data[:10], indent=4))

In [None]:
# Number of retry attempts
MAX_RETRIES = 3

# Attempt to load the dataset with retry logic
for attempt in range(MAX_RETRIES):
    try:
        dataset = load_dataset("UCSC-VLAA/MedReason")
        break  # Exit the loop if successful
    except RequestException:
        if attempt < MAX_RETRIES - 1:
            print(f"Download attempt {attempt + 1} failed. Retrying...")
            time.sleep(5)  # Wait before retrying
        else:
            raise  # Re-raise the exception if all retries fail


def _match_answer_letter(answer_text, options):
    """Return the letter of the option that best matches ``answer_text``.

    Matching order: the answer is itself an option label, the answer equals an
    option text (case-insensitive), then the option sharing the most whole
    words with the answer. Returns ``None`` when nothing overlaps.
    """
    normalized = answer_text.strip().lower()
    if not normalized:
        return None

    for letter in options:
        if letter.strip().lower() == normalized:
            return letter

    for letter, option_text in options.items():
        if option_text.strip().lower() == normalized:
            return letter

    # Compare whole words; a substring test would let short words such as "a"
    # or "of" match almost every option.
    answer_words = set(re.findall(r"\w+", normalized))
    best_letter, best_score = None, 0
    for letter, option_text in options.items():
        score = len(answer_words & set(re.findall(r"\w+", option_text.lower())))
        if score > best_score:
            best_letter, best_score = letter, score
    return best_letter


transformed_data_MC4 = []

# Process each entry in the dataset
for entry in dataset['train']:
    question = entry['question'].strip()
    options_raw = entry['options'].strip()

    # Extract and format options: lines shaped like "<label>. <text>"
    options = {}
    for line in options_raw.split("\n"):
        if line.strip() and ". " in line:
            choice, text = line.split(". ", 1)  # Split into choice and text
            options[choice.strip()] = text.strip()

    # Answer text: everything before the first "."
    answer_text = entry['answer'].strip().split(".")[0]

    # Find the correct answer letter
    correct_answer_letter = _match_answer_letter(answer_text, options)

    correct_answer = correct_answer_letter

    # Define source information (using placeholders)
    source = {
        "isbn": "000-0000000000",
        "page": 0,
        "paragraph_id": "000-0000000000-p00-para06"
    }

    # Construct the formatted entry
    formatted_entry = {
        "correct_answer": correct_answer,  # Use the found letter
        "options": options,  # Use the formatted options dictionary
        "question": question,
        "source": source,
        "type": "multiple_choice"
    }
    transformed_data_MC4.append(formatted_entry)

# Print first 3 formatted entries for verification (optional)
print(json.dumps(transformed_data_MC4[:3], indent=4))

transformed_MC4_data = transformed_data_MC4

No CodeLlama generation is needed here: the MedReason dataset already contains the reasoning.

In [None]:
from datasets import load_dataset
from tqdm.auto import tqdm
import re

# Load the train split of MedReason; its "reasoning" column is parsed by the function below.
dataset = load_dataset("UCSC-VLAA/MedReason", split="train")

# Function to extract reasoning and answer from the "reasoning" column
def extract_reasoning_and_answer(example):
    """Extract numbered reasoning steps and a final answer from ``example["reasoning"]``.

    The text is split on the headers ``Finding reasoning paths:``,
    ``Reasoning Process:`` and ``Conclusion:``. Sections are looked up by
    header name, so a missing header does not shift the others.

    Args:
        example: Mapping with a ``"reasoning"`` text entry.

    Returns:
        dict: ``{"reasoning": [...], "answer": str}``. ``reasoning`` holds the
        non-empty lines of the reasoning-process section prefixed with
        ``"Step N: "``; ``answer`` is the last non-empty sentence of the
        conclusion, or ``""`` when there is none.
    """
    reasoning_text = example["reasoning"] or ""

    # With a capturing group, re.split returns: preamble, header, body, header, body, ...
    parts = re.split(r"(Finding reasoning paths:|Reasoning Process:|Conclusion:)", reasoning_text)
    sections = {}
    for header, body in zip(parts[1::2], parts[2::2]):
        sections.setdefault(header, body.strip())  # keep the first occurrence of each header

    reasoning_process = sections.get("Reasoning Process:", "")
    conclusion = sections.get("Conclusion:", "")

    # Number the non-empty lines of the reasoning process, starting from 1
    non_empty_lines = [line.strip() for line in reasoning_process.split('\n') if line.strip()]
    reasoning_steps = [
        f"Step {step_counter}: {line}"
        for step_counter, line in enumerate(non_empty_lines, start=1)
    ]

    # Last non-empty sentence of the conclusion, whether or not it ends with a period
    conclusion_sentences = [part.strip() for part in conclusion.split('.') if part.strip()]
    answer = conclusion_sentences[-1] if conclusion_sentences else ""

    return {"reasoning": reasoning_steps, "answer": answer}

# Parse the reasoning column of every row.
dataset = dataset.map(extract_reasoning_and_answer)

# Reshape every parsed row into the shared multi-hop record layout.
transformed_data_R2 = [
    {
        "answer": item["answer"],
        "question": item["question"],
        "reasoning": item["reasoning"],
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000-p00-para01"  # placeholder; adjust as needed
        },
        "type": "multi_hop"
    }
    for item in tqdm(dataset, desc="Transforming data")
]

# Show the first 3 records, then expose the list under its final name.
print(json.dumps(transformed_data_R2[:3], indent=4))
transformed_R2_data = transformed_data_R2

Reasoning generated with CodeLlama for the `YvanAlvin/medicalQALlama2` dataset

In [None]:
from datasets import load_dataset
from tqdm.auto import tqdm
import re
import nltk

# Download the NLTK sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases look up "punkt_tab" instead of "punkt", so fetch both;
# otherwise every sent_tokenize call fails and is swallowed by the except
# branch of the generation code.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Set pad_token_id to eos_token_id for open-end generation
tokenizer.pad_token_id = tokenizer.eos_token_id

# Function to generate prompt
def generate_prompt(example):
    """Build the reasoning prompt for one example.

    Args:
        example: Mapping with ``'question'`` and ``'answer'`` entries.

    Returns:
        str: The prompt text asking for a numbered step-by-step explanation.
    """
    question, answer = example['question'], example['answer']
    prompt = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    return prompt

# Function to extract reasoning and answer
def extract_reasoning(response):
    """Split a text-generation response into numbered steps and a final answer.

    Args:
        response: Pipeline output; ``response[0]["generated_text"]`` is read.

    Returns:
        tuple[list[str], str]: ``(reasoning, answer)``. ``answer`` is the last
        sentence of the generated text; ``reasoning`` holds every preceding
        sentence, stripped and prefixed with ``"Step N: "``.

    Raises:
        ValueError: If the generated text contains no sentence at all.
    """
    generated_text = response[0]["generated_text"]
    sentences = nltk.sent_tokenize(generated_text)
    if not sentences:
        # Fail with a clear message instead of an IndexError on sentences[-1].
        raise ValueError("generated text is empty; no reasoning to extract")
    answer = sentences[-1]  # Last sentence is the answer
    reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]
    return reasoning, answer

# Function to generate reasoning steps
def generate_reasoning_steps(examples):
    """Generate reasoning steps and a final answer for every example in a batch.

    Written for ``Dataset.map(..., batched=True)``, which passes a mapping of
    column name to list of values. A plain iterable of per-example dicts is
    accepted as well.

    Args:
        examples: Batch with ``"question"`` and ``"answer"`` columns, or an
            iterable of dicts with those keys.

    Returns:
        dict: ``{"reasoning": [...], "answer": [...]}`` with one entry per
        example. Examples whose generation fails get error placeholders.
    """
    # A batched map call hands over columns, not rows; iterating the mapping
    # directly would yield column names, so rebuild per-example dicts first.
    if hasattr(examples, "keys"):
        rows = [
            {"question": question, "answer": answer}
            for question, answer in zip(examples["question"], examples["answer"])
        ]
    else:
        rows = list(examples)

    prompts = [generate_prompt(example) for example in rows]
    reasoning_steps = []
    answers = []
    generator = None  # built on first use and shared by all prompts of the batch
    for prompt in prompts:
        try:
            if generator is None:
                generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
            # return_full_text=False keeps the prompt out of the parsed reasoning.
            response = generator(
                prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
            )
            reasoning, answer = extract_reasoning(response)
            reasoning_steps.append(reasoning)
            answers.append(answer)
        except Exception as e:
            print(f"Error during llama_pipeline call: {e}")
            reasoning_steps.append(["Error: Could not generate reasoning."])
            answers.append("Error: Could not generate answer.")
    return {"reasoning": reasoning_steps, "answer": answers}

# Load the train split; each row's "text" column is parsed into question/answer below.
dataset = load_dataset("YvanAlvin/medicalQALlama2", split="train")

# Extract question and answer from the "text" column
def extract_question_answer(example):
    """Split an ``[INST] ... [/INST] ...`` formatted text into question and answer.

    Args:
        example: Mapping with a ``"text"`` entry.

    Returns:
        dict: ``{"question": ..., "answer": ...}``. The question is the text
        between ``[INST]`` and ``[/INST]`` (``"Unknown Question"`` when the
        markers are missing); the answer is everything after the last
        ``[/INST]``, without a trailing ``</s>`` marker.
    """
    text = example["text"]
    match = re.search(r'\[INST\](.*?)\[/INST\]', text, re.DOTALL)
    question = match.group(1).strip() if match else "Unknown Question"
    answer = text.split("[/INST]")[-1].strip()
    # Drop a trailing end-of-sequence marker so it does not leak into prompts
    # or output; this is a no-op when the marker is absent.
    answer = re.sub(r"\s*</s>\s*$", "", answer)
    return {"question": question, "answer": answer}

# First derive question/answer columns, then generate reasoning in batches of four.
dataset = dataset.map(extract_question_answer)
dataset = dataset.map(generate_reasoning_steps, batched=True, batch_size=4)

# Reshape every annotated row into the shared multi-hop record layout.
transformed_data_R3 = [
    {
        "answer": item["answer"],
        "question": item["question"],
        "reasoning": item["reasoning"],
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para00"
        },
        "type": "multi_hop"
    }
    for item in tqdm(dataset, desc="Transforming data")
]

# Show the first 3 records, then expose the list under its final name.
print(json.dumps(transformed_data_R3[:3], indent=4))
transformed_R3_data = transformed_data_R3

Reasoning generated with CodeLlama for the `YvanAlvin/medicalchat200llama2` dataset

In [None]:
# Load spaCy model for sentence segmentation
# NOTE(review): `nlp` is not used in this cell; sentences are split with nltk.sent_tokenize.
import spacy
nlp = spacy.load("en_core_web_sm")

# Number of retry attempts
MAX_RETRIES = 3

# Attempt to load the dataset with retry logic
for attempt in range(MAX_RETRIES):
    try:
        dataset = load_dataset("YvanAlvin/medicalchat200llama2", split="train")
        break  # Exit the loop if successful
    except RequestException:
        if attempt < MAX_RETRIES - 1:
            print(f"Download attempt {attempt + 1} failed. Retrying...")
            time.sleep(5)  # Wait for 5 seconds before retrying
        else:
            raise  # Re-raise the exception if all retries fail

transformed_data_R4 = []
reasoning_pipe = None  # built on first use and reused for every item

# Process each item in the dataset
for item in dataset:
    text = item["text"]

    # Extract the question from inside [INST]...[/INST]
    match = re.search(r'\[INST\](.*?)\[/INST\]', text, re.DOTALL)
    question = match.group(1).strip() if match else "Unknown Question"

    # Extract the answer as everything after [/INST], without a trailing
    # end-of-sequence marker (no-op when the marker is absent)
    answer = text.split("[/INST]")[-1].strip()
    answer = re.sub(r"\s*</s>\s*$", "", answer)

    # Generate reasoning using CodeLlama
    prompt = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    try:
        if reasoning_pipe is None:
            reasoning_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
        # return_full_text=False keeps the prompt out of the text that is split into steps.
        response = reasoning_pipe(
            prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
        )

        generated_text = response[0]["generated_text"]
        sentences = nltk.sent_tokenize(generated_text)
        if not sentences:
            raise ValueError("generated text is empty; no reasoning to extract")
        final_answer = sentences[-1]
        reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]

    except Exception as e:
        print(f"Error during llama_pipeline call: {e}")
        reasoning = ["Error: Could not generate reasoning."]
        final_answer = "Error: Could not generate answer."

    formatted_item = {
        "answer": final_answer,
        "question": question,
        "reasoning": reasoning,
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para00"
        },
        "type": "multi_hop"
    }

    transformed_data_R4.append(formatted_item)

transformed_R4_data = transformed_data_R4

Reasoning generated with CodeLlama for the `eashuu/medical_qa` dataset

In [None]:
# Number of retry attempts for loading the dataset
MAX_RETRIES = 3

# Load the dataset with retry logic: only a RequestException triggers a retry,
# and the failure of the last attempt is re-raised.
for attempt in range(MAX_RETRIES):
    try:
        dataset = load_dataset("eashuu/medical_qa", split="train")
        break  # Exit the loop if successful
    except RequestException:
        if attempt < MAX_RETRIES - 1:
            print(f"Download attempt {attempt + 1} failed. Retrying...")
            time.sleep(5)  # Wait for 5 seconds before retrying
        else:
            raise  # Re-raise the exception if all retries fail

# Function to generate prompt
def generate_prompt(question, answer):
    """Build the reasoning prompt for a question/answer pair.

    Args:
        question: Question text inserted after ``Question:``.
        answer: Answer text inserted after ``Answer:``.

    Returns:
        str: The prompt text asking for a numbered step-by-step explanation.
    """
    prompt_text = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    return prompt_text

# Function to extract reasoning and answer
def extract_reasoning(response):
    """Split a text-generation response into numbered steps and a final answer.

    Args:
        response: Pipeline output; ``response[0]["generated_text"]`` is read.

    Returns:
        tuple[list[str], str]: ``(reasoning, answer)``. ``answer`` is the last
        sentence of the generated text; ``reasoning`` holds every preceding
        sentence, stripped and prefixed with ``"Step N: "``.

    Raises:
        ValueError: If the generated text contains no sentence at all.
    """
    generated_text = response[0]["generated_text"]
    sentences = nltk.sent_tokenize(generated_text)
    if not sentences:
        # Fail with a clear message instead of an IndexError on sentences[-1].
        raise ValueError("generated text is empty; no reasoning to extract")
    answer = sentences[-1]  # Last sentence is the answer
    reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]
    return reasoning, answer

# Function to generate reasoning steps
def generate_reasoning_steps(question, answer):
    """Generate reasoning steps and a final answer for one question/answer pair.

    Args:
        question: Question text for the prompt.
        answer: Reference answer text for the prompt.

    Returns:
        tuple[list[str], str]: ``(reasoning, final_answer)`` as produced by
        ``extract_reasoning``. Error placeholders are returned when generation
        or extraction fails (the error is printed).
    """
    prompt = generate_prompt(question, answer)
    try:
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
        # return_full_text=False keeps the prompt (which contains the reference
        # answer) out of the text that is split into reasoning steps.
        response = pipe(
            prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
        )
        reasoning, final_answer = extract_reasoning(response)
    except Exception as e:
        print(f"Error during llama_pipeline call: {e}")
        reasoning = ["Error: Could not generate reasoning."]
        final_answer = "Error: Could not generate answer."
    return reasoning, final_answer

# Build one multi-hop record per instruction/output pair.
transformed_data_R5 = []

for item in dataset:
    # Drop a leading "Q. " marker from the instruction, if present.
    question = item["instruction"].replace("Q. ", "", 1).strip()
    answer = item["output"].strip()

    # Let CodeLlama produce the reasoning steps and the final answer.
    reasoning, final_answer = generate_reasoning_steps(question, answer)

    transformed_data_R5.append({
        "answer": final_answer,
        "question": question,
        "reasoning": reasoning,
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para00"
        },
        "type": "multi_hop"
    })

transformed_R5_data = transformed_data_R5

Reasoning generated with CodeLlama for the `FreedomIntelligence/medical-o1-reasoning-SFT` dataset

In [None]:
# Number of retry attempts
MAX_RETRIES = 3

# Load the dataset; only a RequestException triggers a retry.
for attempt in range(MAX_RETRIES):
    try:
        dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")
        break  # Exit the loop if successful
    except RequestException:
        if attempt < MAX_RETRIES - 1:
            print(f"Download attempt {attempt + 1} failed. Retrying...")
            time.sleep(5)  # Wait for 5 seconds before retrying
        else:
            raise  # Re-raise the exception if all retries fail

transformed_data_R6 = []
reasoning_pipe = None  # built on first use and reused for every item

# Process each item in the dataset
for item in dataset["train"]:
    question = item["Question"].strip()
    answer = item["Response"].strip()

    # Generate reasoning using CodeLlama
    prompt = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    try:
        if reasoning_pipe is None:
            reasoning_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
        # return_full_text=False keeps the prompt out of the text that is split into steps.
        response = reasoning_pipe(
            prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
        )

        generated_text = response[0]["generated_text"]
        sentences = nltk.sent_tokenize(generated_text)
        if not sentences:
            raise ValueError("generated text is empty; no reasoning to extract")
        final_answer = sentences[-1]
        reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]

    except Exception as e:
        print(f"Error during llama_pipeline call: {e}")
        reasoning = ["Error: Could not generate reasoning."]
        final_answer = "Error: Could not generate answer."

    formatted_item = {
        "question": question,
        "reasoning": reasoning,
        "answer": final_answer,
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para00"
        },
        "type": "multi_hop"
    }

    transformed_data_R6.append(formatted_item)

# Print the first 3 formatted entries
#print(transformed_data_R6[:3])

transformed_R6_data = transformed_data_R6

Reasoning generated with CodeLlama for the `wentechno/medicalQA-50thPlus` dataset

In [None]:
# Number of retry attempts for loading the dataset
MAX_RETRIES = 3

# Load the dataset with retry logic: only a RequestException triggers a retry,
# and the failure of the last attempt is re-raised.
for attempt in range(MAX_RETRIES):
    try:
        dataset = load_dataset("wentechno/medicalQA-50thPlus", split="train")
        break  # Exit the loop if successful
    except RequestException:
        if attempt < MAX_RETRIES - 1:
            print(f"Download attempt {attempt + 1} failed. Retrying...")
            time.sleep(5)  # Wait for 5 seconds before retrying
        else:
            raise  # Re-raise the exception if all retries fail

# Function to generate prompt
def generate_prompt(question, answer):
    """Build the reasoning prompt for a question/answer pair.

    Args:
        question: Question text inserted after ``Question:``.
        answer: Answer text inserted after ``Answer:``.

    Returns:
        str: The prompt text asking for a numbered step-by-step explanation.
    """
    prompt_text = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    return prompt_text

# Function to extract reasoning and answer
def extract_reasoning(response):
    """Split a text-generation response into numbered steps and a final answer.

    Args:
        response: Pipeline output; ``response[0]["generated_text"]`` is read.

    Returns:
        tuple[list[str], str]: ``(reasoning, answer)``. ``answer`` is the last
        sentence of the generated text; ``reasoning`` holds every preceding
        sentence, stripped and prefixed with ``"Step N: "``.

    Raises:
        ValueError: If the generated text contains no sentence at all.
    """
    generated_text = response[0]["generated_text"]
    sentences = nltk.sent_tokenize(generated_text)
    if not sentences:
        # Fail with a clear message instead of an IndexError on sentences[-1].
        raise ValueError("generated text is empty; no reasoning to extract")
    answer = sentences[-1]  # Last sentence is the answer
    reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]
    return reasoning, answer

# Function to generate reasoning steps
def generate_reasoning_steps(question, answer):
    """Generate reasoning steps and a final answer for one question/answer pair.

    Args:
        question: Question text for the prompt.
        answer: Reference answer text for the prompt.

    Returns:
        tuple[list[str], str]: ``(reasoning, final_answer)`` as produced by
        ``extract_reasoning``. Error placeholders are returned when generation
        or extraction fails (the error is printed).
    """
    prompt = generate_prompt(question, answer)
    try:
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
        # return_full_text=False keeps the prompt (which contains the reference
        # answer) out of the text that is split into reasoning steps.
        response = pipe(
            prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
        )
        reasoning, final_answer = extract_reasoning(response)
    except Exception as e:
        print(f"Error during llama_pipeline call: {e}")
        reasoning = ["Error: Could not generate reasoning."]
        final_answer = "Error: Could not generate answer."
    return reasoning, final_answer

# Build one multi-hop record per instruction/output pair.
transformed_data_R7 = []

for item in dataset:
    # Drop a leading "Q. " marker from the instruction, if present.
    question = item["instruction"].replace("Q. ", "", 1).strip()
    answer = item["output"].strip()

    # Let CodeLlama produce the reasoning steps and the final answer.
    reasoning, final_answer = generate_reasoning_steps(question, answer)

    transformed_data_R7.append({
        "answer": final_answer,  # final answer extracted from the generation
        "question": question,
        "reasoning": reasoning,  # reasoning steps extracted from the generation
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para00"
        },
        "type": "multi_hop"
    })

transformed_R7_data = transformed_data_R7

In [None]:
# Load the dataset
dataset = load_dataset("Ajayaadhi/Medical-QA")

# Initialize a list to hold the reformatted entries
transformed_data_short_answer2 = []

for entry in dataset["train"]:  # Process all entries
    # NOTE(review): reads a column named "train" from each row — adjust if the key is different.
    text = entry["train"]

    # Extract question: text between "### Input:" and "[INST]"
    question_match = re.search(r"### Input:\n(.+?)\n\[INST\]", text, re.DOTALL)
    question = question_match.group(1).strip() if question_match else ""

    # Extract answer: text between "### Response:" and "</s>"
    answer_match = re.search(r"### Response:\n(.+?)</s>", text, re.DOTALL)
    answer = answer_match.group(1).strip() if answer_match else ""

    # Define source information (using placeholders)
    source = {
        "isbn": "000-0000000000",
        "page": 0,
        "paragraph_id": "000-0000000-p00-para01"
    }

    # Determine response type
    response_type = "short_answer"

    # Construct the reformatted entry
    reformatted_entry = {
        "question": question,
        "answer": answer,
        "source": source,
        "type": response_type
    }

    # Append to the list
    transformed_data_short_answer2.append(reformatted_entry)

# Print the first 4 formatted entries of THIS dataset (the preview used to show
# transformed_short_answer1_data by mistake)
transformed_short_answer2_data = transformed_data_short_answer2
print(json.dumps(transformed_short_answer2_data[:4], indent = 4))

Reasoning generated with CodeLlama for the `KaungHtetCho/MedicalQA` dataset

In [None]:
# Number of retry attempts for loading the dataset
MAX_RETRIES = 3

# Load the dataset with retry logic: only a RequestException triggers a retry,
# and the failure of the last attempt is re-raised.
for attempt in range(MAX_RETRIES):
    try:
        dataset = load_dataset("KaungHtetCho/MedicalQA", split="train")
        break  # Exit the loop if successful
    except RequestException:
        if attempt < MAX_RETRIES - 1:
            print(f"Download attempt {attempt + 1} failed. Retrying...")
            time.sleep(5)  # Wait for 5 seconds before retrying
        else:
            raise  # Re-raise the exception if all retries fail


# Function that generates the reasoning steps
def generate_reasoning_steps(question, answer):
    """Generate reasoning steps and a final answer for one question/answer pair.

    Args:
        question: Question text for the prompt.
        answer: Reference answer text for the prompt.

    Returns:
        tuple[list[str], str]: ``(reasoning, final_answer)``. ``final_answer``
        is the last generated sentence; ``reasoning`` holds the preceding
        sentences prefixed with ``"Step N: "``. Error placeholders are
        returned when generation fails or yields no text (the error is printed).
    """
    prompt = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    try:
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
        # return_full_text=False keeps the prompt (which contains the reference
        # answer) out of the text that is split into reasoning steps.
        response = pipe(
            prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
        )
        # Extract reasoning and final answer
        generated_text = response[0]["generated_text"]
        sentences = nltk.sent_tokenize(generated_text)
        if not sentences:
            raise ValueError("generated text is empty; no reasoning to extract")
        final_answer = sentences[-1]  # Last sentence is the final answer
        reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]

    except Exception as e:
        print(f"Error during llama_pipeline call: {e}")
        reasoning = ["Error: Could not generate reasoning."]
        final_answer = "Error: Could not generate answer."

    return reasoning, final_answer

transformed_data_R8 = []

# Turn every patient/doctor exchange into a multi-hop record.
for entry in dataset:
    # The patient's message is the question, the doctor's reply the reference answer.
    question = entry['Patient'].strip()
    answer = entry['Doctor'].strip()

    # CodeLlama produces the reasoning steps and the final answer.
    reasoning, final_answer = generate_reasoning_steps(question, answer)

    transformed_data_R8.append({
        "question": question,
        "reasoning": reasoning,
        "answer": final_answer,  # answer taken from the generation
        # Source metadata is unknown; placeholders are used.
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000-p00-para01"
        },
        "type": "multi_hop"
    })

transformed_R8_data = transformed_data_R8
# Show the first 3 records
print(json.dumps(transformed_data_R8[:3], indent=4))

In [None]:
# Number of retry attempts for loading the dataset
MAX_RETRIES = 3

# Load the dataset with retry logic: only a RequestException triggers a retry,
# and the failure of the last attempt is re-raised.
for attempt in range(MAX_RETRIES):
    try:
        dataset = load_dataset("qiaojin/PubMedQA", "pqa_unlabeled")
        break
    except RequestException:
        if attempt < MAX_RETRIES - 1:
            print(f"Download attempt {attempt + 1} failed. Retrying...")
            time.sleep(5)
        else:
            raise

# Function that generates the reasoning steps
def generate_reasoning_steps(question, answer):
    """Generate reasoning steps and a final answer for one question/answer pair.

    Args:
        question: Question text for the prompt.
        answer: Reference answer text for the prompt.

    Returns:
        tuple[list[str], str]: ``(reasoning, final_answer)``. ``final_answer``
        is the last generated sentence; ``reasoning`` holds the preceding
        sentences prefixed with ``"Step N: "``. Error placeholders are
        returned when generation fails or yields no text (the error is printed).
    """
    prompt = f"""
    Question: {question}
    Answer: {answer}
    Provide a step-by-step reasoning breakdown explaining how the answer was derived.
    Each step should be clearly numbered and logically connected.
    """
    try:
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
        # return_full_text=False keeps the prompt (which contains the reference
        # answer) out of the text that is split into reasoning steps.
        response = pipe(
            prompt, max_new_tokens=256, do_sample=True, batch_size=4, return_full_text=False
        )
        # Extract reasoning and final answer
        generated_text = response[0]["generated_text"]
        sentences = nltk.sent_tokenize(generated_text)
        if not sentences:
            raise ValueError("generated text is empty; no reasoning to extract")
        final_answer = sentences[-1]  # Last sentence is the final answer
        reasoning = [f"Step {i+1}: {step.strip()}" for i, step in enumerate(sentences[:-1]) if step]

    except Exception as e:
        print(f"Error during llama_pipeline call: {e}")
        reasoning = ["Error: Could not generate reasoning."]
        final_answer = "Error: Could not generate answer."

    return reasoning, final_answer

# Build the R9 records from the PubMedQA training split: each question is
# paired with its long answer, which is handed to the reasoning generator.
transformed_data_R9 = []

for record in dataset["train"]:
    question = record['question'].strip()
    reference_answer = record['long_answer'].strip()

    # generate_reasoning_steps returns (list of step strings, final answer)
    steps, generated_answer = generate_reasoning_steps(question, reference_answer)

    transformed_data_R9.append({
        "question": question,
        "reasoning": steps,
        "answer": generated_answer,  # final answer extracted by the generator
        "source": {  # placeholder provenance values
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000-p00-para01"
        },
        "type": "multi_hop"
    })

# Alias under the name used when the per-source lists are combined
transformed_R9_data = transformed_data_R9

# Preview of the first three formatted entries (disabled)
#print(json.dumps(transformed_data_R9[:3], indent=4))

# Data from Kaggle

In [None]:
!pip install kaggle pandas


In [None]:
!kaggle datasets download -d thedevastator/comprehensive-medical-q-a-dataset --unzip


In [None]:
import os
from getpass import getpass
from google.colab import auth

# Kaggle credentials must not be stored in the notebook. They are taken from
# the environment when already set; otherwise the user is prompted for them.
# NOTE(review): the API key that used to be hardcoded here was published with
# the notebook and should be revoked in the Kaggle account settings.
if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input("Kaggle username: ").strip()
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass("Kaggle API key: ").strip()

# Imported only after the credentials are in place.
# NOTE(review): some kaggle releases appear to authenticate at import time - confirm.
from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticate with the Kaggle API using the environment variables set above.
# NOTE(review): the `!kaggle datasets download` cells need these variables too,
# so this cell presumably has to run before them - confirm the cell order.
api = KaggleApi()
api.authenticate()

In [None]:
# Load the Kaggle CSV (assumed to be already present in the Colab workspace)
file_path = "/content/train.csv"  # Adjust based on actual filename
dataset = pd.read_csv(file_path)

# Initialize a list to hold the reformatted entries
transformed_data_short_answer3 = []

# Iterate over the dataset rows
for index, entry in dataset.iterrows():
    # pandas reads empty CSV cells as NaN (a float), which has no .strip();
    # map missing values to "" and coerce everything else to text first.
    raw_question = entry.get('Question', '')
    raw_answer = entry.get('Answer', '')
    question = "" if pd.isna(raw_question) else str(raw_question).strip()
    answer = "" if pd.isna(raw_answer) else str(raw_answer).strip()

    # Define source information (using placeholders here)
    source = {
        "isbn": "000-0000000000",
        "page": 0,
        "paragraph_id": f"000-0000000-p00-para{index+1:02d}"
    }

    # Every row of this dataset is treated as a short-answer item
    response_type = "short_answer"

    # Construct the reformatted entry
    reformatted_entry = {
        "question": question,
        "answer": answer,
        "source": source,
        "type": response_type
    }

    # Append to the list
    transformed_data_short_answer3.append(reformatted_entry)

# Alias under the name used when the per-source lists are combined
transformed_short_answer3_data = transformed_data_short_answer3

# Preview the first four entries as JSON
print(json.dumps(transformed_short_answer3_data[:4], indent=4))

In [None]:
!kaggle datasets download -d pythonafroz/medquad-medical-question-answer-for-ai-research --unzip


In [None]:
import pandas as pd
import nltk
from transformers import pipeline

# Prerequisites for this cell: the helper functions below read the globals
# `model` and `tokenizer`, and nltk's sentence tokenizer needs its 'punkt'
# data (nltk.download('punkt')) to have been downloaded beforehand.

# Read the MedQuAD CSV into a DataFrame; this rebinds the global `dataset`.
file_path = "/content/medquad.csv"  # Adjust based on the actual filename
dataset = pd.read_csv(file_path)

# Build the reasoning prompt for one question/answer pair
def generate_prompt(question, answer):
    """Return the prompt asking for a numbered, step-by-step explanation.

    Args:
        question: Question text to embed in the prompt.
        answer: Answer text to embed in the prompt.

    Returns:
        str: The prompt; it starts with a newline and every line is indented
        by four spaces.
    """
    prompt_lines = (
        "",
        f"    Question: {question}",
        f"    Answer: {answer}",
        "    Provide a step-by-step reasoning breakdown explaining how the answer was derived.",
        "    Each step should be clearly numbered and logically connected.",
        "    ",
    )
    return "\n".join(prompt_lines)

# Split a pipeline response into reasoning steps and a final answer
def extract_reasoning(response):
    """Turn a text-generation response into ``(reasoning, answer)``.

    Args:
        response: Sequence whose first item is a dict holding the generated
            text under the key ``"generated_text"``.

    Returns:
        tuple: ``(reasoning, answer)`` where ``answer`` is the last sentence of
        the generated text and ``reasoning`` lists the preceding sentences as
        ``"Step N: ..."`` strings.

    Raises:
        IndexError: If the generated text contains no sentences.
    """
    tokenized = nltk.sent_tokenize(response[0]["generated_text"])
    answer = tokenized[-1]  # the last sentence is taken as the answer

    reasoning = []
    for position, sentence in enumerate(tokenized[:-1], start=1):
        if sentence:
            reasoning.append(f"Step {position}: {sentence.strip()}")
    return reasoning, answer

# Function to generate reasoning steps
def generate_reasoning_steps(question, answer):
    """Generate reasoning steps and a final answer for one question/answer pair.

    Args:
        question: Question text passed to ``generate_prompt``.
        answer: Reference answer passed to ``generate_prompt``.

    Returns:
        tuple: ``(reasoning, final_answer)`` as produced by
        ``extract_reasoning``. If generation or extraction raises, both hold
        ``"Error: ..."`` placeholders so that one bad record does not abort the
        whole batch.
    """
    prompt = generate_prompt(question, answer)
    try:
        # Reuse the pipeline built once in the setup cell instead of
        # constructing a new pipeline object for every record.
        response = llama_pipeline(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            # Keep only the newly generated text; with the default the prompt
            # is echoed back and its sentences would be counted as reasoning.
            return_full_text=False,
        )
        reasoning, final_answer = extract_reasoning(response)
    except Exception as e:
        # Deliberate best-effort: log and fall back to placeholders
        print(f"Error during llama_pipeline call: {e}")
        reasoning = ["Error: Could not generate reasoning."]
        final_answer = "Error: Could not generate answer."
    return reasoning, final_answer

# Initialize a list to hold the reformatted entries
transformed_data_R10 = []

# Iterate over the dataset rows
for index, entry in dataset.iterrows():
    # pandas reads empty CSV cells as NaN (a float), which has no .strip();
    # map missing values to "" and coerce everything else to text first.
    raw_question = entry.get('Question', '')
    raw_answer = entry.get('Answer', '')
    question = "" if pd.isna(raw_question) else str(raw_question).strip()
    answer = "" if pd.isna(raw_answer) else str(raw_answer).strip()

    # Generate reasoning steps and a final answer with the language model
    reasoning, final_answer = generate_reasoning_steps(question, answer)

    # Define source information (using placeholders here)
    source = {
        "isbn": "000-0000000000",
        "page": 0,
        "paragraph_id": f"000-0000000-p00-para{index+1:02d}"
    }

    # Construct the reformatted entry
    reformatted_entry = {
        "question": question,
        "reasoning": reasoning,  # Include the reasoning steps
        "answer": final_answer,  # Use the final answer (single sentence)
        "source": source,
        "type": "multi_hop"  # You might need to adjust the type
    }

    # Append to the list
    transformed_data_R10.append(reformatted_entry)

# Alias under the name used when the per-source lists are combined
transformed_R10_data = transformed_data_R10

# Local data

In [None]:
from io import StringIO

# NOTE: this cell was marked "not working" by its author.
csv = "/content/true_false_questions.csv"

# Read the CSV from the Colab workspace. On failure a hint is printed and the
# original exception is re-raised so the cell stops here.
# NOTE(review): exit() is avoided because inside a notebook it appears to ask
# the kernel to shut down rather than simply ending the cell - confirm.
try:
    df = pd.read_csv(csv)  # Read CSV directly from Colab environment
except FileNotFoundError:
    print(f"Error: File not found: {csv}. Please make sure it is uploaded to Colab.")
    raise
except pd.errors.ParserError:
    print(f"Error: Could not parse the CSV file: {csv}. Please check its format.")
    raise

# Transform data
transformed_data_TF1 = []
for _, row in df.iterrows():
    formatted_item = {
        "question": row["text"],  # Extract question
        "answer": str(row["label"]),  # Extract answer as string
        "source": {
            "isbn": "000-0000000000",  # Placeholder value
            "page": 0,  # Placeholder value
            "paragraph_id": "000-0000000000-p00-paraXX"  # Placeholder value
        },
        "type": "true_false"
    }
    transformed_data_TF1.append(formatted_item)

# Alias under the name used when the per-source lists are combined
transformed_TF1_data = transformed_data_TF1

# ENTIRE DATASET

In [None]:
import itertools

# Per-source record lists that are merged into the combined dataset.
# Currently left out: transformed_TF1_data and the generated-reasoning sets
# transformed_R1_data, transformed_R2_data, transformed_R3_data,
# transformed_R4_data, transformed_R5_data, transformed_R6_data,
# transformed_R7_data, transformed_R8_data, transformed_R9_data,
# transformed_R10_data.
lists = [
    transformed_TF2_data,
    transformed_short_answer1_data,
    transformed_short_answer2_data,
    transformed_short_answer3_data,
    transformed_MC1_data,
    transformed_MC2_data,
    transformed_MC3_data,
    transformed_MC4_data,
]

# Flatten the per-source lists into one list, preserving their order
DATA = [record for source_records in lists for record in source_records]


# Removes duplicates, tokenization, stopwords, lemmatization, padding

# Handle Class Imbalance:

#    SMOTE will help generate synthetic samples for underrepresented classes in the dataset.

#    Class Weights can be used in the model to give more importance to underrepresented classes during training.

# Paraphrasing / Question Modification:

#    We will use a GPT-based model to paraphrase or modify questions to generate additional training samples.

# Split by type and save .zip to repository

Notes:

    Ensure the data is preprocessed appropriately for each prompt style before creating the datasets.
    Adjust the batch size, training steps, epochs, and other hyperparameters to find the best performance.
    Regularly evaluate the performance on your test set for each prompt style.
    This example assumes your prompt styles data is readily available.
    Remember to make necessary imports and data modifications for smooth execution.


In [None]:
!pip install transformers accelerate bitsandbytes

In [None]:
# Disabled draft of the preprocessing helpers, kept for reference only.
# The outer quotes are ''' because the text contains """docstrings"""; with
# """ as the outer delimiter the first inner docstring ended the string early
# and the cell failed with a SyntaxError.
'''import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lemmatizes, removes stopwords, and lowercases the input text."""
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words and token.isalnum()]
    return " ".join(tokens)

def remove_duplicates_and_empty(data):
    """Removes duplicates and entries with empty questions or answers."""
    seen_pairs = set()
    filtered_data = []
    for entry in data:
        # Check if both 'question' and 'answer' keys exist
        if "question" in entry and "answer" in entry:
            q, a = entry["question"], entry["answer"]

            if not q or not a:  # Remove if empty
                continue

            pair = (q.strip().lower(), a.strip().lower())  # Normalize case for comparison
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                filtered_data.append(entry)
    return filtered_data'''

In [None]:
# Second disabled draft of the preprocessing helpers, kept for reference only.
# The outer quotes are ''' because the text contains """docstrings"""; with
# """ as the outer delimiter the first inner docstring ended the string early
# and the cell failed with a SyntaxError.
'''import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lemmatizes, removes stopwords, and lowercases the input text."""
    if not isinstance(text, str):
        return ""
    tokens = nltk.word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.lower() not in stop_words and token.isalnum()
    ]
    return " ".join(tokens)

def remove_duplicates_and_empty(data):
    """Removes duplicates and entries with missing or empty questions/answers."""
    seen_pairs = set()
    filtered_data = []

    for entry in data:
        q = entry.get("question", "")
        a = entry.get("answer", "")

        if not q or not a:
            continue

        pair = (q.strip().lower(), a.strip().lower())
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            filtered_data.append(entry)

    return filtered_data

def preprocess_dataset(data):
    """Preprocesses question and answer text across various formats."""
    cleaned_data = remove_duplicates_and_empty(data)

    for entry in cleaned_data:
        entry["question"] = preprocess_text(entry.get("question", ""))
        entry["answer"] = preprocess_text(entry.get("answer", ""))

        # Special handling for multiple-choice options (if present)
        if entry.get("type") == "multiple_choice" and "options" in entry:
            processed_options = {}
            for key, val in entry["options"].items():
                processed_options[key] = preprocess_text(val)
            entry["options"] = processed_options

        # Optional: preprocess reasoning if it exists
        if "reasoning" in entry:
            entry["reasoning"] = preprocess_text(entry["reasoning"])

    return cleaned_data'''


In [None]:
#It's usually better to apply stopword removal and lemmatization before removing duplicates
#preprocessed_data = [{k: preprocess_text(v) if isinstance(v, str) else v
#                       for k, v in d.items()} for d in DATA]
#preprocessed_data = remove_duplicates_and_empty(preprocessed_data)  # Apply to DATA

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource for tokenization; without
# it nltk.word_tokenize raises LookupError there. The request is harmless on
# releases that do not need it.
nltk.download('punkt_tab')
nltk.download('omw-1.4')

# Initialize tools shared by the preprocessing functions below
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize text by lowercasing, dropping stopwords and lemmatizing.

    Args:
        text: Value to normalize. Anything that is not a ``str`` yields ``""``.

    Returns:
        str: The lemmatized, lowercased tokens joined by single spaces. Tokens
        that are stopwords or are not purely alphanumeric are left out.
    """
    if isinstance(text, str):
        kept_tokens = []
        for token in nltk.word_tokenize(text):
            lowered = token.lower()
            if lowered not in stop_words and token.isalnum():
                kept_tokens.append(lemmatizer.lemmatize(lowered))
        return " ".join(kept_tokens)
    return ""

def _as_text(value):
    """Return ``value`` as a string, mapping None and NaN to the empty string."""
    if value is None:
        return ""
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return ""
    return value if isinstance(value, str) else str(value)


def remove_duplicates_and_empty(data):
    """Drop entries without a question or answer and drop duplicate pairs.

    The answer is read from ``"answer"`` and, when that is missing or empty,
    from ``"correct_answer"``. Two entries are duplicates when their question
    and answer match after stripping whitespace and lowercasing; the first
    occurrence is kept. Non-string values (numbers, booleans, ...) are compared
    by their string form instead of raising, so an answer such as ``0`` or
    ``False`` counts as present; ``None`` and NaN count as missing.

    Args:
        data: Iterable of dict-like entries.

    Returns:
        list: The retained entries (the original objects) in input order.
    """
    seen_pairs = set()
    filtered_data = []

    for entry in data:
        question_text = _as_text(entry.get("question"))
        answer_text = _as_text(entry.get("answer")) or _as_text(entry.get("correct_answer"))

        if not question_text or not answer_text:
            continue

        pair = (question_text.strip().lower(), answer_text.strip().lower())
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            filtered_data.append(entry)

    return filtered_data

def preprocess_and_clean_dataset(data):
    """Preprocess the text fields of every entry, then drop empties and duplicates.

    String values stored under ``"question"``, ``"answer"`` or ``"reasoning"``
    are passed through ``preprocess_text``; for a dict stored under
    ``"options"`` each value is passed through it. All other values are copied
    unchanged. The input entries are not modified.

    Args:
        data: Iterable of dict entries.

    Returns:
        list: The result of ``remove_duplicates_and_empty`` applied to the
        preprocessed copies.
    """
    text_fields = ("question", "answer", "reasoning")

    def _clean(field, value):
        # Main free-text fields
        if field in text_fields and isinstance(value, str):
            return preprocess_text(value)
        # Option texts of multiple-choice entries
        if field == "options" and isinstance(value, dict):
            return {label: preprocess_text(option) for label, option in value.items()}
        return value

    cleaned_entries = [
        {field: _clean(field, value) for field, value in record.items()}
        for record in data
    ]
    return remove_duplicates_and_empty(cleaned_entries)


In [None]:
# Run the full cleaning pass over DATA (the combined list of per-source
# records): text fields are preprocessed, then empty and duplicate
# question/answer pairs are removed.
cleaned_data = preprocess_and_clean_dataset(DATA)

# Pretty-print the first four cleaned entries as a quick sanity check
from pprint import pprint
pprint(cleaned_data[:4])


Split into train and test sets

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Bucket the cleaned entries by their question type
grouped_data = {}
for record in cleaned_data:
    grouped_data.setdefault(record['type'], []).append(record)

# 2. Split every bucket 80/20 so each type appears in both sets
train_data, test_data = [], []
for type_records in grouped_data.values():
    # A DataFrame is used only as a convenient input for train_test_split
    type_frame = pd.DataFrame(type_records)
    train_part, test_part = train_test_split(
        type_frame,
        test_size=0.2,  # Adjust test_size as needed
        stratify=type_frame['type'],
        random_state=42,
    )
    train_data += train_part.to_dict('records')
    test_data += test_part.to_dict('records')

# 3. Report the sizes of the combined splits
print(f"Train data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Bucket the cleaned, preprocessed entries by question type
grouped_data = {}
for record in cleaned_data:
    type_name = record.get('type', 'unknown')
    if type_name not in grouped_data:
        grouped_data[type_name] = []
    grouped_data[type_name].append(record)

# 2. Split every bucket 80/20
train_data, test_data = [], []

for type_records in grouped_data.values():
    type_frame = pd.DataFrame(type_records)

    if len(type_frame) >= 2:
        try:
            train_part, test_part = train_test_split(
                type_frame, test_size=0.2, stratify=type_frame['type'], random_state=42
            )
        except ValueError:
            # Stratified splitting was rejected; fall back to a plain random split
            train_part, test_part = train_test_split(type_frame, test_size=0.2, random_state=42)

        train_data.extend(train_part.to_dict('records'))
        test_data.extend(test_part.to_dict('records'))
    else:
        # Too few samples to split; keep them all for training
        train_data.extend(type_frame.to_dict('records'))

# 3. Report the sizes of the combined splits
print(f"Train data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")


Splitting before formatting avoids potential data leakage, where information from the test set might influence the model during training.

In [None]:
def _render_reasoning(reasoning):
    """Render reasoning as text; a list of steps becomes one step per line."""
    if isinstance(reasoning, (list, tuple)):
        return "\n".join(str(step) for step in reasoning)
    return str(reasoning)


def _format_short_answer(d):
    """Build the input/output pair for a short-answer entry."""
    return {
        "input": f"Answer the following question:\n{d['question']}",
        "output": d["answer"]
    }


def _format_multi_hop(d):
    """Build the input/output pair for a multi-hop entry (reasoning, then answer)."""
    return {
        "input": f"Answer the following multi-hop question:\n{d['question']}",
        "output": f"{_render_reasoning(d['reasoning'])}\nAnswer: {d['answer']}"
    }


def _format_true_false(d):
    """Build the input/output pair for a true/false entry."""
    return {
        "input": f"Is the following statement true or false?\nStatement: {d['question']}",
        "output": d["answer"]
    }


def _format_multiple_choice(d):
    """Build the input/output pair for a multiple-choice entry."""
    option_lines = '\n'.join([f"{key}) {value}" for key, value in d['options'].items()])
    return {
        "input": f"Choose the correct option:\nQuestion: {d['question']}\nOptions:\n" + option_lines,
        "output": d["correct_answer"]
    }


def _format_split(records, question_type, formatter, required_keys=()):
    """Format every record of ``question_type`` that has all ``required_keys``."""
    return [
        formatter(d)
        for d in records
        if d['type'] == question_type and all(key in d for key in required_keys)
    ]


# The "Save data to .zip" cell rebinds train_data/test_data to dicts; fail with
# a clear message if this cell is re-run after that instead of an opaque error.
for _split_name, _split in (("train_data", train_data), ("test_data", test_data)):
    if isinstance(_split, dict):
        raise TypeError(
            f"{_split_name} is a dict, not a list of records; "
            "re-run the train/test split cell before formatting."
        )

_MC_REQUIRED_KEYS = ('question', 'options', 'correct_answer')

# Format training data
train_short_answer_data = _format_split(train_data, 'short_answer', _format_short_answer)
train_multi_hop_data = _format_split(train_data, 'multi_hop', _format_multi_hop)
train_true_false_data = _format_split(train_data, 'true_false', _format_true_false)
train_multiple_choice_data = _format_split(
    train_data, 'multiple_choice', _format_multiple_choice, _MC_REQUIRED_KEYS
)

# Format testing data
test_short_answer_data = _format_split(test_data, 'short_answer', _format_short_answer)
test_multi_hop_data = _format_split(test_data, 'multi_hop', _format_multi_hop)
test_true_false_data = _format_split(test_data, 'true_false', _format_true_false)
test_multiple_choice_data = _format_split(
    test_data, 'multiple_choice', _format_multiple_choice, _MC_REQUIRED_KEYS
)



Save data to .zip

In [None]:
# Regroup the formatted examples by question type, one dict per split.
# Note: from here on train_data and test_data are dicts of lists, no longer
# the flat record lists produced by the split cell.
train_data = {
    "short_answer": list(train_short_answer_data),
    "true_false": list(train_true_false_data),
    "multiple_choice": list(train_multiple_choice_data),
    "multi_hop": list(train_multi_hop_data),
}

test_data = {
    "short_answer": list(test_short_answer_data),
    "true_false": list(test_true_false_data),
    "multiple_choice": list(test_multiple_choice_data),
    "multi_hop": list(test_multi_hop_data),
}

In [None]:
import json
import zipfile

# Serialize the test split and store it as the single member of its archive
json_data_1 = json.dumps(test_data, indent=4)
with zipfile.ZipFile("test_dataset.zip", mode="w") as test_archive:
    test_archive.writestr("test_dataset.json", json_data_1)

# Same for the training split
json_data_2 = json.dumps(train_data, indent=4)
with zipfile.ZipFile("train_dataset.zip", mode="w") as train_archive:
    train_archive.writestr("train_dataset.json", json_data_2)

print("test/train_dataset.zip created successfully!")

In [None]:
!pip install github3.py

In [None]:
import json
import zipfile
import github3
import os
from google.colab import userdata

# 1. Get GitHub token from Colab Secrets (stored under the name 'git')
github_token = userdata.get('git')

# 2. Authenticate with GitHub
gh = github3.login(token=github_token)
if gh is None:
    # github3.login returns None when it is given no usable credentials
    raise RuntimeError("GitHub login failed: the 'git' secret is empty or missing.")

# 3. Repository Information
repo_owner = 'Adria100'  # Replace with your username
repo_name = 'clin_IQ'  # Replace with your repository name
repo = gh.repository(repo_owner, repo_name)
if repo is None:
    raise RuntimeError(f"Repository {repo_owner}/{repo_name} could not be loaded.")

# 4. Function to create zip and upload to GitHub
def save_data_to_zip_and_upload(data_dict, zip_file_name, branch='main', remote_dir='data/processed'):
    """Zip one JSON file per data type and commit the archive to the repository.

    Args:
        data_dict: Mapping of data type name to a JSON-serializable list; each
            entry becomes the archive member ``"<type>_data.json"``.
        zip_file_name: Name of the local archive and of the uploaded file.
        branch: Branch to commit to.
        remote_dir: Directory inside the repository that receives the archive.

    The archive is committed through the module-level ``repo``. If the file
    already exists on the branch it is updated instead of created, so the cell
    can be run more than once. The local archive is deleted after a successful
    upload.
    """
    with zipfile.ZipFile(zip_file_name, "w") as zipf:
        for data_type, data_list in data_dict.items():
            zipf.writestr(f"{data_type}_data.json", json.dumps(data_list, indent=4))

    with open(zip_file_name, "rb") as f:
        content = f.read()

    remote_path = f"{remote_dir}/{zip_file_name}"
    try:
        existing = repo.file_contents(remote_path, ref=branch)
    except github3.exceptions.NotFoundError:
        existing = None

    if existing is None:
        repo.create_file(
            path=remote_path,  # Path in the repository
            message=f"Adding {zip_file_name}",  # Commit message
            content=content,
            branch=branch
        )
    else:
        # Creating a file that already exists is rejected; update it in place
        existing.update(
            message=f"Updating {zip_file_name}",
            content=content,
            branch=branch
        )

    print(f"Uploaded {zip_file_name} to GitHub")
    os.remove(zip_file_name)  # Remove local zip file

# 5. train_data and test_data must already be the per-type dictionaries
#    built in the "Save data to .zip" section.

# 6. Save and upload the zip files
save_data_to_zip_and_upload(train_data, "train_dataset.zip")
save_data_to_zip_and_upload(test_data, "test_dataset.zip")

In [None]:
import json
import zipfile
import github3
import os
from google.colab import userdata

# 1. Get GitHub token from Colab Secrets (stored under the name 'git')
github_token = userdata.get('git')

# 2. Authenticate with GitHub
gh = github3.login(token=github_token)
if gh is None:
    # github3.login returns None when it is given no usable credentials
    raise RuntimeError("GitHub login failed: the 'git' secret is empty or missing.")

# 3. Repository Information
repo_owner = 'Adria100'  # Replace with your username
repo_name = 'clin_IQ'  # Replace with your repository name

# Make sure the repository is reachable before anything is uploaded. On
# failure a message is printed and the error is raised so the cell stops;
# exit() is avoided because it does not cleanly end a notebook cell.
try:
    repo = gh.repository(repo_owner, repo_name)
except github3.exceptions.NotFoundError as e:
    print(f"Error: {e}")
    raise
if repo is None:
    # NOTE(review): NotFoundError appears to expect an HTTP response object,
    # so a plain RuntimeError is raised here instead - confirm.
    print("Error: Repository not found")
    raise RuntimeError(f"Repository {repo_owner}/{repo_name} not found")

# 4. Function to create zip and upload to GitHub
def save_data_to_zip_and_upload(data_dict, zip_file_name, branch='main', remote_dir='data/processed'):
    """Zip one JSON file per data type and commit the archive to the repository.

    Args:
        data_dict: Mapping of data type name to a JSON-serializable list; each
            entry becomes the archive member ``"<type>_data.json"``.
        zip_file_name: Name of the local archive and of the uploaded file.
        branch: Branch to commit to.
        remote_dir: Directory inside the repository that receives the archive.

    The archive is committed through the module-level ``repo``. If the file
    already exists on the branch it is updated instead of created, so the cell
    can be run more than once. The local archive is deleted after a successful
    upload.
    """
    with zipfile.ZipFile(zip_file_name, "w") as zipf:
        for data_type, data_list in data_dict.items():
            zipf.writestr(f"{data_type}_data.json", json.dumps(data_list, indent=4))

    with open(zip_file_name, "rb") as f:
        content = f.read()

    remote_path = f"{remote_dir}/{zip_file_name}"
    try:
        existing = repo.file_contents(remote_path, ref=branch)
    except github3.exceptions.NotFoundError:
        existing = None

    if existing is None:
        repo.create_file(
            path=remote_path,  # Path in the repository
            message=f"Adding {zip_file_name}",  # Commit message
            content=content,
            branch=branch
        )
    else:
        # Creating a file that already exists is rejected; update it in place
        existing.update(
            message=f"Updating {zip_file_name}",
            content=content,
            branch=branch
        )

    print(f"Uploaded {zip_file_name} to GitHub")
    os.remove(zip_file_name)  # Remove local zip file

# 5. train_data and test_data must already be the per-type dictionaries
#    built in the "Save data to .zip" section.

# 6. Save and upload the zip files
save_data_to_zip_and_upload(train_data, "train_dataset.zip")
save_data_to_zip_and_upload(test_data, "test_dataset.zip")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Checkpoint to fine-tune.
# NOTE(review): this cell rebinds the globals `tokenizer` and `model`, which
# the setup cell at the top of the notebook bound to the CodeLlama objects;
# code that reads those globals after this cell runs gets the new ones.
model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The string literal below is a disabled tokenization example (`your_data` is
# a placeholder); it is evaluated as a bare expression and has no effect.
"""inputs = tokenizer(
    your_data,
    padding="max_length",  # Pad to the maximum length
    truncation=True,        # Truncate if exceeding the maximum length
    max_length=512,        # Adjust the maximum length as needed
    return_tensors="pt"     # Return PyTorch tensors
)"""
# Load the model with 8-bit loading requested and automatic device placement
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             load_in_8bit=True,
                                             device_map='auto')

In [None]:
class MultiPromptTrainer(Trainer):
    """Trainer subclass that iterates over several prompt-style datasets.

    ``prompt_styles_data`` is a mapping of style name to that style's data; it
    is stored at construction and walked through in ``training_step``.
    """

    def __init__(self, *args, prompt_styles_data, **kwargs):
        """Forward all Trainer arguments and keep the per-style data.

        Args:
            *args: Positional arguments passed on to ``Trainer``.
            prompt_styles_data: Keyword-only mapping of style name to data.
            **kwargs: Keyword arguments passed on to ``Trainer``.
        """
        super().__init__(*args, **kwargs)
        self.prompt_styles_data = prompt_styles_data  # Store data for each style

    def training_step(self, model, inputs):
        """Run a full pass over every style's dataloader and return the last loss.

        Args:
            model: Model called on each batch.
            inputs: Not used by this implementation.

        Returns:
            dict: ``{'loss': <float>}`` holding the loss of the last batch processed.
        """
        # NOTE(review): the signature and the dict return value should be
        # checked against what the installed Trainer passes to and expects
        # from training_step - confirm.
        # Iterate over each prompt style
        for style, data in self.prompt_styles_data.items():
            # Create a dataloader for the current style
            # NOTE(review): verify that Trainer.get_train_dataloader accepts a
            # dataset argument in the installed transformers version.
            train_dataloader = self.get_train_dataloader(data)

            # Perform an optimizer update per batch for the current style
            for step, batch in enumerate(train_dataloader):
              # assumes each batch exposes .to() and unpacks as keyword
              # arguments for the model - TODO confirm
              batch = batch.to(self.args.device)
              outputs = model(**batch)
              loss = outputs.loss
              loss.backward()
              self.optimizer.step()
              self.optimizer.zero_grad()

        # `loss` is only bound if at least one batch was processed above
        return {'loss': loss.item()}  # Return the loss

In [None]:
# Hyperparameters and output locations for the fine-tuning run
training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    per_device_train_batch_size=4,  # Batch size per device
    gradient_accumulation_steps=4,  # Gradient accumulation steps
    num_train_epochs=3,              # Number of training epochs
    fp16=True,                       # Enable mixed precision training
    logging_dir='./logs',            # Directory for storing logs
    learning_rate=2e-5,             # Learning rate
    weight_decay=0.01,              # Weight decay
    optim="adamw_torch",             # Optimizer selection
    save_strategy="epoch"            # Checkpoint saving strategy
)

# The per-style training lists are the formatted input/output dicts built in
# the formatting cell; they are passed through the custom keyword argument.
# NOTE(review): train_dataset is None and the per-style lists hold raw text,
# not tokenized tensors - confirm that trainer.train() can run in this setup.
trainer = MultiPromptTrainer(
    model=model,
    args=training_args,
    train_dataset=None,   # Not used in this example
    prompt_styles_data={
        "short_answer": train_short_answer_data,
        "multi_hop": train_multi_hop_data,
        "true_false": train_true_false_data,
        "multiple_choice": train_multiple_choice_data
    }
)

# Start fine-tuning
trainer.train()

In [None]:
# Write the trained model and its tokenizer to the same directory so that
# both can be loaded again from "./fine_tuned_llama".
trainer.save_model("./fine_tuned_llama")
tokenizer.save_pretrained("./fine_tuned_llama")