In [1]:
import pandas as pd
# Use a pipeline as a high-level helper
from datasets import Dataset
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import os


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load variables from a local .env file into the process environment so that
# later cells can read them through os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# Hugging Face access token read from the environment (None when the variable is unset).
hf_token = os.getenv('HUGGINGFACE_API_KEY')
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# Repository name without the organisation prefix; reused when naming output files.
model_name = model_id.split("/")[-1]

# The tokenizer receives the same token as the model so that both downloads
# authenticate identically. `trust_remote_code` is deliberately not enabled:
# the model load below does not need it for this repository either, and the
# flag allows repository-supplied Python code to run locally.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",          # let the library decide weight placement
    torch_dtype=torch.float16,  # load the weights in half precision
    token=hf_token
)

In [None]:
# Single-turn chat used as a smoke test for the pipeline built below.
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Wrap the already-loaded `model` and `tokenizer` objects in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # NOTE(review): `model` was already loaded with device_map="auto" in the
    # previous cell, so this argument is probably redundant — confirm.
    device_map="auto",
    # Upper bound on the number of tokens generated per call.
    max_new_tokens=2048,
    # NOTE(review): sampling parameters; no random seed is set in the visible
    # cells, so generations may differ between runs — confirm that is intended.
    temperature=0.7,
    top_p=0.9,
)
# Last expression of the cell, so the notebook displays the generated chat.
pipe(messages)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': 'I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."'}]}]

In [72]:
# System prompt sent with every classification request. The adjacent string
# literals are concatenated as-is, so no whitespace separates the sentences.
_SYSTEM_PROMPT = (
    "You are an AI assistant who will assess whether a given text is used as a background, result or method section in a scientific paper."
    "You will be given a section name and the text, and you will need to classify the text as one of [background, result, method]."
    "You will also need to provide a reasoning for your classification."
    "The output should strictly be a json with the following two keys: classification, reasoning."
    # Disabled example line kept for reference:
    #  "Example output would look like: {\"classification\": \"result\", \"reasoning\": \"This is the reasoning\"}"
)

# Returned whenever the reply is empty or cannot be turned into a usable answer.
_INVALID_RESULT = ("Invalid", "No reasoning provided")


def _parse_json_reply(reply):
    """Recover the first JSON object embedded in ``reply``.

    Returns a ``(json_text, parsed)`` pair. Raises ``ValueError`` when the
    reply contains no ``{`` or when no object can be decoded, even after
    appending a closing brace.
    """
    start = reply.find("{")
    if start < 0:
        raise ValueError("no JSON object found in model reply")
    candidate = reply[start:]
    try:
        # raw_decode stops at the end of the first complete object, so text
        # after the JSON (even text containing braces) does not break parsing.
        parsed, length = json.JSONDecoder().raw_decode(candidate)
        return candidate[:length], parsed
    except ValueError:
        # The model sometimes stops before emitting the final brace; retry
        # with one appended. A failure here propagates to the caller.
        repaired = candidate + "\n}"
        return repaired, json.loads(repaired)


def call_pipe(pipe, section_name, content, verbose=True):
    """Ask the chat pipeline to classify ``content`` and parse its JSON answer.

    Args:
        pipe: Callable that takes a list of chat messages and returns a list
            whose first element is a dict with a ``"generated_text"`` list of
            messages (the prompt messages followed by the reply).
        section_name: Name of the paper section the text was taken from.
        content: The text to classify.
        verbose: When True (the default), print the raw reply, the extracted
            JSON text, and any parsing error.

    Returns:
        A ``(classification, reasoning)`` tuple taken from the reply's JSON
        object, or ``("Invalid", "No reasoning provided")`` when the reply is
        empty, is not parseable, or lacks either key.
    """
    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": f"section name: {section_name}, text: {content}"},
    ]
    response = pipe(messages)[0]
    try:
        # The reply is the entry that follows the prompt messages.
        reply = response["generated_text"][len(messages)]["content"]
        if not reply:
            return _INVALID_RESULT
        if verbose:
            print("===" * 20)
            print(reply)
            print("===" * 20)
        json_str, answers = _parse_json_reply(reply)
        if verbose:
            print(json_str)
        return answers["classification"], answers["reasoning"]
    except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
        # Best effort: a malformed reply must not abort a batch run.
        if verbose:
            print(e)
        return _INVALID_RESULT

In [74]:
# Try the classifier on one hand-picked sentence before running it over the dataset.
sample_section = "Introduction"
sample_text = "However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15])."
model_classification, model_reasoning = call_pipe(pipe, sample_section, sample_text)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{
    "classification": "method",
    "reasoning": "The text is describing an experiment or observation that was conducted to study the interaction between frataxin and the Fe-S cluster biosynthesis components. It mentions specific components (IscS, IscU/Isu1, or ISD11/Isd11) and provides context about how they interact, which is typical of a method section in a scientific paper."
{
    "classification": "method",
    "reasoning": "The text is describing an experiment or observation that was conducted to study the interaction between frataxin and the Fe-S cluster biosynthesis components. It mentions specific components (IscS, IscU/Isu1, or ISD11/Isd11) and provides context about how they interact, which is typical of a method section in a scientific paper."
}


In [75]:
# Show the parsed label and its justification for the sample call, one per line.
print(model_classification, model_reasoning, sep="\n")

method
The text is describing an experiment or observation that was conducted to study the interaction between frataxin and the Fe-S cluster biosynthesis components. It mentions specific components (IscS, IscU/Isu1, or ISD11/Isd11) and provides context about how they interact, which is typical of a method section in a scientific paper.


In [76]:
def preprocess(file_path):
    """Load a JSON-lines file and keep only the first row for each ``unique_id``.

    Args:
        file_path: Path (or buffer) handed to ``pandas.read_json`` with
            ``lines=True``.

    Returns:
        A DataFrame containing the first occurrence of every ``unique_id``,
        in file order, with the original row index labels preserved.

    Raises:
        KeyError: If the loaded data is non-empty and has no ``unique_id`` column.
    """
    data = pd.read_json(path_or_buf=file_path, lines=True)
    if data.empty:
        # Nothing to deduplicate; an empty frame may not even have the column.
        return data
    # Vectorised duplicate detection replaces the per-row `.iloc` loop and
    # avoids mixing positional indices with the label-based `drop`.
    is_repeat = data["unique_id"].duplicated(keep="first")
    return data.loc[~is_repeat]

In [77]:
# Load the training split and drop rows whose unique_id was already seen.
data = preprocess("./data/train.jsonl")

In [78]:
# Only the first 5 rows are used here; the commented-out remainder suggests
# the earlier bound was 1365 — TODO confirm before a full run.
first_partition = data[:5]  # previously: data[:1365]

In [79]:
# Parallel result lists, one entry per row of `first_partition`.
ids, labels, reasonings, raw_output = [], [], [], []

for row_position in range(len(first_partition)):
    record = first_partition.iloc[row_position]
    ids.append(record.unique_id)
    label, rationale = call_pipe(pipe, record.sectionName, record.string)
    # `raw_output` receives the same parsed label that goes into `labels`.
    raw_output.append(label)
    labels.append(label)
    reasonings.append(rationale)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{
  "classification": "result",
  "reasoning": "The given text states that direct interactions between frataxin and each component of the Fe-S cluster biosynthesis were reported, implying that the interaction is the result of the study's findings."
}
{
  "classification": "result",
  "reasoning": "The given text states that direct interactions between frataxin and each component of the Fe-S cluster biosynthesis were reported, implying that the interaction is the result of the study's findings."
}


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{
  "classification": "method",
  "reasoning": "This text is describing the experimental procedure or method used to collect data. The words'spikes', 'field', 'physiological','maturity', 'dried', 'grain threshed','stored', 'dormancy', 'germination', and 'testing' indicate that the purpose of the section is to explain how the data was collected, rather than to describe the results or discuss the methodology in a general sense."
{
  "classification": "method",
  "reasoning": "This text is describing the experimental procedure or method used to collect data. The words'spikes', 'field', 'physiological','maturity', 'dried', 'grain threshed','stored', 'dormancy', 'germination', and 'testing' indicate that the purpose of the section is to explain how the data was collected, rather than to describe the results or discuss the methodology in a general sense."
}


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{
  "classification": "result",
  "reasoning": "The given text describes the effect of the drug on heart rate and blood pressure, which is a direct outcome of its mechanism of action. The text also mentions a comparison with midazolam, which is another drug, indicating that the text is describing a result or outcome of a comparison. The text does not describe a method or process, but rather describes a phenomenon or effect, which is typical of results."
{
  "classification": "result",
  "reasoning": "The given text describes the effect of the drug on heart rate and blood pressure, which is a direct outcome of its mechanism of action. The text also mentions a comparison with midazolam, which is another drug, indicating that the text is describing a result or outcome of a comparison. The text does not describe a method or process, but rather describes a phenomenon or effect, which is typical of results."
}


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{
  "classification": "method",
  "reasoning": "This text is describing a method (clustering) used in a scientific study, which is a common practice in the field of biology and ecology. The text explains the purpose and outcome of the method, indicating its use in studying the relationship between territorial boundaries and net energy income in female animals."
}
{
  "classification": "method",
  "reasoning": "This text is describing a method (clustering) used in a scientific study, which is a common practice in the field of biology and ecology. The text explains the purpose and outcome of the method, indicating its use in studying the relationship between territorial boundaries and net energy income in female animals."
}
{
  "classification": "result",
  "reasoning": "This section describes the manifestations or outcomes of the intracranial arachnoid cyst, specifically ophthalmic symptoms. It reports on the types of symptoms that can occur, rather than explaining the cause or mechanis

In [80]:
# Assemble one row per classified example and write the table to a CSV file
# named after the model.
result_rows = list(zip(ids, labels, reasonings))
df = pd.DataFrame(result_rows, columns=["id", "model_classification", "reasoning"])
output_path = f"{model_name}_first_partition.csv"
df.to_csv(output_path)