The following code cell enables IPython's `autoreload` extension so that imported modules (such as the `model` module used below) are automatically reloaded whenever you save changes to their `.py` files.  

In [13]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd    
from model import initialize_model, call_llm

In [3]:

def preprocess(file_path, id_column="unique_id"):
    """Load a JSON Lines file and keep only the first row for each identifier.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON Lines file; forwarded to ``pandas.read_json``
        with ``lines=True``.
    id_column : str, default "unique_id"
        Name of the column whose values identify a record. Rows repeating a
        value that appeared earlier in the file are removed.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without the repeated rows. Surviving rows keep their
        original index labels (the index is not reset).
    """
    data = pd.read_json(path_or_buf=file_path, lines=True)
    if data.empty:
        # Nothing to de-duplicate; also avoids a KeyError on a column-less frame.
        return data
    # drop_duplicates compares values directly, so it does not rely on the
    # index being a default RangeIndex (positional indices were previously
    # handed to the label-based DataFrame.drop) and avoids a Python-level loop.
    return data.drop_duplicates(subset=id_column, keep="first")

In [4]:
# Read the training file through preprocess() and keep the result as `data`.
train_path = "./data/train.jsonl"
data = preprocess(train_path)

In [5]:
# Show the distinct values found in the "label" column.
print(set(data["label"]))

{'method', 'background', 'result'}


In [6]:
# Plain-text preview of the first five rows.
print(data.head(5))

     source  citeEnd                                        sectionName  \
0  explicit    175.0                                       Introduction   
1  explicit     36.0  Novel Quantitative Trait Loci for Seminal Root...   
2  explicit    228.0                                       Introduction   
3  explicit    110.0                                         Discussion   
4  explicit    239.0                                         Discussion   

   citeStart                                             string       label  \
0      168.0  However, how frataxin interacts with the Fe-S ...  background   
1       16.0  In the study by Hickey et al. (2012), spikes w...  background   
2      225.0  The drug also reduces catecholamine secretion,...  background   
3       46.0  By clustering with lowly aggressive close kin ...  background   
4      234.0  Ophthalmic symptoms are rare manifestations of...  background   

   label_confidence                             citingPaperId  \
0        

In [6]:
# Print the `string` field of the first row.
first_record = data.iloc[0]
print(first_record["string"])

However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).


In [7]:
# Number of rows left after preprocessing.
print(data.shape[0])

8194


In [8]:
# Number of leading rows sent to the model in this trial run.
SAMPLE_SIZE = 50

labels = []
reasonings = []
raw_output = []
ids = []
# The empty-string arguments here and in call_llm are passed through unchanged;
# their meaning is defined in the `model` module (not visible in this notebook).
client, _ = initialize_model("")
# Clamp to the frame length so a dataset shorter than SAMPLE_SIZE does not
# raise IndexError from iloc.
for i in range(min(SAMPLE_SIZE, len(data))):
    current_data = data.iloc[i]
    classification, reasoning = call_llm(client, "", current_data.string, current_data.sectionName)
    # Record the id only after the call returned, so all four lists always
    # have the same length even if call_llm raises part-way through.
    ids.append(current_data.unique_id)
    # NOTE(review): raw_output receives the same value as labels; confirm
    # whether the unparsed model response was meant to be stored here.
    raw_output.append(classification)
    labels.append(classification)
    reasonings.append(reasoning)

ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='{\n"classification": "background",\n"reasoning": "The text is discussing the current state of knowledge in the field, referencing previous studies and highlighting the gap in understanding. This is typical of an introduction section, which sets the stage for the research by providing context and summarizing the current understanding of the topic."\n}', tool_calls=None), logprobs=None)], created=1743231178, id='', model='meta-llama/Meta-Llama-3-8B-Instruct', system_fingerprint='3.2.1-native', usage=ChatCompletionOutputUsage(completion_tokens=65, prompt_tokens=195, total_tokens=260), object='chat.completion')
ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='{"classification": "method", "reasoning": "This text appears to describe a spec

In [9]:
# Dump the predicted labels, then the accompanying explanations.
for collected in (labels, reasonings):
    print(collected)

['background', 'method', 'background', 'background', 'background', 'method', 'background', 'method', 'background', 'background', 'background', 'background', 'result', 'method', 'method', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'method', 'method', 'background']
['The text is discussing the current state of knowledge in the field, referencing previous studies and highlighting the gap in understanding. This is typical of an introduction section, which sets the stage for the research by providing context and summarizing the current understanding of the topic.', 'This text appears to describe a specific me

In [10]:
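# Disabled; presumably left over from an earlier output format. Do not re-enable as-is:
# raw_output holds the classification strings themselves, so y[1:] would slice characters.
# reasonings = [" ".join([x.strip() for x in y[1:]]) for y in raw_output]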

In [11]:
# Show the collected labels again (unchanged since the classification loop).
print(labels)

['background', 'method', 'background', 'background', 'background', 'method', 'background', 'method', 'background', 'background', 'background', 'background', 'result', 'method', 'method', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'method', 'method', 'background']


In [10]:
# Pair each collected id with its reasoning text and write the table to CSV.
id_reasoning_pairs = list(zip(ids, reasonings))
df = pd.DataFrame(id_reasoning_pairs, columns=["id", "reasoning"])
df.to_csv("first_50.csv")