In [1]:
import pandas as pd    
from model import initialize_model, call_llm


In [2]:

def preprocess(file_path):
    """Load a JSON Lines file and drop rows that repeat an earlier ``unique_id``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.jsonl`` file; forwarded to ``pandas.read_json``
        with ``lines=True``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with only the first occurrence of each ``unique_id``
        kept. Surviving rows keep their original index labels (the index is
        not reset).

    Raises
    ------
    KeyError
        If the file has rows but no ``unique_id`` column.
    """
    data = pd.read_json(path_or_buf=file_path, lines=True)
    if data.empty:
        # Nothing to de-duplicate; also avoids a KeyError on a column-less frame.
        return data
    # Vectorised replacement for a row-by-row ``iloc`` scan: keeps the first row
    # for every id and works on index labels rather than positions.
    # Note: rows whose ``unique_id`` is missing are treated as equal to each other.
    return data.drop_duplicates(subset="unique_id", keep="first")

In [None]:
# Load the training split through preprocess() and keep the result in `data`.
# The relative path is resolved against the kernel's current working directory.
data = preprocess("./data/train.jsonl")

In [5]:
# Number of rows in `data`; as the cell's last expression, the value is shown as the cell output.
len(data)

8194

In [8]:
# Write `data` back out as JSON Lines (one record per line) to ./data/train_cleaned.jsonl.
data.to_json("./data/train_cleaned.jsonl", orient="records", lines=True)

In [None]:
# Show the distinct values present in the "label" column.
print(set(data["label"]))

In [None]:
# Print the number of rows in `data`.
print(len(data))

In [5]:
# Print the first rows of `data` (DataFrame.head() defaults to 5 rows) as plain text.
print(data.head())

     source  citeEnd                                        sectionName  \
0  explicit    175.0                                       Introduction   
1  explicit     36.0  Novel Quantitative Trait Loci for Seminal Root...   
2  explicit    228.0                                       Introduction   
3  explicit    110.0                                         Discussion   
4  explicit    239.0                                         Discussion   

   citeStart                                             string       label  \
0      168.0  However, how frataxin interacts with the Fe-S ...  background   
1       16.0  In the study by Hickey et al. (2012), spikes w...  background   
2      225.0  The drug also reduces catecholamine secretion,...  background   
3       46.0  By clustering with lowly aggressive close kin ...  background   
4      234.0  Ophthalmic symptoms are rare manifestations of...  background   

   label_confidence                             citingPaperId  \
0        

In [11]:
# Print the `string` field of the first row of `data`.
print(data.iloc[0].string)

However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).


In [12]:
# Print the number of rows in `data`.
# NOTE(review): the recorded output here (8243) differs from the 8194 recorded for
# `len(data)` earlier in the notebook — presumably the cells were run against
# different states of `data`; re-run top to bottom to confirm.
print(len(data))

8243


In [7]:
# Send the first rows of `data` to the LLM and collect what it returns.
labels = []      # response[0] of every call
reasonings = []  # response[1] of every call
raw_output = []  # complete responses, kept so they can be re-parsed later
client, _ = initialize_model("")
# Cap the loop at len(data): a frame with fewer than 50 rows would otherwise
# raise IndexError from `data.iloc[i]`.
for i in range(min(50, len(data))):
    current_data = data.iloc[i]
    response = call_llm(client, "", current_data.string, current_data.sectionName)
    raw_output.append(response)
    labels.append(response[0])
    reasonings.append(response[1])

NameError: name 'data' is not defined

In [19]:
# Rebuild every reasoning from all elements after the first one of each raw
# response: strip each piece, then join the pieces with single spaces.
reasonings = [
    " ".join(piece.strip() for piece in llm_response[1:])
    for llm_response in raw_output
]

In [20]:
# Print the full `reasonings` list.
print(reasonings)

['This is the introduction section, providing an overview of the current knowledge gap in the field, and setting the stage for the rest of the text.', ' This study appears to be providing background information on the context and methods used to collect the data, including the specific procedures used to sample, dry, and store the grains, as well as the purpose of the study.', 'This is the introduction section that describes the benefits and effects of the drug, specifically its impact on heart rate and blood pressure, making it a result.', 'This text provides the conclusion or outcome of the research, specifically discussing the potential benefits of breeding females clustering with lowly aggressive close kin.', 'This is a transitional sentence that serves to introduce the discussion, summarizing the key points discussed in the previous section, which is a common approach in academic writing.', 'This is the reasoning  The classification is "result" because the text describes the outco

In [14]:
# Print the full `reasonings` list.
# NOTE(review): same statement as the preceding cell, but the recorded outputs
# differ — presumably `reasonings` was rebuilt between the two runs; confirm
# which version is the intended one.
print(reasonings)

['This is the introduction section, providing an overview of the current knowledge gap in the field, and setting the stage for the rest of the text.', '', 'This is the introduction section that describes the benefits and effects of the drug, specifically its impact on heart rate and blood pressure, making it a result.', 'This text provides the conclusion or outcome of the research, specifically discussing the potential benefits of breeding females clustering with lowly aggressive close kin.', 'This is a transitional sentence that serves to introduce the discussion, summarizing the key points discussed in the previous section, which is a common approach in academic writing.', 'This is the reasoning', 'This section appears to be the discussion section of a research paper, specifically focusing on the importance of early diagnosis and immediate treatment, citing references [17, 18]. The language used is formal and academic, indicating a methodological or interpretive section, rather than 

In [8]:
# Print the full `labels` list.
# NOTE(review): the recorded output holds more than 50 entries although the loop
# that fills `labels` is written for 50 iterations — presumably left over from a
# run with a different range; confirm before relying on it.
print(labels)

['result', 'background', 'result', 'result', 'background', 'result', 'method', 'result', 'method', 'result', 'result', 'result', 'background', 'background', 'result', 'background', 'result', 'method', 'method', 'result', 'result', 'background', 'result', 'result', 'result', 'result', 'method', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'method', 'result', 'method', 'result', 'result', 'result', 'result', 'result', 'background', 'method', 'method', 'result', 'result', 'result', 'result', 'background', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'result', 'method', 'method', 'result', 'result', 'result', 'result', 'background', 'result', 'result', 'result', 'result', 'result', 'background', 'result', 'background', 'background', 'result', 'result', 'result', 'method', 'result', 'result', 'result', 'method', 'result', 'method', 'result', 'result', '

In [21]:
# Pair each label with its reasoning (zip stops at the shorter list) and save
# the table as CSV; the default to_csv settings also write the row index.
# NOTE(review): the first column is headed "id" but is filled from `labels` —
# confirm this header is intended.
df = pd.DataFrame(
    zip(labels, reasonings),
    columns=["id", "reasoning"],
)
df.to_csv("first_50.csv")