In [1]:
import pandas as pd

# Load the chunk/question table from the interim data folder and preview its first rows.
df = pd.read_json("../data/interim/250304_chunks_with_questions.json")
df.head()

Unnamed: 0,text,is_useful,question
0,The work is made available under the Creative ...,True,What is the status of data availability in res...
1,RESEARCH ARTICLE\n\n# Likelihood of Null Effec...,True,What trend has been observed in the outcomes o...
2,### Methods\n\nWe identified all large NHLBI s...,True,What trend has been observed in the reporting ...
3,### Results\n\n17 of 30 studies (57%) publishe...,True,How has the trend of reporting positive result...
4,"Prospective declaration of outcomes in RCTs, a...",True,What practices in randomized clinical trials m...


In [2]:
from datasets import load_dataset
# Request the "train" split directly instead of indexing the returned DatasetDict.
ds = load_dataset("lmsys/lmsys-chat-1m", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def _is_single_exchange(example):
    """Return True for conversations with exactly two turns that open with a user turn."""
    turns = example["conversation"]
    return len(turns) == 2 and turns[0]["role"] == "user"


def _first_message(example):
    """Expose the content of the opening turn under the ``sample_message`` key."""
    return {"sample_message": example["conversation"][0]["content"]}


# Keep only the matching conversations, attach their opening message, and
# switch the dataset to pandas output formatting.
messages = (
    ds.filter(_is_single_exchange)
    .map(_first_message)
    .with_format("pandas")
)

Check what fraction of the real user inputs start with a lowercase letter, so that we can lowercase the same fraction of the synthetic questions.

In [4]:
# Opening user messages extracted from the filtered conversations.
general_inputs = messages["sample_message"]

# Fraction of messages whose first character is lowercase. Slicing with [:1]
# (rather than indexing [0]) yields "" for an empty message, which counts as
# "not lowercase" instead of raising IndexError.
frac_lower = general_inputs.apply(lambda text: text[:1].islower()).mean()
print(frac_lower)

0.21904035115918885


In [5]:
# Lower-case the same share of synthetic questions as was measured on the real inputs.
# The fixed random_state makes the selection reproducible and keeps this in-place
# update idempotent: re-running the cell picks the same rows again instead of
# lower-casing an additional random subset each time.
sampled_indices = df.sample(frac=frac_lower, random_state=42).index
df.loc[sampled_indices, "question"] = df.loc[sampled_indices, "question"].str.lower()

In [6]:
# ditch the anonymized questions (messages containing the literal "NAME_" marker)
is_anonymized = general_inputs.str.contains("NAME_", regex=False)
n_available = int((~is_anonymized).sum())

# Draw up to 9 general messages per synthetic question, without replacement.
# The fixed random_state makes the negative sample reproducible across runs.
sampled_general_questions = general_inputs.loc[~is_anonymized].sample(
    n=min(9 * len(df), n_available), replace=False, random_state=42
)

In [7]:
from datasets import Dataset

# Synthetic questions form the positive class (label 1); the sampled general
# chat messages form the negative class (label 0).
pos_examples = [{"text": question, "label": 1} for question in df["question"]]
neg_examples = [
    {"text": message, "label": 0} for message in sampled_general_questions
]

# Shuffled 80/20 train/test split with a fixed seed.
ds_questions = Dataset.from_list(pos_examples + neg_examples).train_test_split(
    test_size=0.2, shuffle=True, seed=42
)

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer published with the distilbert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [9]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Returns whatever the module-level ``tokenizer`` produces for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [10]:
# Apply preprocess_function to the train and test splits in batched mode.
ds_questions = ds_questions.map(preprocess_function, batched=True)

Map: 100%|██████████| 7648/7648 [00:00<00:00, 22364.93 examples/s]
Map: 100%|██████████| 1912/1912 [00:00<00:00, 25239.85 examples/s]


In [11]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls its .compute() method.
accuracy = evaluate.load("accuracy")

In [12]:
import numpy as np


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predictions are
            reduced to class ids by taking the argmax along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [13]:
# Class id -> readable label, plus the inverse mapping derived from it so the
# two dictionaries cannot get out of sync.
id2label = {0: "GENERAL", 1: "OPEN_SCIENCE"}
label2id = {name: class_id for class_id, name in id2label.items()}

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model on the DistilBERT checkpoint. The number of
# output classes is read from the label mapping (two entries) rather than
# being repeated as a literal.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import DataCollatorWithPadding

# Padding collator built from the tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Evaluate and checkpoint once per epoch so that the best checkpoint can be
# restored when training finishes (load_best_model_at_end).
training_args = TrainingArguments(
    output_dir="question_classifier",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=ds_questions["train"],
    eval_dataset=ds_questions["test"],
    compute_metrics=compute_metrics,
)

# NOTE(review): the output saved with this cell is a Hub "403 Forbidden" on
# repo creation, although no push_to_hub option is set here -- presumably left
# over from an earlier version of the cell; re-run to confirm.
trainer.train()

HfHubHTTPError: (Request ID: Root=1-6800c8ed-1d5c5ae62686934a28cd1a6c;6ad2b57d-8308-48af-9baf-35a206778f68)

403 Forbidden: You don't have the rights to create a model under the namespace "michielree".
Cannot access content at: https://huggingface.co/api/repos/create.
Make sure your token has the correct permissions.

In [1]:
import json


# Read the docstore JSON of the temporary index into a plain Python object.
docstore_path = "../../data/interim/temp_index/vs_241218_bge-small-en-v1.5/docstore.json"
with open(docstore_path, encoding="utf-8") as f:
    chunked_paragraphs_dict = json.load(f)

In [3]:
from pathlib import Path
import pandas as pd

# Load the question/text pairs from the interim data folder.
df_os_questions = pd.read_csv("../../data/interim/questions_and_text.csv")

In [4]:
# Display the loaded question/text table.
df_os_questions

Unnamed: 0,question,text
0,Why is reproducible research considered a key ...,arXiv:1802.03311v1 [cs.DL] 9 Feb 2018Termino...
1,What is the significance of reproducible resea...,"If B, then they are commonly divided in two ca..."
2,What is a key method used in open science to e...,The goal was to merge a publication with its u...
3,What are the key components necessary for ensu...,Terminologies for Reproducible Research\ntradi...
4,What factors are essential for ensuring reprod...,(2009 ). This appears\nto be the ﬁrst article ...
...,...,...
1277,What factors contribute to the sustainability ...,This is both a good and a \nworrisome thing. W...
1278,What are some contributions to the understandi...,"_____ and _____. ""Toward a New Economics of ..."
1279,What are the key issues addressed in the disco...,_____. “On ‘Trading Trade Secrets’ and the Eco...
1280,What concept focuses on enhancing scientific c...,_____. “Koyaanisqatsi in Cyberspace: The econo...
