In [1]:
!pip install transformers
!pip install datasets
!pip install accelerator
!pip install farm-haystack[colab,preprocessing,elasticsearch,inference,metrics]

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerator
  Downloading accelerator-2023.11.3.dev1-cp310-cp310-manylinux_2_17_

In [1]:
from datasets import load_dataset

# Fetch the ChatDoctor HealthCareMagic dataset; the repr below shows its splits and columns.
dataset = load_dataset(path="lavita/ChatDoctor-HealthCareMagic-100k")
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 112165
    })
})

In [2]:
test_size = 0.2
seed = 42  # fixed so the train/test partition is identical on every run

# Hold out `test_size` of the rows of the original "train" split.
# The result replaces `dataset` and exposes a "train" and a "test" split.
dataset = dataset["train"].train_test_split(test_size=test_size, seed=seed)
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 89732
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 22433
    })
})

In [3]:
# Materialise both splits as pandas DataFrames for the column manipulation below.
train_df, test_df = (dataset[split].to_pandas() for split in ("train", "test"))

In [4]:
import pandas as pd
import numpy as np

def generate_squad_columns(df, title):
    """Derive the columns needed for a SQuAD-style export from a raw chat frame.

    The input frame is left untouched; a new frame is returned, so calling
    this function repeatedly on the same input gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``instruction``, ``input`` and ``output``.
    title : str
        Value stored in the ``title`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``id`` (the former index), ``question`` (from
        ``input``), ``answer`` (from ``output``), ``review_id``, ``title``,
        ``is_impossible``, ``context`` and ``answer_start``.

    Raises
    ------
    KeyError
        If ``df`` has no ``instruction`` column.
    """
    # reset_index/rename/drop all return new objects, so `df` is never mutated.
    out = (
        df.reset_index()
        .rename(columns={"index": "id", "input": "question", "output": "answer"})
        .drop(columns=["instruction"])
    )

    out["review_id"] = out["id"].map(lambda x: f"Doc_Case_{x}")
    out["title"] = title

    # A question is impossible exactly when there is no answer text for it.
    out["is_impossible"] = out["answer"].isna() | out["answer"].eq("")

    # The context is "<question> <answer>", so the answer begins one character
    # (the joining space) after the end of the question.
    out["context"] = out["question"] + ' ' + out["answer"]
    out["answer_start"] = out["question"].str.len() + 1
    return out


In [5]:
# Add the SQuAD helper columns to both splits, using the same title for each.
train_df, test_df = (
    generate_squad_columns(frame, "DoctorChat") for frame in (train_df, test_df)
)


In [7]:
# Inspect the context of the row labelled 0 in the test split.
test_df.loc[0, "context"]

'I treated myself for a yeast infection and it didnt work, I have excess white stuff and I do not smell normal, almost like milk about to go bad. My boyfriend smells like fish sometimes I can not even stand it. I think we have something. What could it be? I welcome you at my virtual clinic! Thanks for consulting me at Chat Doctor. I have thoroughly worked through your case, and I can easily realize your health worries. Being your physician, I assure you not to worry as things will settle down very soon. Causes of your condition may be'

In [9]:
# Inspect the question of the row labelled 0 in the test split.
test_df.loc[0, "question"]

'I treated myself for a yeast infection and it didnt work, I have excess white stuff and I do not smell normal, almost like milk about to go bad. My boyfriend smells like fish sometimes I can not even stand it. I think we have something. What could it be?'

In [8]:
# Inspect the answer of the row labelled 0 in the test split.
test_df.loc[0, "answer"]

'I welcome you at my virtual clinic! Thanks for consulting me at Chat Doctor. I have thoroughly worked through your case, and I can easily realize your health worries. Being your physician, I assure you not to worry as things will settle down very soon. Causes of your condition may be'

In [10]:
def create_paragraphs(df):
    """Build SQuAD-style ``paragraphs`` entries from a flat question/answer frame.

    The frame is walked exactly once, so the cost grows linearly with the
    number of rows instead of re-filtering the whole frame for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``review_id``, ``context``, ``id``,
        ``question``, ``answer`` and ``answer_start``.

    Returns
    -------
    list of dict
        One ``{"qas": [...], "context": ...}`` dict per distinct
        ``review_id``, in order of first appearance. Each ``qas`` item holds
        ``question``, ``id``, ``is_impossible`` and ``answers``; every answer
        is ``{"text": ..., "answer_start": ...}``. Rows whose answer is
        missing or an empty string contribute no answer, and a question with
        no answers at all is flagged ``is_impossible=True``.
    """
    contexts = {}        # review_id -> context (a later row overwrites an earlier one)
    questions = {}       # review_id -> {question id -> question text}
    answers_by_qid = {}  # question id -> usable answers gathered from every row

    columns = ("review_id", "context", "id", "question", "answer", "answer_start")
    # Iterating the Series (rather than their .values) yields plain Python
    # scalars, which keeps the result serialisable with json.dump.
    for review_id, context, qid, question, text, start in zip(*(df[c] for c in columns)):
        contexts[review_id] = context
        questions.setdefault(review_id, {})[qid] = question

        collected = answers_by_qid.setdefault(qid, [])
        # Skip NaN/None and "" answers: they are not real answer spans.
        if isinstance(text, str) and text:
            collected.append({"text": text, "answer_start": start})

    paragraphs = []
    for review_id, context in contexts.items():
        qas = []
        for qid, question in questions[review_id].items():
            answers = list(answers_by_qid[qid])  # copy: never share one list between entries
            qas.append({"question": question,
                        "id": qid,
                        "is_impossible": not answers,
                        "answers": answers})

        # Add context and question-answer pairs to paragraphs
        paragraphs.append({"qas": qas, "context": context})
    return paragraphs

In [11]:
import json

def convert_to_squad(dfs):
    """Export every split in ``dfs`` as a SQuAD-style JSON file.

    Parameters
    ----------
    dfs : dict
        Maps a split name to the DataFrame for that split. Each frame is
        grouped by its ``title`` column and every group is passed to
        ``create_paragraphs``.

    Returns
    -------
    None
        For each split a file named ``medical_chat-<split>.json`` is written
        (relative path) containing ``{"data": [{"title": ..., "paragraphs": ...}, ...]}``.
    """
    for split_name, frame in dfs.items():
        # One entry per title, carrying the paragraphs built from that title's rows.
        per_title = frame.groupby("title").apply(create_paragraphs)
        records = (
            per_title.to_frame(name="paragraphs")
            .reset_index()
            .to_dict(orient="records")
        )
        payload = {"data": records}

        # Persist the split next to the notebook.
        with open(f"medical_chat-{split_name}.json", "w+", encoding="utf-8") as out_file:
            json.dump(payload, out_file)


In [13]:
# Split name -> frame; the split name becomes part of the exported file name.
dfs = dict(train=train_df, test=test_df)

In [14]:
# Write medical_chat-train.json and medical_chat-test.json (one file per key of `dfs`).
convert_to_squad(dfs)

In [1]:
import torch
# Ask PyTorch to drop cached GPU memory before the reader model is created.
# NOTE(review): the execution counter restarts here, so this appears to run in a fresh kernel — confirm it is still needed.
torch.cuda.empty_cache()

In [2]:
from haystack.nodes import FARMReader

In [3]:
from haystack.nodes import FARMReader

# Sequence window passed to the reader, plus a stride value kept for later experiments.
max_seq_length = 512
doc_stride = 256

# NOTE: doc_stride is defined above but not passed to FARMReader, so the
# reader uses whatever its own default stride is.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
    return_no_answer=True,
    max_seq_len=max_seq_length,
)

In [None]:
data_dir = "."

# Fine-tune on the exported training file; the exported test split is used as
# the dev set, and the resulting model is saved to the "model" directory.
reader.train(
    data_dir=data_dir,
    train_filename="medical_chat-train.json",
    dev_filename="medical_chat-test.json",
    n_epochs=10,
    batch_size=8,
    use_gpu=True,
    save_dir="model",
)

Preprocessing dataset:   0%|          | 0/176 [00:00<?, ? Dicts/s]Token indices sequence length is longer than the specified maximum sequence length for this model (620 > 512). Running this sequence through the model will result in indexing errors
Preprocessing dataset: 100%|██████████| 176/176 [02:31<00:00,  1.16 Dicts/s]
ERROR:haystack.modeling.data_handler.processor:Unable to convert 6 samples to features. Their ids are : 470-0-0, 486-0-0, 355-0-0, 444-0-0, 452-0-0, 465-0-0
Preprocessing dataset: 100%|██████████| 44/44 [00:37<00:00,  1.17 Dicts/s]
ERROR:haystack.modeling.data_handler.processor:Unable to convert 9 samples to features. Their ids are : 408-0-0, 470-0-0, 385-0-0, 486-0-0, 355-0-0, 444-0-0, 236-0-0, 452-0-0, 465-0-0
Train epoch 0/9 (Cur. train loss: 4.2165):   3%|▎         | 300/11518 [04:03<2:30:48,  1.24it/s]
Evaluating:   0%|          | 0/2885 [00:00<?, ?it/s][A
Evaluating:   1%|          | 35/2885 [00:10<13:54,  3.42it/s][A
Evaluating:   2%|▏         | 72/2885 [00:

In [None]:
# Remove the name `reader` from the namespace before a new reader is created.
del reader

In [None]:
# Load a reader from the local "model" directory (the save_dir used by reader.train).
reader = FARMReader(model_name_or_path="model")

In [None]:
# Sanity check: ask the reader the first test question against its own context.
reader.predict_on_texts(
    question=test_df.loc[0, "question"],
    texts=[test_df.loc[0, "context"]],
)