---
## Model  
---
### Content
---

- **[Libraries to use](#Libraries_to_use)**

- **[Loading the dataset](#Loading_dataset)**

- **[Load tokenizer and model](#tokenizer_model)**

- **[Load tokenizer and model](#tokenizer_model)**

- **[Tokenize the datasets](#tokenize)**

- **[Creating a dictionary with train, test, validation datasets](#train_test_validation)**

- **[Training model](#Training_model)**

 ---

<a id="Libraries_to_use"> </a>

---
### Libraries to use 
---

In [1]:
import pandas as pd
import os
os.environ['WANDB_DISABLED'] = 'true'
#--------------------------------------------------------
import datasets
from datasets import Dataset
import torch
torch.cuda.is_available()
#--------------------------------------------------------
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
#--------------------------------------------------------
from endogpt.Preprocessor import preprocess_real
from endogpt.Preprocessor import preprocess_synthetic
from endogpt.Classifier import train_test_validation 

<a id="Loading_dataset"> </a>

---
### Loading the dataset 
---

In [2]:
string = pd.read_csv('data/real.csv')
real = preprocess_real(string)
real

Unnamed: 0,General Practitioner,Endoscopist,Instrument,Extent of Exam,Indications,findings
0,Dr. Taylor,Dr. el-Hasen,FG2,D1,Ongoing reflux symptoms.,Columnar lined oesophagus is present. The segm...
1,Dr. Cheek,Dr. el-Hasen,FG4,Oesophagus,Endoscopic ultrasound findings,There is an ulcer in the stomach which is supe...
2,Dr. al-Zamani,Dr. Hall,FG7,D1,Nausea and/or Vomiting Haematemesis or Melaen...,LA Grade D oesophagitis. The oesopahgitis is ...
3,Dr. el-Hussein,Dr. Lee,FG7,D2,IDA,There is a polyp in the body which is sessile ...
4,Dr. Hendricks,Dr. Lee,FG3,Pylorus,Dysphagia/Odynophagia .,There is a stricture in the fundus which is Oe...
...,...,...,...,...,...,...
49995,Dr. Salvador-Rojas,Dr. Nguyen,FG1,Failed intubation,CD,There is a polyp in the oesophagus at 22 cm wh...
49996,Dr. el-Haq,Dr. Burns,FG7,Oesophagus,Other - chronic cough ?GORD,There is a nodule in the oesophagus at 39 cm w...
49997,Dr. Hassan,Dr. Nguyen,FG2,Stomach body,Other- liver abscesses,Normal gastroscopy to the duodenum.
49998,Dr. Murray,Dr. el-Hasen,FG2,Pylorus,Weight Loss Nausea and/or Vomiting Other- Ear...,Normal gastroscopy to the duodenum.


In [3]:
real["text"] = real['General Practitioner'] + real['Endoscopist'] + real['Instrument'] + 'INDICATIONS FOR PROCEDURE:' + real['Indications'] + 'Extent of Exam:'+ real['Extent of Exam'] +'FINDINGS: '+ real['findings']
df = real[["text"]]

In [4]:
df.text.iloc[0]

' Dr. Taylor Dr. el-Hasen  FG2  INDICATIONS FOR PROCEDURE: Ongoing reflux symptoms. Extent of Exam:  D1  FINDINGS: Columnar lined oesophagus is present. The segment looks flat. Some areas of vascular abnormalities are seen. No abnormal pit pattern is seen. NA'

In [5]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['text'],
    num_rows: 50000
})

<a id="tokenizer_model"> </a>

---
### Load tokenizer and model 
---

In [6]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("tombrooks248/EndoGPT")#.to('cuda')

<a id="tokenize"> </a>

---
### Tokenize the datasets 
---

In [7]:
def tokenize(batch):
    return tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)

<a id="train_test_validation"> </a>

---
### Creating a dictionary with train, test, validation datasets 
---

In [8]:
ds = ds.map(tokenize, num_proc=4, batched=True)
ds = ds.remove_columns(["text"])
tts_ds = ds.train_test_split(test_size=0.3)
tts_ds
block_size = 64

Map (num_proc=4):   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
lm_datasets = tts_ds.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/35000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/15000 [00:00<?, ? examples/s]

In [11]:
tokenizer.decode(lm_datasets["train"][17]["input_ids"])

'. No nodularity is present. Short segment only. The segment looks flat. NA <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

<a id="Training_model"> </a>

---
### Training model 
---

In [12]:
training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    output_dir="models",
    report_to=None
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],

)

In [None]:
trainer.train()

### Complete model training function

In [12]:
def Model(string):
    df = pd.read_csv('data/real.csv')
    real = preprocess_real(df)
    real["text"] = real['General Practitioner'] + real['Endoscopist'] + real['Instrument'] + 'INDICATIONS FOR PROCEDURE:' + real['Indications'] + 'Extent of Exam:'+ real['Extent of Exam'] +'FINDINGS: '+ real['findings']
    ds = Dataset.from_pandas(df)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt", use_fast=True)
    model = AutoModelForCausalLM.from_pretrained("tombrooks248/EndoGPT")#.to('cuda')
    def tokenize(batch):
        return tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)
    ds = ds.map(tokenize, num_proc=4, batched=True)
    ds = ds.remove_columns(["text"])
    tts_ds = ds.train_test_split(test_size=0.3)
    tts_ds
    block_size = 64
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
    lm_datasets = tts_ds.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
    )
    tokenizer.decode(lm_datasets["train"][17]["input_ids"])

    training_args = TrainingArguments(
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        output_dir="models",
        report_to=None
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["test"],

    )
    trainer.train()
    return trainer