In [None]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]
!pip install pandas
!pip install numpy==1.20.3

In [None]:
# Directory containing the training and testing CSV files (relative path).
data_dir = "../data/"

In [None]:
import pandas as pd

# Read the training CSV, discard rows with missing values, and keep at most
# `num_examples` of the remaining rows.
num_examples = 7000
data_file = f"{data_dir}setup8_training.csv"
df = pd.read_csv(data_file).dropna().iloc[:num_examples]

In [None]:
# Display ten randomly chosen rows as a quick sanity check of the loaded data.
df.sample(10)

In [None]:
# Prepend the task instruction to every source document.
# The cell is idempotent: rows that already start with the instruction are left
# untouched, so re-running the cell does not stack the prefix twice.
prefix = "Summarize the following text as a discharge summary report: "
_already_prefixed = df["source"].str.startswith(prefix, na=False)
df["source"] = df["source"].where(_already_prefixed, prefix + df["source"])

In [None]:
# Display the first rows to confirm the instruction prefix was applied to "source".
df.head()

In [None]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from rich.table import Column, Table
from rich import box
from rich.console import Console

# Shared rich console used for logging throughout the notebook.
# record=True keeps what is printed so it can be exported later with console.save_text().
console=Console(record=True)

def display_df(df):
    """Print the first two columns of `df` as an ASCII table titled "Sample Data"."""
    headers = ("source_text", "target_text")
    columns = [Column(header, justify="center") for header in headers]
    sample_table = Table(*columns, title="Sample Data", pad_edge=False, box=box.ASCII)
    for record in df.values.tolist():
        sample_table.add_row(record[0], record[1])
    # A fresh, non-recording console is used here, so this table is not captured
    # by the module-level recording console.
    Console().print(sample_table)
    
    
    
# Table that accumulates the training progress; train() appends one
# (epoch, step, loss) row to it every tenth step and prints the whole table.
training_logger = Table(Column("Epoch", justify="center" ),
                        Column("Steps", justify="center"),
                        Column("Loss", justify="center"),
                        title="Training Status",pad_edge=False, box=box.ASCII)

In [None]:
# Setting up the device for accelerator usage.
def _pick_device():
    """Return the best available torch device: Apple MPS, then CUDA, then CPU."""
    # torch.backends.mps does not exist on older torch builds, hence the getattr guard.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


device = _pick_device()

In [None]:
class YourDataSetClass(Dataset):
    """
    Dataset that tokenizes (source, target) text pairs taken from a DataFrame.

    Each item is tokenized on access, padded/truncated to a fixed length, and
    returned as a dict of ``torch.long`` tensors suitable for a DataLoader.

    Args:
        dataframe: frame containing the two text columns.
        tokenizer: tokenizer exposing ``batch_encode_plus`` and returning a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        source_len: fixed token length of encoded source texts.
        target_len: fixed token length of encoded target texts.
        source_text: name of the column holding the source texts.
        target_text: name of the column holding the target texts.
    """
    def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Number of (source, target) pairs."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string, padded/truncated to exactly `max_length` tokens."""
        # padding="max_length" alone is sufficient; the deprecated
        # pad_to_max_length flag was redundant and is no longer passed.
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """
        Return the encoded pair at position `index`.

        Returns:
            dict with keys 'source_ids', 'source_mask', 'target_ids' and
            'target_ids_y' ('target_ids_y' duplicates 'target_ids').
        """
        # Positional lookup: a DataLoader hands out positions 0..len-1, which only
        # coincide with index labels when the frame's index has been reset.
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # Collapse every run of whitespace (including newlines) into a single space.
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.summ_len)

        # Drop only the leading batch dimension of size 1 added by the tokenizer.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
            }

In [None]:
import pickle
from datetime import datetime
import csv
from tqdm import tqdm

# Default value of the `device` parameter of generate_summaries.
DEFAULT_DEVICE = "mps"
def create_csv(all_sentences, targets, file_to_write):
    """
    Write paired texts to a two-column CSV file with a ``source,target`` header.

    Args:
        all_sentences: sequence whose items fill the "source" column.
        targets: sequence whose items fill the "target" column; its length
            determines how many rows are written.
        file_to_write: path of the CSV file to create (overwritten if it exists).

    Raises:
        IndexError: for list-like inputs, when `all_sentences` has fewer items
            than `targets`.
    """
    fieldnames = ["source", "target"]
    # Build the rows first so a length mismatch fails before the output file is touched.
    rows = [{"source": all_sentences[i], "target": targets[i]} for i in range(len(targets))]
    # newline='' hands line-ending control to the csv module (otherwise row terminators
    # and newlines embedded in fields are mangled on Windows); UTF-8 keeps the output
    # independent of the machine's locale.
    with open(file_to_write, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.DictWriter(csvfile, delimiter=',', fieldnames=fieldnames)
        csvwriter.writeheader()
        csvwriter.writerows(rows)


def chunks(lst, n):
    """Lazily produce consecutive slices of `lst`, each holding at most `n` items."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def generate_summaries(lns, metric, model, tokenizer, result_file_name, batch_size=1, device=DEFAULT_DEVICE):
    """
    Generate a summary for every row of `lns`, score the summaries, and save them to CSV.

    Args:
        lns: DataFrame with "source" and "target" text columns.
        metric: object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: sequence-to-sequence model exposing ``generate()``.
        tokenizer: tokenizer used to encode the sources and decode the outputs.
        result_file_name: path of the CSV file written through create_csv.
        batch_size: number of documents generated per forward pass.
        device: device for the input tensors; only used as a fallback when the
            model does not expose a ``device`` attribute.

    Returns:
        The value returned by ``metric.compute()``.
    """
    prefix = "Summarize the following text as a discharge summary report: "
    article_batches = [
        [prefix + text for text in batch]
        for batch in chunks(lns['source'].values, batch_size)
    ]
    target_batches = list(chunks(lns['target'].values, batch_size))

    # Inputs must live on the same device as the model's weights, so prefer the
    # model's own device over the (possibly stale) default argument.
    input_device = getattr(model, "device", device)
    # Make sure dropout is disabled while generating.
    model.eval()

    ls_prediction = []
    ls_groundtruth = []
    for article_batch, target_batch in tqdm(zip(article_batches, target_batches),
                                            total=len(article_batches)):
        dct = tokenizer(article_batch,
                        max_length=512,
                        truncation=True,
                        padding='max_length',
                        return_tensors="pt")
        summaries = model.generate(input_ids=dct["input_ids"].to(input_device),
                                   attention_mask=dct["attention_mask"].to(input_device),
                                   num_beams=10, min_length=120, max_length=512,
                                   repetition_penalty=2.5, length_penalty=1.0)
        dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
        ls_prediction.extend(dec)
        ls_groundtruth.extend(target_batch)

    # Predictions and references are normalized by the same tokenizer before scoring.
    ls_prediction_tokenized = coreNLP_tokenizer(ls_prediction)
    target_batch_tokenized = coreNLP_tokenizer(ls_groundtruth)

    metric.add_batch(predictions=ls_prediction_tokenized, references=target_batch_tokenized)
    score = metric.compute()

    # NOTE(review): create_csv labels its first argument "source" and its second
    # "target", so in the result file the "source" column holds the reference
    # summaries and the "target" column holds the generated ones.
    create_csv(ls_groundtruth, ls_prediction, result_file_name)
    return score

In [None]:
# Install the packages used for ROUGE scoring.
!pip install rouge_score
!pip install datasets

from datasets import list_datasets, list_metrics, load_dataset, load_metric
# NOTE(review): list_metrics / load_metric appear to be deprecated in recent
# `datasets` releases - confirm the installed version still provides them.
# Print the names of all metrics known to `datasets`, for reference.
metrics_list = list_metrics()
len(metrics_list)  # has no visible effect: only a cell's last expression is displayed
print (metrics_list)
# ROUGE metric object; test() hands it to generate_summaries for scoring.
rouge_metric = load_metric('rouge')

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """
    Run one training epoch over `loader`, updating `model` in place.

    Args:
        epoch: epoch number, used only for logging.
        tokenizer: tokenizer whose ``pad_token_id`` marks padding in the targets.
        model: sequence-to-sequence model accepting ``input_ids``,
            ``attention_mask`` and ``labels`` and returning the loss first.
        device: device the batch tensors are moved to.
        loader: iterable of batches with 'source_ids', 'source_mask' and
            'target_ids' tensors.
        optimizer: optimizer stepped once per batch.
    """
    model.train()
    for step, data in enumerate(loader, 0):
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        y = data['target_ids'].to(device, dtype = torch.long)

        # Padding positions are set to -100 so the loss ignores them.
        lm_labels = y.clone()
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        # Only `labels` is passed: Hugging Face seq2seq models such as T5 build the
        # decoder inputs themselves by shifting the labels right and prepending the
        # decoder start token. Shifting by hand (y[:, :-1] / y[:, 1:]) dropped that
        # start token, so the first target token was never trained and training did
        # not match how generate() starts decoding.
        outputs = model(input_ids = ids, attention_mask = mask, labels=lm_labels)
        loss = outputs[0]

        if step % 10 == 0:
            # Log the scalar value rather than the tensor's repr.
            training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """
    Generate text for every batch of `loader` and decode it next to the references.

    Returns a pair of lists: the decoded generations and the decoded target texts.
    """
    def _to_text(token_ids):
        # Decode one id sequence, dropping special tokens.
        return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    generated_texts, reference_texts = [], []
    model.eval()
    with torch.no_grad():
        for batch_no, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention = batch['source_mask'].to(device, dtype=torch.long)

            generation_settings = dict(
                max_length=512,
                num_beams=10,
                repetition_penalty=2.5,
                length_penalty=2.0,
                early_stopping=True,
            )
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention, **generation_settings)

            batch_preds = [_to_text(sequence) for sequence in generated_ids]
            batch_refs = [_to_text(sequence) for sequence in reference_ids]

            # Progress message every tenth batch.
            if batch_no % 10 == 0:
                console.print(f'Completed {batch_no}')

            generated_texts.extend(batch_preds)
            reference_texts.extend(batch_refs)
    return generated_texts, reference_texts

In [None]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

import stanza

# Download the Stanford CoreNLP package with Stanza's installation command.
# This will take several minutes, depending on the network speed.
# The target directory is relative to the notebook's working directory.
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
# (presumably read by the CoreNLP client to locate the install - confirm in the stanza docs).
import os
os.environ["CORENLP_HOME"] = corenlp_dir

In [None]:
# Import stanza
from stanza.server import CoreNLPClient


def coreNLP_tokenizer(inputDocsList):
    """
    Tokenize and lower-case each document with a CoreNLP server.

    Args:
        inputDocsList: iterable of document strings.

    Returns:
        A list with one string per input document: the lower-cased tokens of
        all sentences in the document, joined by single spaces. A document
        with no sentences yields an empty string.
    """
    tokenizedDocsList = []
    # NOTE(review): only the tokens are used below; the pos/lemma/ner/depparse
    # annotators look unnecessary for that and could probably be dropped to
    # speed this up - confirm before changing.
    with CoreNLPClient(annotators="tokenize ssplit pos lemma ner depparse".split(), memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:
        for d in inputDocsList:
            ann = client.annotate(d)
            # Collect the tokens of every sentence. Using only ann.sentence[0]
            # discarded everything after the first sentence of a multi-sentence
            # summary and raised IndexError for documents without any sentence.
            tokens = [token.word.lower() for sentence in ann.sentence for token in sentence.token]
            tokenizedDocsList.append(' '.join(tokens))
    return tokenizedDocsList


def test(model, tokenizer, test_file_path, result_file_name, production=False):
    """
    Score `model` on the test CSV and write its generated summaries to disk.

    Args:
        model: sequence-to-sequence model passed on to generate_summaries.
        tokenizer: tokenizer passed on to generate_summaries.
        test_file_path: CSV file with "source" and "target" columns.
        result_file_name: path of the CSV of generated summaries to write.
        production: currently unused; kept for interface compatibility.

    Returns:
        The score returned by generate_summaries (computed with `rouge_metric`).
    """
    df = (
        pd.read_csv(test_file_path)
        # dropna() returns a new frame: its result has to be kept, otherwise the
        # missing values survive and are turned into the literal text "nan" below.
        .dropna()
        .astype({'source': str, 'target': str})
        # Evaluate on at most the first 1000 complete rows.
        .iloc[:1000, :]
    )
    score = generate_summaries(df, rouge_metric, model, tokenizer, result_file_name=result_file_name)
    return score

In [None]:
def T5Trainer(dataframe, source_text, target_text, test_file_path, result_file_name, model_params, output_dir="./models/" ):

    """
    Fine-tune a seq2seq model on `dataframe`, validate it, then score it on the test file.

    Steps, in order:
      1. Seed torch and numpy from model_params["SEED"].
      2. Load the tokenizer and model named by model_params["MODEL"] and move the
         model to the module-level `device`.
      3. Split `dataframe` 80/20 into training and validation sets.
      4. Train for model_params["TRAIN_EPOCHS"] epochs with Adam.
      5. Save model and tokenizer under <output_dir>/model_files.
      6. Generate on the validation set and write <output_dir>/predictions.csv.
      7. Export the recorded console output to <output_dir>/logs.txt.
      8. Run test() on `test_file_path`.

    Args:
        dataframe: frame holding the training data.
        source_text: name of the column with the input texts.
        target_text: name of the column with the reference summaries.
        test_file_path: CSV file passed to test().
        result_file_name: path passed to test() for the generated summaries.
        model_params: dict with the keys "SEED", "MODEL", "MAX_SOURCE_TEXT_LENGTH",
            "MAX_TARGET_TEXT_LENGTH", "TRAIN_BATCH_SIZE", "VALID_BATCH_SIZE",
            "LEARNING_RATE", "TRAIN_EPOCHS" and "VAL_EPOCHS".
        output_dir: directory receiving the model files, predictions and logs.

    Returns:
        Tuple (model, score) where `score` is the value returned by test().
    """

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"]) # pytorch random seed
    np.random.seed(model_params["SEED"]) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])

    # Load the pretrained seq2seq model named in model_params["MODEL"]
    # and move it to the module-level `device`.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # logging
    console.log(f"[Data]: Reading data...\n")

    # Keep only the two text columns and show a small sample
    dataframe = dataframe[[source_text,target_text]]
    display_df(dataframe.head(2))


    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest for validation.
    train_size = 0.8
    train_dataset=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
    # Validation rows are those not sampled for training; both indexes are reset to 0..n-1
    val_dataset=dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"VAL Dataset: {val_dataset.shape}\n")


    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
    val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)


    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': model_params["TRAIN_BATCH_SIZE"],
        'shuffle': True,
        'num_workers': 0
        }
    val_params = {
        'batch_size': model_params["VALID_BATCH_SIZE"],
        'shuffle': False,
        'num_workers': 0
        }


    # Creation of Dataloaders for training and validation. These are used below in the training and validation stages.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)


    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])


    # Training loop
    console.log(f'[Initiating Fine Tuning]...\n')

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log(f"[Saving Model]...\n")
    # Saving the model and tokenizer after training
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)


    # Generating on the validation set; predictions.csv is rewritten on every pass
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv(os.path.join(output_dir,'predictions.csv'))

    # Export everything recorded by the console so far
    console.save_text(os.path.join(output_dir,'logs.txt'))

    console.log(f"[Validation Completed.]\n")
    console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
    console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

    # Final scoring on the separate test file
    console.log(f"[Now onto testing]\n")
    score = test(model, tokenizer, test_file_path, result_file_name)
    return model, score

In [None]:
# Settings consumed by T5Trainer.
model_params={
    "MODEL":"google/flan-t5-base", # model id passed to from_pretrained
    "TRAIN_BATCH_SIZE":3,          # training batch size
    "VALID_BATCH_SIZE":3,          # validation batch size
    "TRAIN_EPOCHS":3,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation passes
    "LEARNING_RATE":2e-5,          # learning rate given to the Adam optimizer
    "MAX_SOURCE_TEXT_LENGTH":512,  # max token length of source text
    "MAX_TARGET_TEXT_LENGTH":120,   # max token length of target text
    "SEED": 200                     # set seed for reproducibility 
    }

In [None]:
# Paths of the test set and of the CSV that receives the generated summaries.
output_dir = "../outputs/"
test_file_path = data_dir + "setup8_testing.csv"
result_file_name = output_dir + "flan_t5_fine_tuned_setup8_may_8_testing.csv"
# Fine-tuning is disabled here; uncomment the next line to train (and test) from scratch
# instead of reloading the files saved under output_dir + "model_files".
#model, score = T5Trainer(dataframe=df, source_text="source", target_text="target", test_file_path= test_file_path,result_file_name=result_file_name, model_params=model_params, output_dir=output_dir)
# tokenizer reloaded from the saved model directory
tokenizer = AutoTokenizer.from_pretrained(output_dir + "model_files")

# Reload the seq2seq model from the same directory
# and move it to `device` before scoring it on the test file.
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir + "model_files")
model = model.to(device)
score = test(model, tokenizer, test_file_path, result_file_name)
print(score)
# Optional alternative: run the test and write the score to a text file.
# with open('../rouge_score_setup8_flan_t5_fine_tune.txt', 'w') as f:
#     score = test(model, tokenizer, test_file_path, result_file_name)
#     f.write(str(score))

In [None]:
# Show the score computed in the previous cell again, in a cell of its own.
print(score)