In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline, AutoModel, AutoTokenizer, T5ForQuestionAnswering


Model Setup

In [2]:
# Choose a model to work
model_name = "t5-small"  # You can choose other versions like 't5-base', 't5-large'

# Choose the type of model it will be used for or AutoModel
model_sum = T5ForConditionalGeneration.from_pretrained(model_name)
model_qa = T5ForQuestionAnswering.from_pretrained(model_name)

# Chose the tokenizer or AutoTokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name)

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at t5-small and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Load Data

In [8]:
# Read the data from .dat
with open('../Datasets/Medical_Documents/test.dat', 'r') as file:
    lines = file.readlines()


In [4]:
# Load data from.csv
import pandas as pd
df = pd.read_csv("../Datasets/Symptoms/symptom_Description.csv")

## Summarize

In [11]:
# Create a pipline for a summarization with the predifined summarization model 
t5_summarizer = pipeline("summarization", model=model_sum, tokenizer=tokenizer)

# Choose a paragraph, the length of tokens can be used to include a maximum level of tokens for the summarization
text = lines[2]
text_len = len(text.split())

# The model can be use as a standard or parametres can be used during the usage
#summary = t5_summarizer(text, max_length=round(text_len/2), min_length=round(text_len/), length_penalty=2.0, num_beams=4, early_stopping=True)
summary = t5_summarizer(text)
print(summary[0])

Your max_length is set to 200, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


{'summary_text': 'oral and topical steroids were used to induce regression in an inflammatory, obstructing endobronchial polyp caused by a retained foreign body . the FB (a peanut half) was then able to be easily and bloodlessly retrieved with fiberoptic bronchoscopy .'}


## Question Answering

In [12]:
# Create a pipeline for the model
question_answerer = pipeline("question-answering", model="t5-small")

# Setup the context and question
context = lines[0]
question = "What is the problem?"

# Use the model 
answer = question_answerer(question=question, context=context)
print(answer)

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at t5-small and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'score': 0.00011141052527818829, 'start': 32, 'end': 74, 'answer': 'reviewed the clinical files of 10 patients'}


## Text Generation

In [6]:
# Build pipeline for text-generation 
text_generator = pipeline("text2text-generation", model="Google/flan-t5-base")

# build a question 
question = "Write a poem about a forest:"

# Use model 
generated_text = text_generator(question, max_length=50)
print(generated_text)

[{'generated_text': 'i love the forest i love the forest i love the forest i love the forest i love the forest i love the forest i love the forest i love the forest i love the forest i love the'}]


Same but without piepline interface

In [5]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

input_text = "Write a poem about trees:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad>i love the trees i love the trees i love the trees i love the
