# Summarization using Hugging Face Transformers

## Get API tokens

In [None]:
import os
import urllib.parse

from dotenv import load_dotenv
import requests

In [None]:
# Load variables from ../aws.env into the process environment. The cells below
# read DBAPI_URL, DBAPI_STAGE, FIRST_USER and FIRST_USER_PASSWORD from
# os.environ (presumably defined in that file -- verify).
load_dotenv("../aws.env")

In [None]:
# Base URL of the API: the stage name is resolved against the configured host
# URL using standard relative-URL joining rules.
base_url = os.environ["DBAPI_URL"]
stage = os.environ["DBAPI_STAGE"]
api_url = urllib.parse.urljoin(base_url, stage)

In [None]:
# Log in with the first user's credentials and build the bearer-token header
# that the following requests send for authentication.
login_data = {
    "username": os.environ["FIRST_USER"],
    "password": os.environ["FIRST_USER_PASSWORD"],
}
r = requests.post(f"{api_url}/token", data=login_data)
# Surface a rejected login as an HTTP error here instead of a confusing
# KeyError on "access_token" below.
r.raise_for_status()
tokens = r.json()
a_token = tokens["access_token"]
token_headers = {"Authorization": f"Bearer {a_token}"}

## Get a document

In [None]:
# Ask the API for a single document id (skip=0, limit=1).
r = requests.get(f"{api_url}/documents/ids/?skip=0&limit=1", headers=token_headers)
# Stop on HTTP errors (e.g. an expired token) before trying to parse the body.
r.raise_for_status()
doc_ids = r.json()
if not doc_ids:
    # An empty result would otherwise fail with a bare IndexError.
    raise RuntimeError("The API returned no document ids; nothing to summarize.")
doc_id = doc_ids[0]["id"]

In [None]:
# Download the full document record for the selected id.
r = requests.get(f"{api_url}/documents/{doc_id}", headers=token_headers)
# Stop on HTTP errors so an error payload is never mistaken for a document.
r.raise_for_status()
doc = r.json()

In [None]:
# Text that both summarizers below receive as input; presumably the plain text
# extracted from the document -- confirm against the API schema.
text = doc["parsed_text"]

## Generate summaries

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    BartConfig,
    BartForConditionalGeneration,
    BartTokenizer,
)

### BART

In [None]:
# Load the BART tokenizer and model from the same pretrained checkpoint.
# NOTE(review): 'facebook/bart-large' looks like the base checkpoint; the
# summarization-tuned variant is published as 'facebook/bart-large-cnn' --
# confirm which one is intended here.
bart_checkpoint = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

In [None]:
# Tokenize the document as a batch of one. Truncation is requested explicitly:
# max_length alone only triggers an implicit fallback accompanied by a warning.
inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors='pt')
# Beam-search a summary. The previous max_length=5 limited the output to a
# handful of tokens; 150 matches the cap used for the T5 summary.
summary_ids = model.generate(
    inputs['input_ids'],
    num_beams=4,
    max_length=150,
    early_stopping=True,
)

In [None]:
# Turn each generated id sequence back into text, dropping special tokens and
# leaving tokenization spacing untouched. The list is the cell's displayed value.
decode_opts = {"skip_special_tokens": True, "clean_up_tokenization_spaces": False}
[tokenizer.decode(generated, **decode_opts) for generated in summary_ids]

### T5

In [None]:
# Load T5 through the sequence-to-sequence auto class; AutoModelWithLMHead is
# deprecated in favour of the task-specific auto classes.
t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [None]:
# T5 uses a max_length of 512, so the prompt is explicitly truncated to 512
# tokens. The "summarize: " prefix selects T5's summarization task.
inputs = t5_tokenizer.encode(
    "summarize: " + text,
    return_tensors="pt",
    max_length=512,
    truncation=True,
)
outputs = t5_model.generate(
    inputs,
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
# Decode the best beam directly and drop special tokens such as <pad> and </s>,
# which the ids -> tokens -> string round trip left in the displayed summary.
t5_tokenizer.decode(outputs[0], skip_special_tokens=True)