In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Adafactor
from tqdm import tqdm
import torch
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


# Import data

In [2]:
data = pd.read_csv("..\\data\\abcnews-date-text.csv")
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [3]:
data["start"] = [" ".join(row.headline_text.split(" ")[:4]) for _, row in data.iterrows()]

In [4]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(data[:20000], test_size=0.1)

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18000 entries, 14719 to 13135
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   publish_date   18000 non-null  int64 
 1   headline_text  18000 non-null  object
 2   start          18000 non-null  object
dtypes: int64(1), object(2)
memory usage: 562.5+ KB


In [6]:
data_onion = pd.read_excel("..\\data\\onion_ner.xlsx")
data_onion.head()

Unnamed: 0.1,Unnamed: 0,completion,prompt
0,3,Supporters Aggravated Bernie Sanders Didnt Use...,Bernie Sanders | DNC -->
1,5,Gen Tommy Franks Quits Army To Pursue Solo Bom...,Tommy Franks | Army -->
2,7,Gore Camp Denies Putting Bush Camps Canoe In T...,Gore Camp | Bush -->
3,9,Man At Bar Clinging To Muted King Of Queens Ep...,King Of | Queens -->
4,13,Expansive Obama State Of The Union Speech To T...,Obama | Union | Robert Altman -->


# Import and initialize model

In [25]:
tokenizer_base = T5Tokenizer.from_pretrained('t5-base')
model_base = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True).to("cuda")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [34]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nateraw/bert-base-uncased-ag-news")

model = AutoModelForSequenceClassification.from_pretrained("nateraw/bert-base-uncased-ag-news").to("cuda")

Downloading: 100%|██████████| 252/252 [00:00<00:00, 252kB/s]
Downloading: 100%|██████████| 712/712 [00:00<00:00, 525kB/s]
Downloading: 100%|██████████| 226k/226k [00:02<00:00, 95.5kB/s] 
Downloading: 100%|██████████| 112/112 [00:00<00:00, 112kB/s]
Downloading: 100%|██████████| 418M/418M [00:17<00:00, 24.8MB/s] 


# Analyze normal and satirical headlines in Generate function

In [39]:
def generate(text, model, tokenizer):
   input_ids = tokenizer.encode(text, return_tensors="pt").to("cuda")
   outputs = model.generate(input_ids, do_sample=True, max_length=100) # , repetition_penalty=5.0,  top_k=20 select seed (sample_...=True)
   return tokenizer.decode(outputs[0])

In [40]:
generate("Obama", model, tokenizer)

IndexError: too many indices for tensor of dimension 2

In [25]:
def get_probs(sentence, tokenizer, model):
    tokens = tokenizer.encode(sentence)
    l = len(tokens)
    probs = []
    for i in range(1, l):
        gen = model.generate(torch.tensor([tokens[:i]]).to("cuda"), do_sample=True, output_scores=True, return_dict_in_generate=True, max_length=100)
        local_probs = torch.nn.functional.softmax(gen.scores[i][0])
        probs.append(local_probs[tokens[i]].item())
    return probs, tokens

In [26]:
get_probs("This is a test.", tokenizer, model)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  local_probs = torch.nn.functional.softmax(gen.scores[i][0])
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


([0.07269465923309326, 0.0, 0.0, 0.0], [1212, 318, 257, 1332, 13])

In [29]:
data.iloc[20].headline_text

'businesses should prepare for terrorist attacks'

In [31]:
get_probs(data.iloc[20].headline_text, tokenizer, model)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  local_probs = torch.nn.functional.softmax(gen.scores[i][0])
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


([0.0, 0.004618010483682156, 0.0, 0.0, 0.0, 0.0],
 [22680, 274, 815, 8335, 329, 7417, 3434])

In [144]:
data_onion.iloc[0].completion

'Supporters Aggravated Bernie Sanders Didnt Use DNC Speech To Get Voters To Act Against Their Own SelfInterest'

In [158]:
data.iloc[0].headline_text

'aba decides against community broadcasting licence'