In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and seq2seq weights are both pulled from the same checkpoint name.
checkpoint = "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

In [2]:
device

'cuda'

In [3]:
import pandas as pd
# The file is semicolon-delimited, so the separator has to be given explicitly.
csv_path = '/teamspace/studios/this_studio/NLP_project/openai_cnndm_final.csv'
df = pd.read_csv(csv_path, sep=';')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6291 entries, 0 to 6290
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  6291 non-null   int64 
 1   Unnamed: 0    6291 non-null   int64 
 2   user_id       6291 non-null   object
 3   user_profile  6291 non-null   object
 4   doc_id        6291 non-null   object
 5   post/article  6291 non-null   object
 6   summary_text  6291 non-null   object
 7   confidence    6291 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 393.3+ KB


In [5]:
tokenizer, model

(T5TokenizerFast(name_or_path='google-t5/t5-base', vocab_size=32100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<ext

In [6]:
# Narrow the frame to the two columns needed to build the doc_id -> article lookup.
data = df.loc[:, ['doc_id', 'post/article']]

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6291 entries, 0 to 6290
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   doc_id        6291 non-null   object
 1   post/article  6291 non-null   object
dtypes: object(2)
memory usage: 98.4+ KB


In [8]:
# Build a doc_id -> article text lookup, keeping the text from the first row
# in which each doc_id appears (the same doc_id occurs on many rows).
# drop_duplicates does this in one vectorised pass instead of a Python-level
# iterrows loop, and it preserves first-appearance order of the ids.
first_rows = data.drop_duplicates(subset='doc_id', keep='first')
mapp = dict(zip(first_rows['doc_id'], first_rows['post/article']))

In [9]:
len(mapp)

639

In [10]:
# Document ids, in the order they were inserted into the lookup.
ids = [*mapp]
len(ids)

639

In [11]:
# One article text per doc_id, in the same order as the lookup's keys.
unique_texts = [article for article in mapp.values()]
len(unique_texts)

639

In [98]:
# Single-article trial input: the article at row position 15 with the "summarize: " prefix prepended.
sample_article = df['post/article'].iloc[15]
text = "summarize: " + sample_article

In [99]:
# Tokenise the prefixed article (truncated to at most 512 tokens) and move the ids to the model's device.
inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
inputs = inputs.to(device)

# Beam-search settings used for this one-off summary.
generation_options = dict(
    max_length=150,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
summary_ids = model.generate(inputs, **generation_options)

# Only one sequence was submitted, so decode the first output row.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

summary

'fast food giant has caused fury after installing metal spikes outside its branch in Leeds city centre. critics say they are there to stop people sleeping rough and are uncomfortable to sleep on. 70,000 people across the globe have signed a petition to have them removed.'

In [88]:
len(summary)

206

In [91]:
# Number of articles sent through the model per generate() call.
batch_size = 5

def translate_batch(texts):
    """Summarise a batch of articles with the notebook-level T5 model.

    Despite the name, this performs summarisation: each text is prefixed with
    "summarize: " before being tokenised.

    Args:
        texts: Iterable of article strings. Each prefixed article is truncated
            to at most 512 tokens, and the batch is padded to a common length.

    Returns:
        A list with one decoded summary string per input text, in input order.
        An empty input yields an empty list without calling the model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.
    """
    texts = list(texts)
    if not texts:
        # The tokenizer cannot encode an empty batch; there is nothing to do anyway.
        return []

    prefixed_texts = ["summarize: " + text for text in texts]
    inputs = tokenizer(prefixed_texts, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)

    # The batch is padded, so hand the attention mask to generate() explicitly
    # instead of relying on it being inferred from the pad token id.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    return summaries

In [101]:
# Use the per-document article texts as the inputs for the batch run.
src_texts = unique_texts
len(src_texts)

639

In [102]:
# Accumulates the generated summaries, in input order, as batches complete.
results = []

In [103]:
# Start from an empty list every time this cell runs; otherwise re-executing
# it would append a second copy of every summary to the existing list.
results = []

# Walk the inputs in consecutive slices of `batch_size`; the last slice may be shorter.
for start in range(0, len(src_texts), batch_size):
    batch_texts = src_texts[start:start + batch_size]
    translated_texts = translate_batch(batch_texts)
    results.extend(translated_texts)

# Summaries are later matched to doc_ids by position, so the counts must agree.
assert len(results) == len(src_texts), "summary count does not match input count"

In [105]:
len(results)

639

In [106]:
# Alias the batch outputs under a name that says what they contain.
gen_summaries = results

In [107]:
# `ids` and `gen_summaries` are paired purely by position, and zip() would
# silently drop the unmatched tail if one were shorter, so check lengths first.
if len(ids) != len(gen_summaries):
    raise ValueError(
        f"expected one summary per doc_id, got {len(ids)} ids and {len(gen_summaries)} summaries"
    )

# NOTE: this rebinds `mapp`, which previously held doc_id -> article text;
# from here on it holds doc_id -> generated summary.
mapp = dict(zip(ids, gen_summaries))

In [108]:
len(mapp)

639

In [109]:
# Look up each row's summary by its doc_id, then attach the result as a new column.
summary_by_row = df['doc_id'].map(mapp)
df['t5_model_summary'] = summary_by_row

In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6291 entries, 0 to 6290
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0.1      6291 non-null   int64 
 1   Unnamed: 0        6291 non-null   int64 
 2   user_id           6291 non-null   object
 3   user_profile      6291 non-null   object
 4   doc_id            6291 non-null   object
 5   post/article      6291 non-null   object
 6   summary_text      6291 non-null   object
 7   confidence        6291 non-null   int64 
 8   t5_model_summary  6291 non-null   object
dtypes: int64(3), object(6)
memory usage: 442.5+ KB


In [111]:
# index=False: the index here is only the default RangeIndex, and writing it
# out adds another unnamed column that comes back as "Unnamed: 0" on reload
# (the input file already carries two such columns).
df.to_csv('openai_cnndm_final_t5.csv', sep=';', index=False)