In [94]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import random
import torch.optim as optim
import time

In [95]:
# Load the data from the CSV file
df = pd.read_csv("/kaggle/input/news-articles-summary/NewsArticlesSummaryCSV.csv")

In [96]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [97]:
def preprocess_data(df):
    # Tokenize the article and summary
    articles = df[df.columns[0]].tolist()
    summaries = df[df.columns[1]].tolist()

    tokenized_articles = []
    tokenized_summaries = []

    for article, summary in zip(articles, summaries):
        article_tokens = tokenizer(article, truncation=True, padding="max_length", return_tensors="pt")
        summary_tokens = tokenizer(summary, truncation=True, padding="max_length", return_tensors="pt")

        tokenized_articles.append(article_tokens["input_ids"])
        tokenized_summaries.append(summary_tokens["input_ids"])


    return tokenized_articles, tokenized_summaries


In [98]:
# Preprocess data
encoded_articles, encoded_summaries = preprocess_data(df)

In [99]:
# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [100]:
i = 0
# Train the LLM
for epoch in range(10):
    start_time = time.time()
    # Pass the individual tensors to the model
    for tensor_article, tensor_summary in zip(encoded_articles, encoded_summaries):
        outputs = model(input_ids=tensor_article, decoder_input_ids=tensor_summary)
        logits = outputs.logits

        # Loss calculation
        loss = outputs.loss

        # Backward pass
        model.zero_grad()
        optimizer.step()
    i+=1
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"{i}th iteration is over. It took {execution_time} seconds to be executed")

1th iteration is over. It took 118.57974672317505 seconds to be executed
2th iteration is over. It took 119.88300704956055 seconds to be executed
3th iteration is over. It took 119.73744702339172 seconds to be executed
4th iteration is over. It took 116.64661383628845 seconds to be executed
5th iteration is over. It took 117.40102744102478 seconds to be executed
6th iteration is over. It took 116.35431504249573 seconds to be executed
7th iteration is over. It took 116.60191774368286 seconds to be executed
8th iteration is over. It took 118.40112495422363 seconds to be executed
9th iteration is over. It took 119.64539623260498 seconds to be executed
10th iteration is over. It took 120.542649269104 seconds to be executed


In [101]:
# From preprocessed df to list
dataset = []
for _, row in preprocessed_data.iterrows():
    dataset.append(row.to_dict())

In [102]:
dataloader = DataLoader(dataset)

In [103]:
import pickle

In [104]:
# Save the model and optimizer
model_path = "model_news_artile_summarizer_ready.pkl"
with open(model_path, 'wb') as f:
    pickle.dump((model, optimizer), f)


In [105]:
# Save the model weights
model_weights_path = "model_news_artile_summarizer_weights_ready.pt"
torch.save(model.state_dict(), model_weights_path)


In [106]:
# Save the trained model
model.save_pretrained("trained_model")

In [107]:
# Load the trained model
model_uploaded = AutoModelForSeq2SeqLM.from_pretrained("trained_model")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [108]:
# Tokenize the new article
new_article = "The new iPhone 14 Pro has been released, and it features a significantly upgraded camera system, a faster processor, and a redesigned notch."
new_article_tokens = tokenizer(new_article, truncation=True, padding="max_length", return_tensors="pt")

In [109]:
type(new_article_tokens["input_ids"])

torch.Tensor

In [110]:
# Generate the summary
input_ids=new_article_tokens["input_ids"]
attention_mask=new_article_tokens["attention_mask"]

output = model_uploaded.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=100,
)

In [111]:
# Decode the output
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print(decoded_output)

The new iPhone 14 Pro has been released. a faster processor. new camera system.
