In [None]:
!pip install -r requirements.txt
!python -m spacy download en_core_web_sm

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import parquet
from unidecode import unidecode
import zipfile
import random
from transformers import BertTokenizer, BertConfig
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import torch
from torch.utils.data import DataLoader, TensorDataset
import warnings
import time

from src.features.functions_preprocessing import (
    plot_text_length_distribution,
    preprocess_articles,
    preprocess_summaries,
)
from src.features.tokenization import parallel_tokenize
from src.models.rnn_encoder_decoder import *
from src.models.transformer import Transformer
from src.models.bert import BertSummary

# Seed every global RNG the notebook can draw from, not just `random`:
# torch's generator drives weight initialisation and DataLoader shuffling,
# and NumPy's global generator is used by libraries that take no explicit seed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
def get_allowed_cpu_count():
    """Return how many CPU cores this process is allowed to run on.

    The scheduler affinity mask is used where the platform exposes it.
    Otherwise the total core count is returned, or 1 when that is unknown.
    """
    try:
        allowed_cores = os.sched_getaffinity(0)
    except AttributeError:
        # os.sched_getaffinity does not exist on this platform.
        total_cores = os.cpu_count()
        return total_cores if total_cores else 1
    return len(allowed_cores)


cpu_count = get_allowed_cpu_count()
print(cpu_count)

In [None]:
# Degree of parallelism used throughout the notebook: half of the allowed
# cores, but never fewer than one.
n_process = max(1, cpu_count // 2)

In [None]:
# Limit the number of CPU threads torch uses to n_process.
torch.set_num_threads(n_process)

# **Kaggle dataset**

In [None]:
# Fetch the dataset archive with the Kaggle CLI.
# NOTE(review): this needs the `kaggle` tool and API credentials to be set up - confirm in the README.
!kaggle datasets download -d sbhatti/news-summarization

In [None]:
# Unpack the downloaded archive into the ./news-summarization directory.
with zipfile.ZipFile("news-summarization.zip") as archive:
    archive.extractall(path="news-summarization")

In [None]:
# Load the raw dataset; later cells read its "Content" and "Summary" columns.
news_data = pd.read_csv("news-summarization/data.csv")

In [None]:
# Quick look at the first rows to check that the file parsed as expected.
news_data.head()

In [None]:
# Preview one randomly chosen article together with its reference summary.
# `randrange(len)` draws a valid position in [0, len). The previous
# `randint(1, len)` is inclusive on both ends, so it could return `len`
# (out of bounds) and could never select the first row.
N = random.randrange(len(news_data))

# `.iloc` makes the lookup positional, so it does not depend on the index labels.
print(news_data["Content"].iloc[N])
print()
print(news_data["Summary"].iloc[N])

In [None]:
# Character length of every article, followed by summary statistics of that distribution.
lengths_article = news_data["Content"].str.len()
lengths_article.describe()

In [None]:
# Keep only articles whose character length lies between the 10th and 90th
# percentiles (bounds inclusive), dropping the shortest and longest outliers.
article_lower_bound = lengths_article.quantile(0.10)
article_upper_bound = lengths_article.quantile(0.90)
article_length_ok = (lengths_article >= article_lower_bound) & (
    lengths_article <= article_upper_bound
)
news_data = news_data[article_length_ok]

In [None]:
# Plot the article length distribution after the outlier filter (project helper from src.features).
plot_text_length_distribution(news_data, "Content")

In [None]:
# Character length of every remaining summary, followed by summary statistics.
lengths_summary = news_data["Summary"].str.len()
lengths_summary.describe()

In [None]:
# Apply the same 10th-90th percentile trim (bounds inclusive) to summary lengths.
summary_lower_bound = lengths_summary.quantile(0.10)
summary_upper_bound = lengths_summary.quantile(0.90)
summary_length_ok = (lengths_summary >= summary_lower_bound) & (
    lengths_summary <= summary_upper_bound
)
news_data = news_data[summary_length_ok]

In [None]:
# Re-check the summary length statistics on the filtered frame.
news_data["Summary"].str.len().describe()

In [None]:
# Plot the summary length distribution after the outlier filter.
plot_text_length_distribution(news_data, "Summary")

In [None]:
# Number of article/summary pairs left after both length filters.
len(news_data)

In [None]:
# Disabled one-off text cleaning step (project helpers from src.features).
# NOTE(review): it appears to be commented out because its result is cached in
# "news_data_cleaned.parquet"; uncomment to regenerate that file.
# news_data.loc[:, "Content"] = preprocess_articles(
#     news_data["Content"].tolist(), n_process=n_process, batch_size=32
# )
# news_data.loc[:, "Summary"] = preprocess_summaries(
#     news_data["Summary"].tolist(), n_process=n_process, batch_size=32
# )

In [None]:
# Disabled: writes the cleaned frame to the parquet cache (index dropped).
# Uncomment together with the cleaning step to regenerate the cache.
# news_data.to_parquet("news_data_cleaned.parquet", index=False)

In [None]:
# Load the cleaned dataset, building the cache first when it does not exist yet.
# Without this guard a fresh checkout fails here with FileNotFoundError, because
# the cells that create the file are commented out.
CLEANED_PATH = "news_data_cleaned.parquet"
if not os.path.exists(CLEANED_PATH):
    # Work on a copy: news_data is a filtered slice of the raw frame at this point.
    news_data = news_data.copy()
    # NOTE(review): assumes each helper returns one cleaned text per input row - confirm.
    news_data["Content"] = preprocess_articles(
        news_data["Content"].tolist(), n_process=n_process, batch_size=32
    )
    news_data["Summary"] = preprocess_summaries(
        news_data["Summary"].tolist(), n_process=n_process, batch_size=32
    )
    news_data.to_parquet(CLEANED_PATH, index=False)

# Always read back from disk so downstream cells see the same frame
# (with the index dropped on save) whether or not the cache was just built.
news_data = pd.read_parquet(CLEANED_PATH)

# **Tokenization**

In [None]:
# Shuffle all rows once with a fixed seed so the split is reproducible.
# (The former `data_copy = news_data[:]` was dead code: it was overwritten
# by this line before ever being read.)
data_copy = news_data.sample(frac=1, random_state=42)

train_ratio = 0.8
train_size = int(train_ratio * len(data_copy))

# The first 80% of the shuffled rows are used for training; the rest is held out.
train_data = data_copy[:train_size]
test_data = data_copy[train_size:]

print(f"Train size: {len(train_data)}")
print(f"Test size:  {len(test_data)}")

In [None]:
# Tokenize the training articles and summaries and save both tensors to disk.
# NOTE(review): the __main__ guard is presumably there so worker processes that
# re-import this code do not re-run the tokenization - confirm.
if __name__ == "__main__":
    # Settings shared by both tokenization passes.
    tokenize_options = dict(
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=512,
    )

    texts_content = list(train_data["Content"])
    print("Tokenizing Content...")
    tokenized_articles = parallel_tokenize(texts_content, **tokenize_options)
    print("tokenized_articles.shape =", tokenized_articles.shape)
    torch.save(tokenized_articles, "tokenized_articles.pt")

    texts_summary = list(train_data["Summary"])
    print("Tokenizing Summaries...")
    tokenized_summaries = parallel_tokenize(texts_summary, **tokenize_options)
    print("tokenized_summaries.shape =", tokenized_summaries.shape)
    torch.save(tokenized_summaries, "tokenized_summaries.pt")

In [None]:
# Reload the cached token tensors from disk. Every warning raised while
# loading is silenced for the duration of this block only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    tokenized_articles, tokenized_summaries = (
        torch.load(tensor_path)
        for tensor_path in ("tokenized_articles.pt", "tokenized_summaries.pt")
    )

In [None]:
# Token ids are kept as int64 throughout, which is what the models and loss use.
tokenized_articles = tokenized_articles.long()
tokenized_summaries = tokenized_summaries.long()

# Prepend one column of zeros to every summary to act as the start token.
# The column is created as int64 directly: `torch.zeros` defaults to float, and
# the old code therefore promoted every token id to float and cast it back.
# NOTE(review): id 0 is also used as pad_idx below, so the start token shares
# the padding id - confirm this is intended.
# NOTE(review): re-running this cell prepends another column each time;
# reload the tensors first.
start_column = torch.zeros(tokenized_summaries.size(0), 1, dtype=torch.long)
tokenized_summaries = torch.cat([start_column, tokenized_summaries], dim=1)

# Aliases of the two tensors above (already int64, so no further cast is needed).
article_ids = tokenized_articles
summary_ids = tokenized_summaries

# **RNN**

# **Transformer**

In [None]:
# Mini-batches of (article ids, summary ids) pairs for the Transformer.
# The summaries here still include the prepended start column.
batch_size = 32

dataset = TensorDataset(tokenized_articles, tokenized_summaries)
# Batches are reshuffled every epoch and loaded by n_process worker processes.
dataloader = DataLoader(
    dataset, batch_size=batch_size, num_workers=n_process, shuffle=True
)

In [None]:
# Same vocabulary that was used to tokenize the data ("bert-base-uncased").
# It supplies the vocabulary size and pad token id used below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Instantiate the project's Transformer (src.models.transformer).
# The exact meaning of each argument is defined by that class, which is not shown here.
modelTransformer = Transformer(
    pad_idx=0,  # padding id; the data is padded with token id 0
    voc_size=tokenizer.vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,  # matches the max_length used during tokenization
    dec_max_len=512,
    ffn_hidden=128,
    n_layers=3,
)

In [None]:
num_epochs = 10

# `torch.nn` is referenced explicitly. A bare `nn` is not imported by this
# notebook and could only be in scope as a side effect of the wildcard import
# from src.models.rnn_encoder_decoder.
# Padding positions are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(modelTransformer.parameters(), lr=5e-4)
modelTransformer = modelTransformer.to(device)

total_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()

    modelTransformer.train()
    total_loss = 0.0

    for step, batch in enumerate(dataloader):
        input_batch, summary_batch = batch
        input_batch = input_batch.to(device)
        summary_batch = summary_batch.to(device)

        # Teacher forcing: the decoder is fed every token except the last one
        # and is trained to predict the same sequence shifted left by one.
        decoder_input = summary_batch[:, :-1]
        target = summary_batch[:, 1:]

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        outputs = modelTransformer(input_batch.long(), decoder_input)

        # Flatten the leading dimensions so the loss sees one row of vocabulary
        # scores per target token.
        loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), target.reshape(-1))

        # Backward pass and weight update
        loss.backward()
        optimizer.step()

        # Read the scalar once; it is used for both the running total and the log.
        batch_loss = loss.item()
        total_loss += batch_loss

        if step % 10000 == 0:
            print(f"Epoch: {epoch+1}, Step: {step}, Loss: {batch_loss:.4f}")

    # Average loss over all batches of this epoch
    avg_loss = total_loss / len(dataloader)

    # Epoch duration
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    print(
        f"Epoch {epoch+1}/{num_epochs} - "
        f"Average Loss: {avg_loss:.4f} - "
        f"Time: {epoch_duration:.2f}s"
    )

# Total wall-clock training time
total_end_time = time.time()
total_training_time = total_end_time - total_start_time
print(f"Total training time: {total_training_time:.2f}s")

In [None]:
# Persist the trained Transformer weights. The previous code referenced an
# undefined name `model`; the model trained above is `modelTransformer`.
# NOTE(review): the file name says "50epochs" while the training cell runs
# num_epochs = 10. The name is kept because later cells load this exact path.
torch.save(modelTransformer.state_dict(), "transformer_weights_50epochs.pth")

In [None]:
# Rebuild the Transformer with the same hyper-parameters used for training and
# restore the saved weights.
modelTransformer = Transformer(
    pad_idx=0,
    voc_size=tokenizer.vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,
    dec_max_len=512,
    ffn_hidden=128,
    n_layers=3,
)
# map_location="cpu" lets the checkpoint load even when it was saved from an
# accelerator that is unavailable in this session. The fresh model lives on the
# CPU anyway, and load_state_dict copies the values into its parameters.
modelTransformer.load_state_dict(
    torch.load("transformer_weights_50epochs.pth", map_location="cpu")
)
# Switch to inference mode.
modelTransformer.eval()

# **BERT model**

In [None]:
# Mini-batches for the BERT model. The `[:, 1:]` slice drops the first column
# of the summaries, i.e. the start column that was prepended earlier.
batch_size = 8

dataset = TensorDataset(tokenized_articles, tokenized_summaries[:, 1:])
# Batches are reshuffled every epoch and loaded by n_process worker processes.
dataloader = DataLoader(
    dataset, batch_size=batch_size, num_workers=n_process, shuffle=True
)

In [None]:
# Load the "bert-base-uncased" tokenizer again; it supplies the vocabulary size
# and pad token id for the BERT section.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Architecture settings handed to BertSummary.
# NOTE(review): these sizes look like the standard BERT-base configuration - confirm.
config = BertConfig(
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,  # matches the tokenization max_length of 512
    vocab_size=tokenizer.vocab_size,
)

In [None]:
modelBert = BertSummary(config)
# `torch.nn` is referenced explicitly. A bare `nn` is not imported by this
# notebook and could only be in scope as a side effect of the wildcard import
# from src.models.rnn_encoder_decoder.
# Padding positions are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(modelBert.parameters(), lr=1e-5)
modelBert.to(device)

num_epochs = 2

total_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()

    modelBert.train()
    total_loss = 0.0

    for step, batch in enumerate(dataloader):
        input_batch, summary_batch = batch
        input_batch = input_batch.to(device)
        summary_batch = summary_batch.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; the attention mask is True wherever the token id is non-zero.
        outputs = modelBert(input_batch, attention_mask=input_batch.ne(0))

        # Flatten so the loss sees one row of vocabulary scores per target token.
        loss = loss_fn(outputs.view(-1, outputs.shape[-1]), summary_batch.view(-1))

        # Backward pass and weight update
        loss.backward()
        optimizer.step()

        # Read the scalar once; it is used for both the running total and the log.
        batch_loss = loss.item()
        total_loss += batch_loss

        if step % 1000 == 0:
            print(f"Epoch: {epoch+1}, Step: {step}, Loss: {batch_loss:.4f}")

    # Average loss over all batches of this epoch
    avg_loss = total_loss / len(dataloader)

    # Epoch duration
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    print(
        f"Epoch {epoch+1}/{num_epochs} - "
        f"Average Loss: {avg_loss:.4f} - "
        f"Time: {epoch_duration:.2f}s"
    )

# Total wall-clock training time
total_end_time = time.time()
total_training_time = total_end_time - total_start_time
print(f"Total training time: {total_training_time:.2f}s")

In [None]:
# Persist the trained BERT summarizer weights (state_dict only, not the whole module).
torch.save(modelBert.state_dict(), "bert_weights_2epochs.pth")

In [None]:
# Rebuild the BERT summarizer and restore ITS OWN checkpoint. The previous code
# loaded "transformer_weights_50epochs.pth", which holds the weights of the
# other model.
modelBert = BertSummary(config)
# map_location="cpu" keeps loading independent of the device used at save time.
modelBert.load_state_dict(torch.load("bert_weights_2epochs.pth", map_location="cpu"))
# The inference inputs are moved to `device`, so the model must be there too.
modelBert.to(device)
# Switch to inference mode.
modelBert.eval()

# **Evaluation**

In [None]:
!pip install

In [None]:
# Imported here rather than at the top because the package is installed by the cell just above.
import evaluate

# Load the ROUGE metric.
# NOTE(review): `rouge` is not used by any of the cells visible after this one.
rouge = evaluate.load("rouge")

In [None]:
# Take the article stored under index label 1000 as the example to summarize.
# NOTE(review): the row comes from the full frame, so it may belong to the training split.
input_text = news_data.loc[1000, "Content"]

In [None]:
# Encode the example the same way as the training data: truncated or padded to
# exactly 512 tokens and returned as PyTorch tensors.
tokenized_input = tokenizer.encode_plus(
    input_text,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

In [None]:
# Move the token ids and the attention mask to the selected device.
input_ids = tokenized_input["input_ids"].to(device)
attention_mask = tokenized_input["attention_mask"].to(device)

In [None]:
# `model` was never defined in this notebook. The inference call below passes
# `attention_mask`, the same way modelBert is called during training, so the
# BERT summarizer is the intended model.
# The model is moved to the same device as the inputs, then put in inference mode.
modelBert.to(device)
modelBert.eval()

In [None]:
# Forward pass without gradient tracking. The undefined name `model` is replaced
# by `modelBert`, which is called with this same signature during training.
with torch.no_grad():
    outputs = modelBert(input_ids, attention_mask=attention_mask)

In [None]:
# The model output is used as-is; no reshape is actually applied here.
reshaped_outputs = outputs

# Pick the highest-scoring vocabulary id along the last dimension, then decode
# the first sequence of the batch back to text without special tokens.
predicted_summary_ids = reshaped_outputs.argmax(dim=-1)
first_sequence_ids = predicted_summary_ids[0]
predicted_summary = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

print("Predicted Summary:", predicted_summary)