In [None]:
!pip install -r requirements.txt
!python -m spacy download en_core_web_sm

In [1]:
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import os
import random
import time
import warnings
import zipfile

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parquet
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertConfig, BertTokenizer
from unidecode import unidecode

from src.features.functions_preprocessing import (
    plot_text_length_distribution,
    preprocess_articles,
    preprocess_summaries,
)
from src.features.tokenization import parallel_tokenize
from src.models.bert import BertSummary
from src.models.rnn_encoder_decoder import Encoder, Decoder, Seq2Seq
from src.models.transformer import Transformer

In [2]:
# Pick the compute backend: CUDA first, then Apple MPS, otherwise plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [3]:
def get_allowed_cpu_count():
    """Return how many CPU cores the current process may run on.

    Uses the scheduler affinity mask of this process when the platform exposes
    ``os.sched_getaffinity``; otherwise falls back to ``os.cpu_count()``, or 1
    when even that is unknown.
    """
    try:
        allowed_cores = os.sched_getaffinity(0)
    except AttributeError:
        # Platform without sched_getaffinity.
        return os.cpu_count() or 1
    return len(allowed_cores)


cpu_count = get_allowed_cpu_count()
print(cpu_count)

104


In [4]:
# Worker budget: half of the cores available to this process, but never fewer than one.
# Reused below for tokenization workers, DataLoader workers and the torch thread count.
n_process = max(1, cpu_count // 2)

In [5]:
# Cap PyTorch's CPU thread count at the same worker budget.
torch.set_num_threads(n_process)

# **Kaggle dataset**

In [None]:
!kaggle datasets download -d sbhatti/news-summarization

In [None]:
# Unpack the downloaded Kaggle archive into the "news-summarization" folder.
with zipfile.ZipFile("news-summarization.zip", mode="r") as archive:
    archive.extractall(path="news-summarization")

In [None]:
news_data = pd.read_csv("news-summarization/data.csv")

In [None]:
news_data.head()

In [None]:
# Print one random article/summary pair as a sanity check.
# randrange(len) yields exactly the valid positions 0..len-1; randint(1, len) is
# inclusive on both ends, so it could return len (out of range) and never 0.
N = random.randrange(len(news_data))

# Positional access, so the lookup does not depend on the frame's index labels.
print(news_data["Content"].iloc[N])
print()
print(news_data["Summary"].iloc[N])

In [None]:
lengths_article = news_data["Content"].str.len()
lengths_article.describe()

In [None]:
# Keep only articles whose character length lies between the 10th and 90th percentile.
article_len_p10 = lengths_article.quantile(0.10)
article_len_p90 = lengths_article.quantile(0.90)
article_len_ok = (lengths_article >= article_len_p10) & (
    lengths_article <= article_len_p90
)
news_data = news_data[article_len_ok]

In [None]:
plot_text_length_distribution(news_data, "Content")

In [None]:
lengths_summary = news_data["Summary"].str.len()
lengths_summary.describe()

In [None]:
# Keep only rows whose summary length lies between the 10th and 90th percentile.
summary_len_p10 = lengths_summary.quantile(0.10)
summary_len_p90 = lengths_summary.quantile(0.90)
summary_len_ok = (lengths_summary >= summary_len_p10) & (
    lengths_summary <= summary_len_p90
)
news_data = news_data[summary_len_ok]

In [None]:
news_data["Summary"].str.len().describe()

In [None]:
plot_text_length_distribution(news_data, "Summary")

In [None]:
len(news_data)

In [None]:
# news_data.loc[:, "Content"] = preprocess_articles(
#     news_data["Content"].tolist(), n_process=n_process, batch_size=32
# )
# news_data.loc[:, "Summary"] = preprocess_summaries(
#     news_data["Summary"].tolist(), n_process=n_process, batch_size=32
# )

In [None]:
# news_data.to_parquet("news_data_cleaned.parquet", index=False)

In [6]:
news_data = pd.read_parquet("news_data_cleaned.parquet")

# **Tokenization**

In [7]:
# Shuffle all rows reproducibly (fixed seed); sample() returns a new frame, so
# news_data itself is left untouched.
data_copy = news_data.sample(frac=1, random_state=42)

# 80/20 train/test split on the shuffled rows.
train_ratio = 0.8
train_size = int(train_ratio * len(data_copy))

# Slice the dataset
train_data = data_copy[:train_size]
test_data = data_copy[train_size:]

print(f"Train size: {len(train_data)}")
print(f"Test size:  {len(test_data)}")

Train size: 446425
Test size:  111607


In [None]:
# Tokenize the training articles in parallel and cache the result on disk.
# NOTE(review): the __main__ guard presumably protects against re-execution in
# worker processes — confirm against parallel_tokenize.
if __name__ == "__main__":
    article_tokenizer_options = dict(
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=512,
    )
    texts_content = train_data["Content"].tolist()
    print("Tokenizing Content...")
    tokenized_articles = parallel_tokenize(texts_content, **article_tokenizer_options)
    print("tokenized_articles.shape =", tokenized_articles.shape)
    torch.save(tokenized_articles, "tokenized_articles.pt")

Tokenizing Content...


In [8]:
# Tokenize the training summaries in parallel and cache the result on disk.
# Summaries are one token longer than 128, so that dropping the first or the last
# position (as the training cells do) leaves 128 tokens.
if __name__ == "__main__":
    summary_tokenizer_options = dict(
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=129,
    )
    texts_summary = train_data["Summary"].tolist()
    print("Tokenizing Summaries...")
    tokenized_summaries = parallel_tokenize(texts_summary, **summary_tokenizer_options)
    print("tokenized_summaries.shape =", tokenized_summaries.shape)
    torch.save(tokenized_summaries, "tokenized_summaries.pt")

Tokenizing Summaries...
tokenized_summaries.shape = torch.Size([446425, 129])


In [15]:
# Reload the cached token-id tensors written by the tokenization cells.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all these files contain. It replaces the former blanket warning filter, which
# silenced every warning raised while loading.
tokenized_articles = torch.load("tokenized_articles.pt", weights_only=True)
tokenized_summaries = torch.load("tokenized_summaries.pt", weights_only=True)

In [16]:
# Cast the cached token ids to int64 (torch.long).
# NOTE(review): the dataset cells visible below are built from tokenized_articles /
# tokenized_summaries directly, not from these casts — confirm whether
# article_ids / summary_ids are still needed.
article_ids = tokenized_articles.long()
summary_ids = tokenized_summaries.long()

# **RNN**

In [30]:
# Shuffled mini-batches of (article ids, summary ids) for the RNN model.
batch_size = 32

dataset = TensorDataset(tokenized_articles, tokenized_summaries)
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=n_process)
dataloader = DataLoader(dataset, **loader_options)

In [31]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [32]:
# Encoder and decoder share the same hyper-parameters, so define them once.
rnn_hparams = dict(
    vocab_size=tokenizer.vocab_size,
    embed_dim=128,
    hidden_size=128,
    num_layers=2,
    dropout_prob=0.1,
)
encoder = Encoder(**rnn_hparams)
decoder = Decoder(**rnn_hparams)

# Wrap both halves into the sequence-to-sequence model and move it to the device.
modelSeq2Seq = Seq2Seq(encoder, decoder, device).to(device)

In [33]:
# Train the RNN encoder-decoder with teacher forcing and save a checkpoint per epoch.
num_epochs = 5
# Padding positions in the target do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(modelSeq2Seq.parameters(), lr=2e-4)

# Create the checkpoint directory up front; otherwise the first torch.save() would
# fail only after a complete (multi-hour) epoch has already been computed.
os.makedirs("model_weights", exist_ok=True)

total_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()

    modelSeq2Seq.train()
    total_loss = 0.0

    for step, batch in enumerate(dataloader):
        input_batch, summary_batch = batch
        input_batch = input_batch.to(device)
        summary_batch = summary_batch.to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = modelSeq2Seq(
            input_batch.long(), summary_batch, teacher_forcing_ratio=0.5
        )

        # Targets are the summary tokens after the first position.
        # NOTE(review): assumes `outputs` covers exactly those positions —
        # confirm against Seq2Seq.forward.
        shifted_target = summary_batch[:, 1:]

        # Compute loss over all (batch, position) pairs flattened together
        loss = loss_fn(
            outputs.reshape(-1, outputs.shape[-1]), shifted_target.reshape(-1)
        )
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        if step % 5000 == 0:
            print(f"Epoch: {epoch+1}, Step: {step}, Loss: {loss.item():.4f}")

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(dataloader)

    # Calculate epoch duration
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    # Print epoch stats
    print(
        f"Epoch {epoch+1}/{num_epochs} - "
        f"Average Loss: {avg_loss:.4f} - "
        f"Time: {epoch_duration:.2f}s"
    )
    # One checkpoint per epoch so an interrupted run can still be evaluated.
    torch.save(
        modelSeq2Seq.state_dict(),
        f"model_weights/seq2seq_weights_{epoch+1}_epochs.pth",
    )

total_end_time = time.time()
total_training_time = total_end_time - total_start_time
print(f"Total training time: {total_training_time:.2f}s")

Epoch: 1, Step: 0, Loss: 10.4950
Epoch: 1, Step: 5000, Loss: 6.6238
Epoch: 1, Step: 10000, Loss: 6.5657
Epoch 1/5 - Average Loss: 6.6579 - Time: 24815.32s
Epoch: 2, Step: 0, Loss: 6.4132
Epoch: 2, Step: 5000, Loss: 6.5377


KeyboardInterrupt: 

# **Transformer**

In [17]:
# Shuffled mini-batches of (article ids, summary ids) for the Transformer model.
batch_size = 32

dataset = TensorDataset(tokenized_articles, tokenized_summaries)
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=n_process)
dataloader = DataLoader(dataset, **loader_options)

In [18]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [19]:
# Small encoder-decoder Transformer trained from scratch on BERT word-piece ids.
# NOTE(review): dec_max_len is 512 here, while summaries are tokenized to 129
# tokens and the reload cell further down builds the model with dec_max_len=128 —
# confirm which value is intended.
modelTransformer = Transformer(
    # Take the padding id from the tokenizer so it matches the loss's ignore_index.
    pad_idx=tokenizer.pad_token_id,
    voc_size=tokenizer.vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,
    dec_max_len=512,
    ffn_hidden=128,
    n_layers=3,
)

In [21]:
# Train the Transformer with next-token prediction and save a checkpoint per epoch.
num_epochs = 25

# Move the model to the target device before creating the optimizer, as the
# torch.optim documentation recommends.
modelTransformer = modelTransformer.to(device)
# Padding positions in the target do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(modelTransformer.parameters(), lr=2e-4)

# Create the checkpoint directory up front; otherwise the first torch.save() would
# fail only after a complete epoch (over an hour) has already been computed.
os.makedirs("model_weights", exist_ok=True)

total_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()

    modelTransformer.train()
    total_loss = 0.0

    for step, batch in enumerate(dataloader):
        input_batch, summary_batch = batch
        input_batch = input_batch.to(device)
        summary_batch = summary_batch.to(device)

        # Decoder input is the summary without its last token and the target is
        # the summary without its first token, i.e. shifted by one position.
        decoder_input = summary_batch[:, :-1]
        target = summary_batch[:, 1:]

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = modelTransformer(input_batch.long(), decoder_input)

        # Compute loss over all (batch, position) pairs flattened together
        loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), target.reshape(-1))
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        if step % 5000 == 0:
            print(f"Epoch: {epoch+1}, Step: {step}, Loss: {loss.item():.4f}")

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(dataloader)

    # Calculate epoch duration
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    # Print epoch stats
    print(
        f"Epoch {epoch+1}/{num_epochs} - "
        f"Average Loss: {avg_loss:.4f} - "
        f"Time: {epoch_duration:.2f}s"
    )
    # One checkpoint per epoch so an interrupted run can still be evaluated.
    torch.save(
        modelTransformer.state_dict(),
        f"model_weights/transformer_weights_{epoch+1}_epochs.pth",
    )

# Calculate total training time
total_end_time = time.time()
total_training_time = total_end_time - total_start_time
print(f"Total training time: {total_training_time:.2f}s")

Epoch: 1, Step: 0, Loss: 8.7347
Epoch: 1, Step: 5000, Loss: 6.0884
Epoch: 1, Step: 10000, Loss: 5.5674
Epoch 1/25 - Average Loss: 5.8770 - Time: 4494.64s
Epoch: 2, Step: 0, Loss: 5.5321
Epoch: 2, Step: 5000, Loss: 5.0852
Epoch: 2, Step: 10000, Loss: 5.0708
Epoch 2/25 - Average Loss: 5.0969 - Time: 4484.52s
Epoch: 3, Step: 0, Loss: 5.1591
Epoch: 3, Step: 5000, Loss: 5.0886
Epoch: 3, Step: 10000, Loss: 4.8430
Epoch 3/25 - Average Loss: 4.8291 - Time: 4482.74s
Epoch: 4, Step: 0, Loss: 4.7744
Epoch: 4, Step: 5000, Loss: 4.4929
Epoch: 4, Step: 10000, Loss: 4.8716
Epoch 4/25 - Average Loss: 4.6679 - Time: 4482.84s
Epoch: 5, Step: 0, Loss: 4.3003
Epoch: 5, Step: 5000, Loss: 4.5971
Epoch: 5, Step: 10000, Loss: 4.7065
Epoch 5/25 - Average Loss: 4.5532 - Time: 4483.41s
Epoch: 6, Step: 0, Loss: 4.5081
Epoch: 6, Step: 5000, Loss: 4.3619
Epoch: 6, Step: 10000, Loss: 4.5513
Epoch 6/25 - Average Loss: 4.4645 - Time: 4483.30s
Epoch: 7, Step: 0, Loss: 4.2583
Epoch: 7, Step: 5000, Loss: 4.5234
Epoch: 7,

In [23]:
# Rebuild the Transformer and restore the weights saved after epoch 25.
# NOTE(review): the training cell builds the model with dec_max_len=512, this one
# with 128 — confirm the checkpoint is compatible with both.
modelTransformer = Transformer(
    pad_idx=tokenizer.pad_token_id,
    voc_size=tokenizer.vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,
    dec_max_len=128,
    ffn_hidden=128,
    n_layers=3,
)
modelTransformer.load_state_dict(
    torch.load(
        "model_weights/transformer_weights_25_epochs.pth",
        # Remap tensors so a checkpoint written on GPU also loads on a CPU-only machine.
        map_location=device,
        # A state_dict holds only tensors, so the restricted unpickler suffices.
        weights_only=True,
    )
)
# Inference mode: disables dropout.
modelTransformer.eval()

  torch.load("model_weights/transformer_weights_25_epochs.pth")


Transformer(
  (enc_embedding): TransformerEmbedding(
    (tok_emb): Embedding(30522, 128, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (dec_embedding): TransformerEmbedding(
    (tok_emb): Embedding(30522, 128, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-2): 3 x EncoderLayer(
      (attention): AttentionLayer(
        (w_q): Linear(in_features=128, out_features=128, bias=True)
        (w_k): Linear(in_features=128, out_features=128, bias=True)
        (w_v): Linear(in_features=128, out_features=128, bias=True)
        (w_o): Linear(in_features=128, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (linear1): Linear(in_features=128, out_features=128, bias=True)
      (linear2): Linear(in_features=128, out_features=128, bias=Tr

# **BERT model**

In [8]:
# Shuffled mini-batches for the BERT model; targets are the summaries without
# their first token.
batch_size = 8

dataset = TensorDataset(tokenized_articles, tokenized_summaries[:, 1:])
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=n_process)
dataloader = DataLoader(dataset, **loader_options)

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [10]:
# BERT hyper-parameters, grouped by what they control.
bert_hparams = dict(
    # Vocabulary and sequence length
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=512,
    # Network size
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    # Regularization
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)
config = BertConfig(**bert_hparams)

In [11]:
# Train the BERT-based summarizer and save a checkpoint per epoch.
modelBert = BertSummary(config)
# Move the model to the target device before creating the optimizer, as the
# torch.optim documentation recommends.
modelBert.to(device)
# Padding positions in the target do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(modelBert.parameters(), lr=1e-5)

num_epochs = 1

# Create the checkpoint directory up front; otherwise torch.save() would fail only
# after the whole epoch has already been computed.
os.makedirs("model_weights", exist_ok=True)

total_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()

    modelBert.train()
    total_loss = 0.0

    for step, batch in enumerate(dataloader):
        input_batch, summary_batch = batch
        input_batch = input_batch.to(device)
        summary_batch = summary_batch.to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass; attention is masked on padding positions of the article.
        outputs = modelBert(
            input_batch, attention_mask=input_batch.ne(tokenizer.pad_token_id)
        )

        # Compute loss; reshape (unlike view) also handles non-contiguous tensors
        # and matches the other training loops in this notebook.
        loss = loss_fn(
            outputs.reshape(-1, outputs.shape[-1]), summary_batch.reshape(-1)
        )
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        if step % 1000 == 0:
            print(f"Epoch: {epoch+1}, Step: {step}, Loss: {loss.item():.4f}")

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(dataloader)

    # Measure epoch time
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    print(
        f"Epoch {epoch+1}/{num_epochs} - "
        f"Average Loss: {avg_loss:.4f} - "
        f"Time: {epoch_duration:.2f}s"
    )
    # One checkpoint per epoch so an interrupted run can still be evaluated.
    torch.save(
        modelBert.state_dict(), f"model_weights/bert_weights_{epoch+1}_epochs.pth"
    )

# Measure total training time
total_end_time = time.time()
total_training_time = total_end_time - total_start_time
print(f"Total training time: {total_training_time:.2f}s")

Epoch: 1, Step: 0, Loss: 10.3945
Epoch: 1, Step: 1000, Loss: 7.4132
Epoch: 1, Step: 2000, Loss: 7.2836
Epoch: 1, Step: 3000, Loss: 7.3756
Epoch: 1, Step: 4000, Loss: 7.2549
Epoch: 1, Step: 5000, Loss: 6.9630
Epoch: 1, Step: 6000, Loss: 6.9224
Epoch: 1, Step: 7000, Loss: 7.1673
Epoch: 1, Step: 8000, Loss: 7.2004
Epoch: 1, Step: 9000, Loss: 7.1884
Epoch: 1, Step: 10000, Loss: 6.8686
Epoch: 1, Step: 11000, Loss: 7.0304
Epoch: 1, Step: 12000, Loss: 7.0979
Epoch: 1, Step: 13000, Loss: 6.9227
Epoch: 1, Step: 14000, Loss: 6.8044
Epoch: 1, Step: 15000, Loss: 6.7914
Epoch: 1, Step: 16000, Loss: 6.9166
Epoch: 1, Step: 17000, Loss: 6.7282
Epoch: 1, Step: 18000, Loss: 6.7682
Epoch: 1, Step: 19000, Loss: 6.8237
Epoch: 1, Step: 20000, Loss: 6.9106
Epoch: 1, Step: 21000, Loss: 6.9015
Epoch: 1, Step: 22000, Loss: 6.7871
Epoch: 1, Step: 23000, Loss: 6.6100
Epoch: 1, Step: 24000, Loss: 6.8327
Epoch: 1, Step: 25000, Loss: 6.6020
Epoch: 1, Step: 26000, Loss: 6.7958
Epoch: 1, Step: 27000, Loss: 6.6092
Epoc

KeyboardInterrupt: 

In [None]:
# Rebuild the BERT summarizer and restore the weights saved after the first epoch.
modelBert = BertSummary(config)
modelBert.load_state_dict(
    torch.load(
        # The training cell writes "bert_weights_{epoch+1}_epochs.pth", so the
        # epoch-1 checkpoint is "bert_weights_1_epochs.pth".
        "model_weights/bert_weights_1_epochs.pth",
        # Remap tensors so a checkpoint written on GPU also loads on a CPU-only machine.
        map_location=device,
        # A state_dict holds only tensors, so the restricted unpickler suffices.
        weights_only=True,
    )
)
# Inference mode: disables dropout.
modelBert.eval()

# **Evaluation**

In [13]:
rouge = evaluate.load("rouge")

In [24]:
# Pick one fixed example to compare the generated summary with the reference.
# NOTE(review): the row is taken from news_data, not test_data, so it may belong
# to the training split — confirm that this is intended.
sample_idx = 1000
input_text = news_data["Content"][sample_idx]
summary = news_data["Summary"][sample_idx]
print(summary)

he is instantly recognisable both from his appearance - the beard and the military fatigues - and from his first name alone : fidel .


In [26]:
# Options shared by both encodings: truncate or pad to a fixed length and return
# PyTorch tensors.
encode_options = dict(truncation=True, padding="max_length", return_tensors="pt")

# Article, as token ids of length 512.
encoded_input = tokenizer.encode_plus(input_text, max_length=512, **encode_options)
tokenized_input = encoded_input["input_ids"].long()

# Reference summary, as token ids of length 128.
encoded_summary = tokenizer.encode_plus(summary, max_length=128, **encode_options)
tokenized_summary = encoded_summary["input_ids"].long()

## Transformer

In [27]:
# Greedy autoregressive decoding of one summary with the trained Transformer.
max_summary_len = 128

# Decoder buffer: [CLS] followed by padding. It is created as int64 directly on
# the target device, so no dtype or device conversion is needed inside the loop.
start_summary = torch.full(
    (1, max_summary_len), tokenizer.pad_token_id, dtype=torch.long, device=device
)
start_summary[0, 0] = tokenizer.cls_token_id

# Move the model once and make sure dropout is disabled for inference.
modelTransformer = modelTransformer.to(device)
modelTransformer.eval()
source_ids = tokenized_input.long().to(device)

# Inference only: no_grad avoids building an autograd graph on every step.
with torch.no_grad():
    for k in range(max_summary_len - 1):
        output = modelTransformer(source_ids, start_summary)
        # The prediction at position k becomes the token at position k + 1.
        next_token = output.argmax(dim=-1)[:, k]
        start_summary[:, k + 1] = next_token
        # Stop once the model emits the end-of-sequence ([SEP]) token.
        if next_token.item() == tokenizer.sep_token_id:
            break
print(tokenizer.decode(start_summary[0].tolist(), skip_special_tokens=True))

cuba's president has been in the midst of a " revolutionary war ", cuba's president, fidel castro, cuba's cuba, cuba, cuba and cuba.
