In [1]:
!pip install --quiet transformers
!pip install --quiet sentencepiece
!pip install --quiet datasets
!pip install --quiet rouge_score

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from datasets import load_metric

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# %matplotlib inline
# %config InlineBackend.figure_format='retina'
# sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# rcParams['figure.figsize'] = 16, 10

In [None]:
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum", use_fast=False)


TypeError: Couldn't build proto file into descriptor pool: duplicate file name sentencepiece_model.proto

In [6]:

model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.

In [4]:
path_input = '../input/headlines/summary.csv'
df = pd.read_csv(path_input, nrows=50000)
df['summary'] = df['summary'].replace(r'\n', '', regex = True)
df.head()

Unnamed: 0,text,summary
0,و هو يعتبر متحف جديد العهد ، يقدم مجموعة من ال...,المتحف العربي للفن الحديث
1,يعتبر فيلاجيو مول من أهم المجمعات التجارية و أ...,فيلاجيو مول
2,هو المنطقة البحرية الممتدة على مجموعة كيلومترا...,كورنيش الدوحة
3,يعتبر هذا المنتجع من أجمل الأماكن الساحلية الم...,منتجع شاطئ سيلين
4,و هو منتجع مطل على البحر ، و يحوي العديد من أم...,منتجع الغارية


In [5]:
# text_token_counts = df['text'].apply(lambda x : len(tokenizer.encode(x)))
# summary_token_counts = df['summary'].apply(lambda x : len(tokenizer.encode(x)))

# fig, (ax1, ax2) = plt.subplots(1, 2)
# sns.boxplot(text_token_counts, ax=ax1)
# ax1.set_title('full text token counts')
# sns.boxplot(summary_token_counts, ax=ax2)
# ax2.set_title('summary text token counts')

In [6]:
class SummaryDataset(Dataset):
    def __init__(
        self,
        data=df,
        tokenizer=tokenizer,
        text_max_token_len = 200,
        summary_max_token_len = 12
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        text = data_row['text']

        text_encoding = tokenizer(
            text,
            max_length=self.text_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        summary_encoding = tokenizer(
            data_row['summary'],
            max_length=self.summary_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        
        labels = summary_encoding['input_ids']
        labels[labels == tokenizer.pad_token_id] = -100
        
        return dict(
            input_ids=text_encoding['input_ids'].flatten(),
            attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            decoder_attention_mask=summary_encoding['attention_mask'].flatten()
        )

In [7]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = SummaryDataset(data=df_train)
test_dataset = SummaryDataset(data=df_test)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2)
eval_dataloader = DataLoader(test_dataset, batch_size=2)

In [8]:
# torch.cuda.empty_cache()

In [9]:
num_epochs = 3

num_training_steps = num_epochs * len(train_dataloader)

optimizer = AdamW(model.parameters())
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=-1)
#         print(predictions)
#         print(batch["labels"])
        
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        
        optimizer.zero_grad()
        progress_bar.update()
    
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, f'./t5-Arabic.pth')

    print(f'epoch: {epoch + 1} -- loss: {loss}')

  0%|          | 0/60000 [00:00<?, ?it/s]

epoch: 1 -- loss: 3.1258676052093506
epoch: 2 -- loss: 4.5618743896484375
epoch: 3 -- loss: 2.331545352935791


In [10]:
metric= load_metric("rouge")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

{'rouge1': AggregateScore(low=Score(precision=0.347183333333333, recall=0.347183333333333, fmeasure=0.347183333333333), mid=Score(precision=0.35013333333333163, recall=0.35013333333333163, fmeasure=0.35013333333333163), high=Score(precision=0.3531168749999997, recall=0.3531168749999997, fmeasure=0.3531168749999997)),
 'rouge2': AggregateScore(low=Score(precision=0.14787159090908217, recall=0.14787159090908217, fmeasure=0.14787159090908217), mid=Score(precision=0.15066363636362662, recall=0.15066363636362662, fmeasure=0.15066363636362662), high=Score(precision=0.15334568181817262, recall=0.15334568181817262, fmeasure=0.15334568181817262)),
 'rougeL': AggregateScore(low=Score(precision=0.33744166666666625, recall=0.33744166666666625, fmeasure=0.33744166666666625), mid=Score(precision=0.34028333333333294, recall=0.34028333333333294, fmeasure=0.34028333333333294), high=Score(precision=0.3429666666666659, recall=0.3429666666666659, fmeasure=0.3429666666666659)),
 'rougeLsum': AggregateScore

In [11]:
def summarizeText(text, model=model):
    text_encoding = tokenizer(
        text,
        max_length=1000,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    generated_ids = model.generate(
        input_ids=text_encoding['input_ids'].to(device),
        attention_mask=text_encoding['attention_mask'].to(device),
        max_length=150,
        num_beams=4,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )    

    preds = [
            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for gen_id in generated_ids
    ]
    return "".join(preds)

In [12]:
text = """مصر او (رسميا: جمهورية مصر العربية) هي دولة عربية تقع في الركن الشمالي الشرقي من قارة افريقيا، ولديها امتداد اسيوي، حيث تقع شبه جزيرة سيناء داخل قارة اسيا فهي دولة عابرة للقارات، قدر  عربي"""

summary = summarizeText(text, model)

print(summary)

زيت الزيتون و العسل
