In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MBartForConditionalGeneration, MBartTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

c:\Anaconda\envs\Cuda\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Anaconda\envs\Cuda\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
DF = pd.read_json('../Datasets/AIC Val/ds.jsonl', lines=True)
DF.head()
# DF = pd.read_csv('../Datasets\WikiHow\wikiHow.csv', nrows=2000)
# DF.columns = ['summary', 'paragraph']
# DF

Unnamed: 0,example_id,paragraph,summary
0,0,وتحت عنوان من الكارثة إلى التحدى يبدأ الكاتب ع...,يبدأ الكاتب عرض الكتاب الرابع تحت عنوان من الك...
1,1,ولم يعترف دبلوماسيو هاتين الدولتين بالعريضة ال...,دبلوماسيو الدولتين لم يعترفوا بالعريضة التي قا...
2,2,قامت ولاية حلب بعد اعلان الجنرال الفرنسي هنري ...,أعلن غورو الانتداب الفرنسي على سوريا لكي يعاقب...
3,3,دولة مصر العربيه هي ليست اي دوله وليست اي شعب ...,مصر هي أم البلاد، وقائدة العرب؛ فهي أرض بلاد ا...
4,4,السوريون يصرون على استقلال بلادهم : و مثلما رف...,الشعب السوري يصر على استقلال بلدهم من السيطرة ...


In [4]:
Paragraphs = DF['paragraph'].tolist()
Summaries = DF['summary'].tolist()

In [5]:
Train_Paragraphs, Test_Paragraphs, Train_Summaries, Test_Summaries = train_test_split(Paragraphs, Summaries, test_size=0.2, random_state=42)
Train_Paragraphs, Validation_Paragraphs, Train_Summaries, Validation_Summaries = train_test_split(Train_Paragraphs, Train_Summaries, test_size=0.2, random_state=42)

In [6]:
Model_Name = "facebook/mbart-large-50"
Tokenizer = MBartTokenizer.from_pretrained(Model_Name)
Model = MBartForConditionalGeneration.from_pretrained(Model_Name).to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
Paragraph_Max_Length = 512
Train_Paragraph_Encodings = Tokenizer(Train_Paragraphs, truncation=True, max_length=Paragraph_Max_Length, padding="max_length", return_tensors="pt").to(device)
Validation_Paragraph_Encodings = Tokenizer(Validation_Paragraphs, truncation=True, max_length=Paragraph_Max_Length, padding="max_length", return_tensors="pt").to(device)
Test_Paragraph_Encodings = Tokenizer(Test_Paragraphs, truncation=True, max_length=Paragraph_Max_Length, padding="max_length", return_tensors="pt").to(device)

Summary_Max_Length = 64
Train_Summary_Encodings = Tokenizer(Train_Summaries, truncation=True, max_length=Summary_Max_Length, padding="max_length", return_tensors="pt").to(device)
Validation_Summary_Encodings = Tokenizer(Validation_Summaries, truncation=True, max_length=Summary_Max_Length, padding="max_length", return_tensors="pt").to(device)
Test_Summary_Encodings = Tokenizer(Test_Summaries, truncation=True, max_length=Summary_Max_Length, padding="max_length", return_tensors="pt").to(device)

In [8]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        self.input_ids = input_ids.to(device)
        self.attention_mask = attention_mask.to(device)
        self.decoder_input_ids = decoder_input_ids.to(device)
        self.decoder_attention_mask = decoder_attention_mask.to(device)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "decoder_input_ids": self.decoder_input_ids[idx],
            "decoder_attention_mask": self.decoder_attention_mask[idx]
        }

In [9]:
Train_Dataset = CustomDataset(
    Train_Paragraph_Encodings['input_ids'],
    Train_Paragraph_Encodings['attention_mask'],
    Train_Summary_Encodings['input_ids'],
    Train_Summary_Encodings['attention_mask']
)

Validation_Dataset = CustomDataset(
    Validation_Paragraph_Encodings['input_ids'],
    Validation_Paragraph_Encodings['attention_mask'],
    Validation_Summary_Encodings['input_ids'],
    Validation_Summary_Encodings['attention_mask']
)

Test_Dataset = CustomDataset(
    Test_Paragraph_Encodings['input_ids'],
    Test_Paragraph_Encodings['attention_mask'],
    Test_Summary_Encodings['input_ids'],
    Test_Summary_Encodings['attention_mask']
)

In [10]:
Train_Dataloader = DataLoader(Train_Dataset, batch_size=1)
Validation_Dataloader = DataLoader(Validation_Dataset, batch_size=1)
Test_Dataloader = DataLoader(Test_Dataset, batch_size=1)

In [11]:
Scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

In [12]:
Optimizer = torch.optim.AdamW(Model.parameters(), lr=1e-5)
Criterion = torch.nn.CrossEntropyLoss().to(device)

Epochs_Number = 5

for Epoch in range(Epochs_Number):
    Model.train()
    Total_Loss = 0

    Total_Rouge1 = 0.0
    Total_Rouge2 = 0.0
    Total_RougeL = 0.0
    Total_Batches = 0
    with tqdm(Train_Dataloader, desc=f"Epoch {Epoch + 1}/{Epochs_Number}",  unit="batch") as t:
        for Batch in t:
            Input_IDs = Batch["input_ids"].to(device)
            Attention_Mask = Batch["attention_mask"].to(device)
            Decoder_Input_IDs = Batch["decoder_input_ids"].to(device)
            Decoder_Attention_Mask = Batch["decoder_attention_mask"].to(device)

            Outputs = Model(
                input_ids=Input_IDs,
                attention_mask=Attention_Mask,
                decoder_input_ids=Decoder_Input_IDs,
                decoder_attention_mask=Decoder_Attention_Mask,
                labels=Decoder_Input_IDs
            )

            # IDs = Tokenizer.batch_decode(Input_IDs, skip_special_tokens=True)
            # decoded_output = Tokenizer.batch_decode(Outputs.logits.argmax(dim=-1), skip_special_tokens=True)
            # print(IDs)
            # print(decoded_output)

            Loss = Outputs.loss
            Total_Loss += Loss.item()

            Optimizer.zero_grad()
            Loss.backward()
            Optimizer.step()

            Reference_sentences = Tokenizer.batch_decode(Decoder_Input_IDs, skip_special_tokens=True)
            Generated_sentences = Tokenizer.batch_decode(Outputs.logits.argmax(dim=-1), skip_special_tokens=True)

            Scores = Scorer.score(''.join(Reference_sentences), ''.join(Generated_sentences))
            Total_Rouge1 += Scores['rouge1'].fmeasure
            Total_Rouge2 += Scores['rouge2'].fmeasure
            Total_RougeL += Scores['rougeL'].fmeasure
            Total_Batches += 1

            t.set_postfix(
                Loss_Average=Total_Loss / len(Train_Dataloader),
                loss=Loss.item(),
                Rouge1_Average=Total_Rouge1 / Total_Batches,
                Rouge2_Average=Total_Rouge2 / Total_Batches,
                RougeL_Average=Total_RougeL / Total_Batches
            )

Epoch 1/5:   0%|          | 0/98 [00:00<?, ?batch/s]

Epoch 1/5:   6%|▌         | 6/98 [00:41<10:36,  6.92s/batch, Loss_Average=0.496, Rouge1_Average=0, Rouge2_Average=0, RougeL_Average=0, loss=7.07]


KeyboardInterrupt: 

In [12]:
Model_Save_Path = 'mBart.pt'

In [None]:
torch.save(Model.state_dict(), Model_Save_Path)

In [13]:
Model.load_state_dict(torch.load(Model_Save_Path, map_location=torch.device(device)))

<All keys matched successfully>

In [14]:
Sentence_Transformer_Model = SentenceTransformer('distiluse-base-multilingual-cased')

In [27]:
Total_Loss = 0.0
Total_Rouge1 = 0.0
Total_Rouge2 = 0.0
Total_RougeL = 0.0
Total_Similarity = 0.0
Total_Variance = 0.0
Similarity_Average = 0
Variance_Average = 0
Total_Batches = 0
with tqdm(Test_Dataloader, desc="Testing: ",  unit="batch") as t:
    for Batch in t:
        Input_IDs = Batch["input_ids"].to(device)
        Attention_Mask = Batch["attention_mask"].to(device)
        Decoder_Input_IDs = Batch["decoder_input_ids"].to(device)
        Decoder_Attention_Mask = Batch["decoder_attention_mask"].to(device)


        Outputs = Model(
            input_ids=Input_IDs,
            attention_mask=Attention_Mask,
            labels=Decoder_Input_IDs
        )

        Input_Text = Tokenizer.batch_decode(Input_IDs, skip_special_tokens=True)
        Generated_sentences = Tokenizer.batch_decode(Outputs.logits.argmax(dim=-1), skip_special_tokens=True)
        # print(Input_Text)
        # print(Generated_sentences)

        Loss = Outputs.loss
        Total_Loss += Loss.item()

        Loss.backward()

        Reference_sentences = Tokenizer.batch_decode(Decoder_Input_IDs, skip_special_tokens=True)

        Scores = Scorer.score(''.join(Reference_sentences), ''.join(Generated_sentences))
        Total_Rouge1 += Scores['rouge1'].fmeasure
        Total_Rouge2 += Scores['rouge2'].fmeasure
        Total_RougeL += Scores['rougeL'].fmeasure
        Total_Batches += 1


        Embeddings = Sentence_Transformer_Model.encode([str(Input_Text), str(Generated_sentences)], convert_to_tensor=True)
        Cos_Sim = util.pytorch_cos_sim(Embeddings[0], Embeddings[1])
        Cos_Sim_Value = Cos_Sim.item()
        Total_Similarity += Cos_Sim_Value
        Similarity_Average = Total_Similarity / Total_Batches

        t.set_postfix(
            Loss_Average=Total_Loss / len(Train_Dataloader),
            loss=Loss.item(),
            Rouge1_Average=Total_Rouge1 / Total_Batches,
            Rouge2_Average=Total_Rouge2 / Total_Batches,
            RougeL_Average=Total_RougeL / Total_Batches,
            Semantic_Similarity_Average = Similarity_Average,
            Variance_Average = Variance_Average
        )

        Total_Variance += (Cos_Sim_Value - Similarity_Average) ** 2
        Variance_Average = Total_Variance / Total_Batches

Testing: 100%|██████████| 31/31 [01:52<00:00,  3.64s/batch, Loss_Average=7.54, Rouge1_Average=0.226, Rouge2_Average=0.0968, RougeL_Average=0.226, Semantic_Similarity_Average=0.679, Variance_Average=0.0118, loss=23.1] 
