# Imports and Data Loading

In [1]:
# the notebook's main objective is to filter and prepare the dataset to train a summarizer on it.
import os, sys
from pathlib import Path
def _find_project_root(start, marker='src'):
    """Return the closest ancestor of `start` (inclusive) whose listing contains `marker`.

    Raises:
        FileNotFoundError: when the filesystem root is reached without finding
            `marker`. Without this check the walk never ends, because the parent
            of the root directory is the root directory itself.
    """
    candidate = Path(start)
    while marker not in os.listdir(candidate):
        if candidate.parent == candidate:
            raise FileNotFoundError(
                f"no '{marker}' directory found in '{start}' or any of its parents"
            )
        candidate = candidate.parent
    return candidate


# directory the notebook is executed from
HOME = os.getcwd()

# directory that contains the 'src' folder
current = _find_project_root(HOME)

PARENT_DIR = str(current)
DATA_FOLDER = os.path.join(PARENT_DIR, 'src', 'data')
data_path = os.path.join(DATA_FOLDER, 'filtered.tsv')

# Make the project importable from the notebook. Entries already present are
# skipped so that re-running the cell does not keep growing sys.path.
# NOTE(review): the sub-folder entries are built as <root>/<name> (without 'src'),
# while later cells import through 'src.<name>' -- confirm these entries are needed.
for _extra_path in (
    str(current),
    os.path.join(str(current), 'data_analysis'),
    os.path.join(str(current), 'evaluation'),
    os.path.join(str(current), 'text_processing'),
    os.path.join(str(current), 'models'),
):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)


In [2]:
from datasets import load_dataset
# location of the processed parallel corpus
processed_csv = os.path.join(DATA_FOLDER, 'all_data_processed.csv')


def _both_fields_are_text(row):
    """True when both the 'source' and the 'target' field of `row` are `str`."""
    return isinstance(row['source'], str) and isinstance(row['target'], str)


# read the csv as a single 'train' split and drop the rows with non-string fields
all_data = load_dataset('csv', data_files=processed_csv, split='train').filter(_both_fields_are_text)

FileNotFoundError: Unable to find 'c:\Users\m4mou\Desktop\pmldl\textDetoxification-main\src\data\all_data_processed.csv'

In [43]:
# display the dataset summary (feature names and number of rows)
all_data

Dataset({
    features: ['source', 'target'],
    num_rows: 597519
})

In [44]:
from src.text_processing import preprocess as pr
# the next step is to filter the dataset
def filter_data(sample):
    """Tell whether a single sample should be kept for the summarization data.

    Both the 'source' and the 'target' fields are tokenized with
    `pr.tokenize(..., tokenizer_type='word')`; the sample is kept only when the
    source has strictly more tokens than the target.
    """
    source_length = len(pr.tokenize(sample['source'], tokenizer_type='word'))
    target_length = len(pr.tokenize(sample['target'], tokenizer_type='word'))
    return source_length > target_length

# keep only the samples accepted by filter_data
summary_data = all_data.filter(filter_data)

# write the filtered samples next to the other data files
summary_csv = os.path.join(DATA_FOLDER, 'summarized_data.csv')
summary_data.to_csv(summary_csv, index=False)

Creating CSV from Arrow format:   0%|          | 0/277 [00:00<?, ?ba/s]

Creating CSV from Arrow format: 100%|██████████| 277/277 [00:01<00:00, 239.18ba/s]


31406141

In [45]:
import src.data_preparation.prepare_data as pdr
# Only the first SAMPLE_SIZE rows of the filtered data are used in the rest of the notebook.
# The size is capped at the dataset length so that `select` never receives an
# out-of-range index when fewer rows survive the filtering step.
SAMPLE_SIZE = 5000
sample = summary_data.select(range(min(SAMPLE_SIZE, len(summary_data))))

# split the sample into train / validation / test parts
train_data, val_data, test_data = pdr.data_split(all_data=sample)

# DataLoaders

In [46]:
# the next step is to tokenize the data  
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# pretrained checkpoint shared by the tokenizer and the model
CHECKPOINT = 't5-small'

# run on the GPU whenever one is visible to torch
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)

# load the seq2seq model, then move it to the selected device
MODEL = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
MODEL = MODEL.to(DEVICE)

In [47]:
# prefix prepended to every source sentence before tokenization
TASK_PREFIX = 'summarize: '


def prepare_labeled_data(batch):
    """Tokenize a batch of 'source' / 'target' pairs for a seq2seq model.

    Each source sentence gets TASK_PREFIX prepended and is tokenized with
    truncation at `max_length=1028`; the targets are tokenized through
    `text_target` with truncation and stored under the "labels" key.

    NOTE(review): 1028 looks like it may have been meant as 1024 -- confirm.

    Returns the tokenizer output for the sources with the extra "labels" entry.
    """
    # prepend the task prefix to every source sentence
    prefixed_sources = list(map(lambda text: TASK_PREFIX + text, batch["source"]))

    # encode the inputs, then the targets
    encoded = TOKENIZER(prefixed_sources, truncation=True, max_length=1028)
    encoded_targets = TOKENIZER(text_target=batch["target"], truncation=True)

    # the target ids are what the model is trained to predict
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [48]:
def _tokenize_split(split):
    """Apply prepare_labeled_data in batches and drop the raw text columns."""
    tokenized = split.map(prepare_labeled_data, batched=True)
    return tokenized.remove_columns(['source', 'target'])


train_data = _tokenize_split(train_data)
val_data = _tokenize_split(val_data)

In [49]:
# create a DataCollator that pads inputs and labels for sequence-to-sequence models
from transformers import DataCollatorForSeq2Seq
# `model` must be the model object itself (not the checkpoint name): the collator
# looks up `prepare_decoder_input_ids_from_labels` on it, which a plain string lacks.
data_collator = DataCollatorForSeq2Seq(tokenizer=TOKENIZER, model=MODEL)
# we are now ready to create the dataloaders
from torch.utils.data import DataLoader
# the training batches are shuffled, the validation batches keep the dataset order
train_dl = DataLoader(dataset=train_data, batch_size=4, shuffle=True, collate_fn=data_collator)
val_dl = DataLoader(dataset=val_data, batch_size=4, shuffle=False, collate_fn=data_collator)

In [50]:
# sanity check: pull one batch out of each dataloader to verify that collation works
b1 = next(iter(train_dl))
b2 = next(iter(val_dl))

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Model Training

## Training Code

## Train utilities

In [51]:
from src.evaluation import toxicity_classication as tc
# fetch the toxicity classifier, its tokenizer and the device from the project's singleton helper
singleton_obj = tc.EvalutionSingletonInitializer()
tx_classifier = singleton_obj.get_toxic_classifier()
tx_tokenizer = singleton_obj.get_toxic_tokenizer()
tx_device = singleton_obj.get_device()

# training parameters
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR

# lr: same value as the one used in pytorch_modular/pytorch_utilities.py (My_Kaggle_Repo)
optimizer = Adam(params=MODEL.parameters(), lr=2 * 10 ** -5)

# the learning-rate factor moves linearly from 1 down to 0.5 over the first 100 scheduler steps
scheduler = LinearLR(
    optimizer=optimizer,
    start_factor=1,
    end_factor=0.5,
    total_iters=100,
)

In [52]:
# let's write a function to compute the summarization + toxicity loss
from src.evaluation.toxicity_classication import EvalutionSingletonInitializer
from torch.nn.functional import softmax
from typing import Union

def toxic_summary_model_loss(output_decoded: Union[str, list],
                             device,
                             return_tensor: bool=False) -> Union[float, torch.Tensor]:
    """Score decoded text with the toxicity classifier.

    Args:
        output_decoded: decoded text (a string or a list of strings). It is handed
            directly to the toxicity tokenizer, so it must be text and not a tensor
            of token ids.
        device: device the classifier and the tokenized inputs are moved to.
        return_tensor: when True the loss is returned as a tensor flagged with
            `requires_grad=True`; otherwise it is returned as a Python float.

    Returns:
        The mean, over the inputs, of the softmax probability at index 1 of the
        classifier logits.
        NOTE(review): index 1 is presumably the 'toxic' class -- confirm against
        the classifier's label mapping.

    Note:
        The classifier parameters are frozen and its inputs come from re-tokenized
        text, so the returned tensor carries no graph back to the model that
        produced the text. Setting `requires_grad` only lets the value be combined
        with another loss before calling backward.
    """
    singleton_obj = EvalutionSingletonInitializer()
    tc_tokenizer, tc_classifier = singleton_obj.get_toxic_tokenizer(), singleton_obj.get_toxic_classifier()

    # the classifier is only used for scoring: freeze all of its parameters
    tc_classifier.requires_grad_(False)
    tc_classifier.to(device)

    # tokenize the text and move every input tensor to the classifier's device
    model_input = tc_tokenizer(output_decoded, return_tensors='pt', padding=True, truncation=True)
    model_input = {k: v.to(device) for k, v in model_input.items()}

    # pass through the classifier
    output = tc_classifier(**model_input)

    # average probability assigned to class index 1
    loss = torch.mean(softmax(output.logits, dim=1)[:, 1])

    if return_tensor:
        loss.requires_grad_(True)
        return loss

    return loss.item()


In [53]:
from src.models.summarizer import summarizer  as ss
# Module paths must be dot-separated: 'src.models/train.exp_tracking' is a SyntaxError.
# NOTE(review): assumes the module lives at src/models/train/exp_tracking.py -- confirm.
import src.models.train.exp_tracking as et
import importlib
# reload the project modules so that local edits are picked up without restarting the kernel
importlib.reload(ss)
importlib.reload(et)

# Fine-tune the summarizer with the toxicity term weighted by `toxicity_coeff`.
# Only the third returned value (bound to `best_model`) is used afterwards.
_, _, best_model = ss.train_custom_summarizer(train_dataloader=train_dl,
                                            val_dataloader=val_dl,
                                            summary_model=MODEL,
                                            summary_tokenizer=TOKENIZER,
                                            toxicity_loss_function=toxic_summary_model_loss,
                                            toxicity_coeff=0.5,
                                            optimizer=optimizer,
                                            scheduler=scheduler,
                                            num_epochs=20,
                                            report_per_epoch=1,
                                            # logs are written to a 'runs' folder inside the notebook's working directory
                                            log_dir=os.path.join(HOME, 'runs')
                                            )


[INFO] Created SummaryWriter, saving to: /home/ayhem18/DEV/TextDetoxification/src/data_analysis/runs/experience_3...


  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:51<16:18, 51.50s/it]

Training losses
train_loss: 2.058954809506734
val_loss: 1.811967327594757
val_toxic_loss: 0.4798521736264229
##################################################



 10%|█         | 2/20 [01:44<15:43, 52.41s/it]

Training losses
train_loss: 1.853737468868494
val_loss: 1.7395953011512757
val_toxic_loss: 0.43860718846321106
##################################################



 15%|█▌        | 3/20 [02:37<14:55, 52.66s/it]

Training losses
train_loss: 1.7814031597971915
val_loss: 1.7094623112678529
val_toxic_loss: 0.4199349391274154
##################################################



 20%|██        | 4/20 [03:32<14:15, 53.46s/it]

Training losses
train_loss: 1.7202352969845136
val_loss: 1.684651005268097
val_toxic_loss: 0.407940112259239
##################################################



 25%|██▌       | 5/20 [04:25<13:23, 53.54s/it]

Training losses
train_loss: 1.6878443964322407
val_loss: 1.6621418690681458
val_toxic_loss: 0.34025811709463594
##################################################



 30%|███       | 6/20 [05:18<12:22, 53.07s/it]

Training losses
train_loss: 1.6435355252275865
val_loss: 1.6507072162628174
val_toxic_loss: 0.3387550112465397
##################################################



 35%|███▌      | 7/20 [06:10<11:25, 52.76s/it]

Training losses
train_loss: 1.6143817332883676
val_loss: 1.6427064394950868
val_toxic_loss: 0.33577101639006285
##################################################



 40%|████      | 8/20 [07:03<10:35, 52.97s/it]

Training losses
train_loss: 1.5854780339697996
val_loss: 1.636137216091156
val_toxic_loss: 0.3092331942776218
##################################################



 45%|████▌     | 9/20 [07:55<09:40, 52.80s/it]

Training losses
train_loss: 1.5574309329191844
val_loss: 1.633069040775299
val_toxic_loss: 0.33350867104250936
##################################################



 50%|█████     | 10/20 [08:48<08:48, 52.84s/it]

Training losses
train_loss: 1.528621304978927
val_loss: 1.6335341763496398
val_toxic_loss: 0.3053720067953691
##################################################



 55%|█████▌    | 11/20 [09:42<07:57, 53.03s/it]

Training losses
train_loss: 1.5046447436511516
val_loss: 1.630578372478485
val_toxic_loss: 0.3123217681562528
##################################################



 60%|██████    | 12/20 [10:34<07:02, 52.79s/it]

Training losses
train_loss: 1.481734464118878
val_loss: 1.6248017978668212
val_toxic_loss: 0.3218809970607981
##################################################



 65%|██████▌   | 13/20 [11:28<06:12, 53.18s/it]

Training losses
train_loss: 1.46661146124204
val_loss: 1.6276494884490966
val_toxic_loss: 0.30002808939199893
##################################################



 70%|███████   | 14/20 [12:21<05:18, 53.08s/it]

Training losses
train_loss: 1.4381910422692696
val_loss: 1.6230375289916992
val_toxic_loss: 0.307290983046405
##################################################



 75%|███████▌  | 15/20 [13:13<04:24, 52.83s/it]

Training losses
train_loss: 1.4177744600673516
val_loss: 1.62771249294281
val_toxic_loss: 0.2944334503309801
##################################################



 80%|████████  | 16/20 [14:06<03:30, 52.70s/it]

Training losses
train_loss: 1.4031429247558116
val_loss: 1.630598373413086
val_toxic_loss: 0.28816035486292096
##################################################



 85%|████████▌ | 17/20 [14:58<02:37, 52.57s/it]

Training losses
train_loss: 1.3840631944686175
val_loss: 1.6306456446647644
val_toxic_loss: 0.2935291335778311
##################################################



 90%|█████████ | 18/20 [15:50<01:44, 52.31s/it]

Training losses
train_loss: 1.3683720229069392
val_loss: 1.6363922333717347
val_toxic_loss: 0.2932124473992735
##################################################



 95%|█████████▌| 19/20 [16:42<00:52, 52.18s/it]

Training losses
train_loss: 1.3503433749824763
val_loss: 1.633616714477539
val_toxic_loss: 0.30257517254445704
##################################################



100%|██████████| 20/20 [17:33<00:00, 52.69s/it]

Training losses
train_loss: 1.3333475915218393
val_loss: 1.6379020452499389
val_toxic_loss: 0.3058235085057095
##################################################






In [54]:
# Qualitative check: for every 20th validation sample print the source, the
# reference target and the text generated by the trained model.

# Run generation on whatever device the model lives on instead of assuming 'cuda',
# so the cell also works on CPU-only machines.
generation_device = next(best_model.parameters()).device

for i in range(0, len(val_data), 20):
    # read the row once instead of indexing the dataset for every field
    row = val_data[i]
    input_ids = row['input_ids']
    attention_mask = row['attention_mask']
    labels = row['labels']

    print(f"source: {TOKENIZER.decode(input_ids, skip_special_tokens=True)}")
    print(f"target: {TOKENIZER.decode(labels, skip_special_tokens=True)}")

    # the model expects a batch dimension: (sequence,) -> (1, sequence)
    outputs = best_model.generate(
        input_ids=torch.tensor(input_ids).unsqueeze(0).to(generation_device),
        attention_mask=torch.tensor(attention_mask).unsqueeze(0).to(generation_device),
        max_length=512,
        num_beams=5,
        early_stopping=True
    )

    print(f"generated :{TOKENIZER.decode(outputs[0], skip_special_tokens=True)}")
    print("#" * 100)


source: summarize: we don't fear these demons. we destroy them.
target: we don't fight demons, we destroy them.
generated :we don't fear these demons.
####################################################################################################
source: summarize: the old witch shook her head.
target: she shook her head.
generated :the old witch shook her head.
####################################################################################################
source: summarize: you want me to fucking leave her
target: you want to leave it
generated :you want me to leave her
####################################################################################################
source: summarize: why don't you get rid of her before you go, huh
target: why not make amends before you leave, huh
generated :why don't you get rid of her before you go
####################################################################################################
source: summarize: darn it. fractions a