In [None]:
# Importing libraries
import os
import logging
import numpy as np
import pandas as pd
import torch
from torch import cuda
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the mT5 modules from huggingface/transformers
from transformers import T5Tokenizer, MT5Model, MT5Config, MT5ForConditionalGeneration

In [None]:
# Logger
# Records go to two places: a UTF-8 log file (appended to across runs) and the
# console. Both handlers are built explicitly and installed in a single
# basicConfig call with force=True, which first removes any handlers already
# attached to the root logger. That keeps this cell idempotent: re-running it
# no longer stacks an extra console handler (duplicated log lines), and the
# file handler is installed even if the root logger was configured earlier.
file_handler = logging.FileHandler('finetunning.log', encoding = 'utf-8')
file_handler.setFormatter(
    logging.Formatter(
        '%(asctime)s :  %(message)s',
        datefmt  = '%m/%d/%Y %I:%M:%S %p'
    )
)

# set up logging to console
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
# set a format which is simpler for console use
formatter = logging.Formatter(
    '%(asctime)s %(message)s',
    datefmt  = '%m/%d/%Y %I:%M:%S %p'
)
console.setFormatter(formatter)

# install both handlers on the root logger; the root level (INFO) is the
# effective threshold for both outputs
logging.basicConfig(
    level    = logging.INFO,
    handlers = [file_handler, console],
    force    = True
)
logger = logging.getLogger(__name__)

In [None]:
# Hyper-parameters and settings shared by the whole fine-tuning pipeline.
model_params = dict(
    MODEL                  = "google/mt5-small",  # checkpoint to fine-tune (mt5-base / mt5-large also possible)
    TRAIN_BATCH_SIZE       = 8,                   # batch size used while training
    VALID_BATCH_SIZE       = 8,                   # batch size used while validating
    TRAIN_EPOCHS           = 2,                   # how many passes over the training split
    VAL_EPOCHS             = 1,                   # how many passes over the validation split
    LEARNING_RATE          = 2e-4,                # optimizer learning rate
    MAX_SOURCE_TEXT_LENGTH = 512,                 # source texts are padded/truncated to this many tokens
    MAX_TARGET_TEXT_LENGTH = 258,                 # target texts are padded/truncated to this many tokens
    SEED                   = 42,                  # random seed for reproducibility
)

In [None]:
# Reproducibility: seed NumPy and PyTorch from the shared configuration and
# ask cuDNN to use deterministic algorithms.
seed_value = model_params["SEED"]
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.backends.cudnn.deterministic = True

In [None]:
class SummaryDataSet(Dataset):
    """
    Dataset used to fine-tune an mT5 model on a summarization task.

    Each item is one (source text, target text) pair taken from two columns
    of a DataFrame, whitespace-normalized, tokenized, and padded/truncated
    to fixed lengths.

    Args:
        dataframe: frame holding the source and target columns.
        tokenizer: callable tokenizer; it is invoked with a list of texts and
            must return a mapping with 'input_ids' and 'attention_mask'
            tensors of shape (1, max_length).
        source_len: token length every source text is padded/truncated to.
        target_len: token length every target text is padded/truncated to.
        source_text: name of the column holding the source texts.
        target_text: name of the column holding the target texts.
    """
    def __init__(
        self, 
        dataframe: pd.DataFrame, 
        tokenizer: "T5Tokenizer", 
        source_len: int, 
        target_len: int, 
        source_text: str, 
        target_text: str,
    ) -> None:
        
        self.tokenizer   = tokenizer
        self.source_len  = source_len
        self.summ_len    = target_len
        self.target_text = dataframe[target_text]
        self.source_text = dataframe[source_text]
    
    def _encode(self, text: str, max_length: int):
        """Tokenize one text, padded/truncated to exactly `max_length` tokens."""
        # padding="max_length" already pads to max_length; the deprecated
        # pad_to_max_length flag is therefore not passed.
        return self.tokenizer(
            [text], 
            max_length     = max_length, 
            truncation     = True, 
            padding        = "max_length", 
            return_tensors = 'pt'
        )
    
    def __getitem__(
        self, index: int
    ) -> dict[str, torch.Tensor]:
        """
        Return the tokenized pair stored at position `index`.

        Returns:
            dict with 1-D long tensors: 'source_ids', 'source_mask',
            'target_ids' and 'target_ids_y' (the same values as 'target_ids').
        """
        # Positional access: works whatever index the frame carries (e.g. a
        # sampled or filtered frame whose index was not reset).
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # collapse every run of whitespace (including newlines) to one space
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.summ_len)

        # Drop only the batch dimension; a bare squeeze() would also collapse
        # a sequence of length 1 into a scalar.
        source_ids  = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids  = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }
    
    def __len__(self) -> int:
        """Number of (source, target) pairs in the dataset."""
        return len(self.target_text)

In [None]:
def train(
    model: MT5ForConditionalGeneration, 
    tokenizer: T5Tokenizer, 
    epoch: int, 
    loader: DataLoader, 
    optimizer: torch.optim.Optimizer, 
    device: str = 'cuda' if cuda.is_available() else 'cpu'
) -> None:
    """
    Run one training epoch over `loader`, updating `model` in place.

    Args:
        model: model being fine-tuned; it is called with input_ids,
            attention_mask and labels, and the first element of its output
            is used as the loss.
        tokenizer: only its `pad_token_id` is read, to mask padding in the labels.
        epoch: zero-based epoch number, used only in the progress log.
        loader: iterable of batches with 'source_ids', 'source_mask' and
            'target_ids' tensors.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
    """
    
    model.train()
    for step, data in enumerate(loader):

        ids  = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        # The whole target sequence is the label. The model derives the
        # decoder inputs itself by shifting the labels right and prepending
        # its decoder start token, so the first target token is learned too.
        # Slicing the targets by hand (inputs y[:, :-1], labels y[:, 1:])
        # never trains the first token, which generate() has to produce from
        # the start token.
        # clone() so masking does not modify the batch tensor when .to() is a no-op.
        lm_labels = data['target_ids'].to(device, dtype = torch.long).clone()
        # -100 marks positions the loss must ignore (padding)
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids      = ids, 
            attention_mask = mask, 
            labels         = lm_labels
        )
        loss = outputs[0]
        
        # progress log every 10 batches
        if step % 10 == 0:
            logger.info(f'Epoch: {epoch+1} | Loss: {str(round(float(loss), 3))}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(
    model: MT5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
    epoch: int, 
    loader: DataLoader,
    device: str = 'cuda' if cuda.is_available() else 'cpu'
) -> tuple[list, list]:    
    """
    Generate a summary for every batch in `loader` and collect the results.

    Args:
        model: model used for generation; switched to eval mode here.
        tokenizer: used to decode generated and reference token ids to text.
        epoch: accepted for interface compatibility; not used in the body.
        loader: iterable of batches with 'source_ids', 'source_mask' and
            'target_ids' tensors.
        device: device the batch tensors are moved to.

    Returns:
        (predictions, actuals): two lists of decoded strings, aligned by
        position — generated text and reference text respectively.
    """
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for it, data in enumerate(loader):
            y    = data['target_ids'].to(device, dtype = torch.long)
            ids  = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
              input_ids          = ids,
              attention_mask     = mask, 
              max_length         = 150, 
              num_beams          = 2,
              repetition_penalty = 2.5, 
              length_penalty     = 1.0, 
              early_stopping     = True
            )

            preds  = [
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) 
                for g in generated_ids
            ]
            target = [
                tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) 
                for t in y
            ]

            # progress log every 10 batches; the counter is the loop index
            # `it` (the name `_` is not defined inside this function)
            if it % 10 == 0:
                logger.info(f'Completed {it}')

            predictions.extend(preds)
            actuals.extend(target)
            
    return predictions, actuals

In [None]:
def mT5Trainer(
    dataframe: pd.DataFrame, 
    source_text: str, 
    target_text: str, 
    model_params: dict, 
    output_dir: str ="/data/imeza/text_datasets/outputs_mT5/",
    device: str = 'cuda' if cuda.is_available() else 'cpu'
) -> None:
    """
    Fine-tune the configured model on `dataframe`, save it, and write the
    predictions for the held-out split.

    Steps: load tokenizer and model named by model_params["MODEL"]; split the
    frame 80/20 into training and validation sets; train for
    model_params["TRAIN_EPOCHS"] epochs; save model and tokenizer under
    `<output_dir>/model_files`; generate on the validation split and write
    `<output_dir>/predictions.csv`.

    Args:
        dataframe: frame containing the source and target columns.
        source_text: name of the column with the input texts.
        target_text: name of the column with the reference summaries.
        model_params: settings dict; the keys read here are MODEL, SEED,
            MAX_SOURCE_TEXT_LENGTH, MAX_TARGET_TEXT_LENGTH, TRAIN_BATCH_SIZE,
            VALID_BATCH_SIZE, LEARNING_RATE, TRAIN_EPOCHS and VAL_EPOCHS.
        output_dir: directory receiving the saved model and predictions;
            created if it does not exist.
        device: device the model and batches are placed on.
    """

    # Resolve the output locations once and make sure the directory exists
    # before any expensive work, so an unwritable path fails immediately
    # rather than after training has finished.
    model_path       = os.path.join(output_dir, "model_files")
    predictions_path = os.path.join(output_dir, 'predictions.csv')
    os.makedirs(output_dir, exist_ok=True)

    # logging
    logger.info(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Load the checkpoint named in model_params["MODEL"] with its
    # conditional-generation head and move it to the target device.
    model = MT5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # logging
    logger.info(f"[Data]: Reading data...\n")

    # Keep only the two columns that are used
    dataframe = dataframe[[source_text,target_text]]

    # 80% of the rows (sampled with the configured seed) are used for
    # training, the remaining rows for validation. Both frames get a fresh
    # 0..n-1 index.
    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset   = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    logger.info(f"FULL Dataset : {dataframe.shape}")
    logger.info(f"TRAIN Dataset: {train_dataset.shape}")
    logger.info(f"TEST Dataset : {val_dataset.shape}\n")

    # Wrap both splits in datasets that tokenize on access
    training_set = SummaryDataSet(
        train_dataset, 
        tokenizer, 
        model_params["MAX_SOURCE_TEXT_LENGTH"], 
        model_params["MAX_TARGET_TEXT_LENGTH"], 
        source_text, target_text
    )
    val_set      = SummaryDataSet(
        val_dataset, tokenizer, 
        model_params["MAX_SOURCE_TEXT_LENGTH"], 
        model_params["MAX_TARGET_TEXT_LENGTH"], 
        source_text, target_text
    )

    # Training batches are shuffled; validation batches keep their order so
    # predictions line up with the rows of val_dataset.
    training_loader = DataLoader(
        training_set,
        batch_size  = model_params["TRAIN_BATCH_SIZE"],
        shuffle     = True,
        num_workers = 0
    )
    val_loader      = DataLoader(
        val_set,
        batch_size  = model_params["VALID_BATCH_SIZE"],
        shuffle     = False,
        num_workers = 0
    )

    # Optimizer that updates the model weights during training
    optimizer = torch.optim.Adam(params=model.parameters(), lr=model_params["LEARNING_RATE"])

    # Training loop
    logger.info(f'[Initiating Fine Tuning]...\n')

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(model, tokenizer, epoch, training_loader, optimizer, device)

    # Save model and tokenizer side by side after training
    logger.info(f"[Saving Model]...\n")
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    # Generate on the validation split; each pass overwrites predictions.csv
    logger.info(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(model, tokenizer, epoch, val_loader, device)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv(predictions_path)

    # Report where logs really go: the files behind the root logger's file
    # handlers. (Nothing in this function writes a logs.txt in output_dir.)
    log_files = [
        handler.baseFilename
        for handler in logging.getLogger().handlers
        if isinstance(handler, logging.FileHandler)
    ]
    log_location = ', '.join(log_files) if log_files else 'console only (no file handler configured)'

    logger.info(f"[Validation Completed.]\n")
    logger.info(f"[Model] Model saved @ {model_path}\n")
    logger.info(f"[Validation] Generation on Validation data saved @ {predictions_path}\n")
    logger.info(f"[Logs] Logs saved @ {log_location}\n")

In [None]:
# Load the summarization corpus and put the task prefix in front of every
# source text.
df = pd.read_parquet("/data/imeza/text_datasets/data_summarization_with_title.parquet")
df = df.assign(text="summarize: " + df["text"])

In [None]:
# Preview the first rows of the prefixed frame
df.head(5)

In [None]:
# Fine-tune on the prefixed articles, using their titles as target summaries
mT5Trainer(
    dataframe    = df,
    source_text  = "text",
    target_text  = "title",
    model_params = model_params,
)

In [None]:
# Directory holding the fine-tuned weights.
# NOTE(review): mT5Trainer saves into <output_dir>/model_files; confirm that
# the '28' sub-directory used here really contains the checkpoint to load.
path = '/data/imeza/text_datasets/outputs_mT5/model_files/28'

config = MT5Config.from_pretrained(model_params['MODEL'])
# Load from the directory rather than from a hard-coded
# "pytorch_model.bin" file: from_pretrained then picks up whichever weight
# file save_pretrained produced (e.g. safetensors), while the explicit
# config keeps this working even if the directory has no config.json.
trained_model = MT5ForConditionalGeneration.from_pretrained(path, config=config)


In [None]:
# Sample article. The literal starts directly with the "summarize: " task
# prefix (no leading newline or stray quotation mark) so the prompt has the
# same shape as the inputs used for fine-tuning.
text = """summarize: La entrega de los premios Musa del domingo pasado habría generado un conflicto que involucra a artistas urbanos como AK4:20, Pailita y Polimá. En la ceremonia, Pailita ganó como Mejor Artista Urbano y junto a Polimá ganaron Mejor Colaboración y Canción del Año por "Ultra Solo". Mientras Young Cister también se llevó dos galardones.Sin embargo, en Instagram reportaron que el equipo de AK4:20 tuvo duras palabras para los premios Musa. "Su premios 'kls' son 'si me das algo te nomino'. Son entero falsos programas 'ktm'", escribieron en el equipo del hombre de "Rata tan tan", terminando con "Telebasura". Mientras tanto, el manager del artista Bayron Fire también subió una historia con el texto: "Premios Musa son más chanta que la ropa de La Polar". Otro que escribió fue Balbi El Chamako, que entre otras cosas afirmó que "por culpa de los apitutados o 'wnes' cuicos que tienen cantando a sus regalones (...) ocupan espacios que se los merecían los que de verdad sí lucharon" y que "la mafia detrás del género, la desigualdad y el clasismo siempre nos dividirán". Por su parte, Pailita solo subió un escueto mensaje en su Instagram que dice: "Si un compañero mío gana, yo también gano. Lo felicito y me alegro por él. Esa es la hermandad"."""

tokenizer = T5Tokenizer.from_pretrained(model_params['MODEL'])
# Truncate to the source length used during fine-tuning so long articles are
# presented to the model the same way as the training inputs.
input_ids = tokenizer(
    text,
    max_length     = model_params["MAX_SOURCE_TEXT_LENGTH"],
    truncation     = True,
    return_tensors = "pt"
).input_ids
 
# Sampling-based decoding; top_k=0 turns top-k filtering off, so tokens are
# drawn from the full temperature-scaled distribution.
generated_ids = trained_model.generate(input_ids, do_sample=True, 
    max_length=50, 
    top_k=0, 
    temperature=0.7
)

# One input sequence -> one output sequence: take row 0 explicitly.
summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(summary)

In [None]:
# Re-read the corpus and keep a fixed random sample of 100 rows
df = pd.read_parquet("/data/imeza/text_datasets/data_summarization_with_title.parquet")
df = df.sample(n=100, random_state=42)

In [None]:
# Add the task prefix only to rows that do not carry it yet, so running this
# cell more than once does not produce "summarize: summarize: ..." inputs.
# Missing texts count as "not prefixed" (na=False) and stay missing after the
# concatenation, as they would with a plain "summarize: " + df["text"].
prefix = "summarize: "
needs_prefix = ~df["text"].str.startswith(prefix, na=False)
df.loc[needs_prefix, "text"] = prefix + df.loc[needs_prefix, "text"]

In [None]:
# Position (not index label) of the sampled row to inspect
i = 3
text, title = (df[column].iloc[i] for column in ("text", "title"))

In [None]:
# Truncate to the source length used during fine-tuning so long articles are
# presented to the model the same way as the training inputs.
input_ids = tokenizer(
    text,
    max_length     = model_params["MAX_SOURCE_TEXT_LENGTH"],
    truncation     = True,
    return_tensors = "pt"
).input_ids
 
# Sampling-based decoding; top_k=0 turns top-k filtering off, so tokens are
# drawn from the full temperature-scaled distribution.
generated_ids = trained_model.generate(input_ids, do_sample=True, 
    max_length=50, 
    top_k=0, 
    temperature=0.7
)

# One input sequence -> one output sequence: take row 0 explicitly.
summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# Show the reference title next to the generated summary
print(f'\nTitulo original: {title}')
print(f'\nResumen del modelo: {summary}')