In [None]:
!pip install transformers -q
!pip install wandb -q
!pip install SentencePiece -q
!wandb login


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import TFMT5Model, T5Tokenizer , MT5ForConditionalGeneration
import wandb
import pandas as pd
from sklearn.model_selection import train_test_split

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


In [8]:
# Create a custom dataset for reading the dataframe and loading it into the dataloader 
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes (source text, summary) pairs.

    Args:
        dataframe: Object exposing a ``ctext`` column (text to summarize) and a
            ``text`` column (reference summary), e.g. a pandas DataFrame.
        tokenizer: Hugging Face style tokenizer; it is called as
            ``tokenizer(texts, max_length=..., padding=..., truncation=...,
            return_tensors='pt')`` and must return ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension.
        source_len: Fixed token length of every encoded source.
        summ_len: Fixed token length of every encoded summary.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    @staticmethod
    def _clean(column, index):
        """Return entry ``index`` of ``column`` as a whitespace-normalised string."""
        # DataLoader samplers produce positions 0..len-1, so pandas columns are
        # read positionally; a label lookup fails when the frame's index was
        # not reset. Plain sequences keep working through normal indexing.
        value = column.iloc[index] if hasattr(column, 'iloc') else column[index]
        return ' '.join(str(value).split())

    def _encode(self, sentence, max_length):
        """Tokenize one sentence, padded and truncated to exactly ``max_length``."""
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
        # truncation is requested explicitly so every item has the same length
        # and can be stacked into a batch.
        return self.tokenizer(
            [sentence],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded tensors for the pair at position ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (the same ids as ``target_ids``), each a 1-D
            ``torch.long`` tensor.
        """
        source = self._encode(self._clean(self.ctext, index), self.source_len)
        target = self._encode(self._clean(self.text, index), self.summ_len)

        # Drop only the batch dimension; a bare squeeze() would also collapse
        # a length-1 sequence into a 0-dim tensor.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [9]:

def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training epoch over ``loader``.

    Args:
        epoch: Epoch number, used only in the progress message.
        tokenizer: Tokenizer whose ``pad_token_id`` marks padding in the targets.
        model: Seq2seq model called with ``input_ids``, ``attention_mask`` and
            ``labels``; element 0 of its output is taken as the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: Optimizer stepped once per batch.
    """
    model.train()
    pad_id = tokenizer.pad_token_id
    for step, data in enumerate(loader):
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)
        y = data['target_ids'].to(device, dtype=torch.long)

        # Use the full target as labels and let the model derive the decoder
        # inputs by shifting them right behind the decoder start token. Feeding
        # y[:, :-1] by hand meant the decoder never saw the start token and the
        # first summary token was never a prediction target, which does not
        # match how generate() decodes.
        # Work on a copy: `.to()` may return the batch tensor itself.
        labels = y.clone()
        labels[y == pad_id] = -100  # -100 is ignored by the loss

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]

        # Report every 10th step to both W&B and stdout.
        if step % 10 == 0:
            wandb.log({"Training Loss": loss.item()})
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
      

In [10]:
def validate(epoch, tokenizer, model, device, loader, max_length=150, num_beams=2):
    """Generate summaries for every batch in ``loader`` and decode them.

    Args:
        epoch: Accepted for call-site symmetry with ``train``; not used.
        tokenizer: Tokenizer whose ``decode`` turns token ids back into text.
        model: Model exposing ``eval()`` and ``generate(...)``.
        device: Device the source tensors are moved to.
        loader: Iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        max_length: Maximum length passed to ``generate`` (default 150, the
            value that used to be hard-coded).
        num_beams: Beam width passed to ``generate`` (default 2, as before).

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings, one entry per
        sample, in loader order.
    """
    model.eval()
    predictions = []
    actuals = []

    def _to_text(token_ids):
        return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    with torch.no_grad():
        for step, data in enumerate(loader):
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )

            # Reference ids are only decoded, so they are not moved to the device.
            predictions.extend(_to_text(g) for g in generated_ids)
            actuals.extend(_to_text(t) for t in data['target_ids'])

            if step % 100 == 0:
                print(f'Completed {step}')

    return predictions, actuals

In [None]:
# Read the raw CSV and prepend the summarization task prefix to the `Text` column.
# NOTE(review): the following cell re-reads the same file and rebinds `df`, so
# this cell looks redundant; confirm before relying on it.
df = pd.read_csv("Small2_dataset.csv", encoding="utf8")
df['Text'] = df['Text'].radd("summarize: ")

In [12]:
# Load the dataset and keep a small subset for a quick fine-tuning run.
df = pd.read_csv('Small2_dataset.csv', encoding='utf-8')
df.columns = ['ctext', 'text']  # ctext = text to summarize, text = reference summary

# Task prefix prepended to every source text.
df['ctext'] = 'summarize: ' + df['ctext']

# Keep only the first 30 rows. Assigning a truncated column back
# (df['ctext'] = df['ctext'][:30]) does not drop rows: pandas aligns on the
# index and fills the remaining rows with NaN, which would later be fed to the
# model as the literal string 'nan'.
df = df.iloc[:30].copy()

# Replace sample 3 with sample 29.
# NOTE(review): the reason for this swap is not visible in the notebook.
# `.loc` writes into the frame itself; chained `df['ctext'][3] = ...` may write
# to a temporary copy instead.
df.loc[3, 'ctext'] = df.loc[29, 'ctext']
df.loc[3, 'text'] = df.loc[29, 'text']
print(df.head())

                                               ctext  \
0  summarize: افرد ذراعك أمامك مع ثني الكوع. يجب ...   
1  summarize: قد تتمكن من جعل طعم الأرز العادي يش...   
2  summarize: الكلب المصاب بالجفاف سيعمل على الوص...   
3  summarize: ينبغي التحضير بحسب إذا كان الجو شدي...   
4  summarize: لتنظيف أنفك، سد فتحة أنف واحدة بالم...   

                                                text  
0  ضع المرطب على شكل خط أعلى كل من الساعدين وظهر ...  
1  اعلم ما يجب عليك توقعه. غط القدر بغطاء ودعه يغ...  
2  راقب سلوك الكلب. افحص مؤخرة عنق الكلب. افحص لث...  
3  تأكد من ارتداء أطفالك لملابس مناسبة. تعرف على ...  
4  تنظيف أنفك بشكل صحيح. الحصول على الراحة. الحصو...  


In [13]:

# Start a W&B run; the hyperparameters below are stored on the run's config.
wandb.init(project="transformers_tutorials_summarization")

config = wandb.config
_hyperparameters = {
    'TRAIN_BATCH_SIZE': 1,   # batch size of the training loader
    'VALID_BATCH_SIZE': 1,   # batch size of the validation loader
    'TRAIN_EPOCHS': 1,       # passes over the training set
    'VAL_EPOCHS': 1,         # passes over the validation set
    'LEARNING_RATE': 1e-4,   # optimizer learning rate
    'SEED': 42,              # seed for torch / numpy and the train/val split
    'MAX_LEN': 512,          # token length of the encoded source text
    'SUMMARY_LEN': 150,      # token length of the encoded summary
}
for _name, _value in _hyperparameters.items():
    setattr(config, _name, _value)

# Seed both random number generators and ask cuDNN for deterministic kernels.
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(config.SEED)
torch.backends.cudnn.deterministic = True


tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")

# Creation of the datasets and dataloaders

# Strip stray whitespace from the column names BEFORE splitting, so the
# training and validation frames both expose the `ctext` / `text` columns that
# CustomDataset reads (previously only the training frame was normalised).
df_stripped = df.rename(columns=str.strip)

# 80/20 split: sample the training rows, use the remaining rows for validation,
# then reset both indexes to 0..n-1.
train_size = 0.8
train_dataset = df_stripped.sample(frac=train_size, random_state=config.SEED)
val_dataset = df_stripped.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Create the datasets consumed by the dataloaders
training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

# Parameters for the dataloaders
train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Create the dataloaders
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)



# Load the pretrained mT5-small checkpoint and move it to the selected device.
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small").to(device)



In [None]:

import os

# Define the optimizer that updates the model parameters during fine-tuning.
optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)

# Let W&B hook into the model (log="all").
wandb.watch(model, log="all")

print('Fine-Tuning for the model on our dataset')

for epoch in range(config.TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before the predictions are written.
os.makedirs('./models', exist_ok=True)

for epoch in range(config.VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
    final_df.to_csv('./models/predictions.csv')
    print('Output Files generated for review')


In [None]:
# Summarize a single text typed in by the user.
string = input("Enter text")

# CustomDataset reads the `ctext` / `text` columns of a dataframe, so wrap the
# input in a one-row frame. The source gets the same 'summarize: ' prefix the
# training data carries; there is no reference summary, hence the empty target.
test_frame = pd.DataFrame({'ctext': ['summarize: ' + string], 'text': ['']})
Test = CustomDataset(test_frame, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

# Iterate over the user's sample (not the validation set) and pass the loader
# options as keyword arguments; a dict passed positionally would be taken as
# `batch_size`.
Test_loader = DataLoader(
    Test,
    batch_size=config.VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# The epoch argument is not used by validate(); pass a constant instead of
# relying on a loop variable leaked from an earlier cell.
predictions, actuals = validate(0, tokenizer, model, device, Test_loader)
print('Generated Text' ,predictions)
print('Actual Text',actuals)