In [None]:
%%capture
!pip install transformers
!pip install sentencepiece

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Running on: ', device)

Definición de funciones

In [None]:
# Dataset that tokenizes (review, topic) rows for seq2seq fine-tuning
class MyDataset(Dataset):
    """Map-style dataset yielding tokenized ``(input_ids, output_ids)`` pairs.

    For each row of ``data_df`` the ``'texts'`` column is encoded as the model
    input and the ``'topic'`` column as the target. Both encodings are padded
    and truncated to a fixed length, so samples can be stacked into a batch
    without any further padding.

    Args:
        data_df: DataFrame providing the ``'texts'`` and ``'topic'`` columns;
            rows are addressed positionally (``iloc``).
        tokenizer: Object exposing
            ``encode(text, max_length=..., padding=..., truncation=...)``.
            Presumably a Hugging Face tokenizer returning a list of token ids.
        max_input_len: Maximum number of *tokens* kept for the input text.
        max_output_len: Maximum number of tokens kept for the target text.
            Defaults to 16, the value that used to be hard-coded.
    """

    def __init__(self, data_df, tokenizer, max_input_len, max_output_len=16):
        self.data_df = data_df
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        """Return the number of rows (samples) in the underlying DataFrame."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return ``(input_ids, output_ids)`` for the row at position ``idx``."""
        row = self.data_df.iloc[idx]
        review = row['texts']
        topic = row['topic']

        # The text is deliberately NOT sliced by characters here:
        # `max_input_len` is a token budget, and cutting the string to that
        # many characters would discard most of the text that still fits.
        # Truncation is left to the tokenizer (`truncation=True`).
        input_ids = self.tokenizer.encode(
            review,
            max_length=self.max_input_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
        )
        output_ids = self.tokenizer.encode(
            topic,
            max_length=self.max_output_len,
            padding='max_length',
            truncation=True,
        )
        return input_ids, output_ids

###############################
###############################

def collate_fn(batch):
    """Stack per-sample token-id sequences into two batch tensors.

    Args:
        batch: Sequence of ``(input_ids, output_ids)`` pairs as produced by
            the dataset; within a batch all ``input_ids`` must share one
            length and all ``output_ids`` must share one length.

    Returns:
        Tuple ``(input_ids, output_ids)`` of ``torch.long`` tensors with one
        row per sample. The tensors stay on the CPU: the consumer loops move
        each batch to the target device themselves, and keeping device
        transfers out of the collate function keeps it usable with
        multi-process data loading and pinned memory.
    """
    # Split the batch into its input and target sequences
    input_ids = [sample[0] for sample in batch]
    output_ids = [sample[1] for sample in batch]

    # Convert the Python lists to integer tensors (token ids)
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    output_ids = torch.tensor(output_ids, dtype=torch.long)

    return input_ids, output_ids

Import data - Train

In [None]:
# Load the raw data and drop rows that cannot be used for training.
# Rows without a topic have no target, and rows without text would otherwise
# be turned into the literal string 'nan' by the astype(str) cast below.
data_df = pd.read_excel('/content/customer_reviews_data.xlsx')
data_df = data_df.dropna(subset=['topic', 'texts'])

label = 'improvement'

# Keep only the rows of the selected label and the columns used downstream.
# Building the frame in a single chain (instead of assigning columns on a
# sliced frame) yields an independent DataFrame and avoids
# SettingWithCopyWarning.
reviews_df = (
    data_df.loc[data_df.label_raw == label, ['RespondentID', 'texts', 'topic', 'subtopic']]
    .reset_index(drop=True)
    .astype({'texts': str, 'topic': str, 'subtopic': str})
)

# Split into train / validation / test: 20% is held out for testing, then
# 10% of the remainder is held out for validation. Fixed seeds keep the
# split reproducible.
train_df, test_df = train_test_split(reviews_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Positional indexing (iloc) is used later, so give every split a clean
# 0..n-1 index; rebinding also makes each split its own frame.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(train_df.shape)

In [None]:
# Load the tokenizer and the pre-trained model, then move the model to `device`
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model = model.to(device)

max_input_len = 512
topic_sel = 'topic'


def _build_loader(frame, shuffle):
    """Wrap `frame` in a MyDataset and a DataLoader with batches of 8 samples."""
    dataset = MyDataset(frame, tokenizer, max_input_len=max_input_len)
    loader = DataLoader(dataset, batch_size=8, shuffle=shuffle, collate_fn=collate_fn)
    return dataset, loader


# Training data is reshuffled every epoch; validation and test keep row order
train_dataset, train_dataloader = _build_loader(train_df, shuffle=True)
val_dataset, val_dataloader = _build_loader(val_df, shuffle=False)
test_dataset, test_dataloader = _build_loader(test_df, shuffle=False)

# Fine-tune the model with the training DataLoader
NUM_EPOCHS = 30
LEARNING_RATE = 1e-4
IGNORE_INDEX = -100  # label value that the model's loss skips

# Seed torch so that batch shuffling and dropout are repeatable across runs
torch.manual_seed(42)


def _batch_loss(batch):
    """Return the model loss for one ``(input_ids, output_ids)`` batch.

    Padding positions are masked on both sides: the attention mask stops the
    encoder from attending to pad tokens, and pad positions in the labels are
    replaced with ``IGNORE_INDEX`` so they do not contribute to the loss.
    """
    input_ids, output_ids = (tensor.to(device) for tensor in batch)
    pad_id = tokenizer.pad_token_id
    attention_mask = (input_ids != pad_id).long()
    labels = output_ids.masked_fill(output_ids == pad_id, IGNORE_INDEX)
    return model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss


optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # max(..., 1) guards against an empty loader (division by zero)
    train_loss /= max(len(train_dataloader), 1)

    # Validate the model (no gradients needed)
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for batch in val_dataloader:
            val_loss += _batch_loss(batch).item()
        val_loss /= max(len(val_dataloader), 1)

    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")

In [None]:
# Generate a topic prediction for every review in the test set
model.eval()
title_list = []
for batch in test_dataloader:
    # Only the inputs are needed for generation; the targets are ignored
    input_ids = batch[0].to(device)
    # Inputs are padded to a fixed length, so mask the pad positions to keep
    # the encoder from attending to them
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    generated_titles = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=16,
        num_beams=4,
        no_repeat_ngram_size=2,
    )
    # Decode the whole batch at once, dropping special tokens (pad/eos)
    title_list.extend(tokenizer.batch_decode(generated_titles, skip_special_tokens=True))

# Store the predictions in the test DataFrame (the loader is not shuffled, so
# predictions line up with the rows of test_df)
test_df['topic_result'] = title_list


In [None]:
# Persist the test-set predictions and the fine-tuned model one directory
# above the current working directory
results_path = '../result_data.xlsx'
model_dir = "../t5_model_improvement"
test_df.to_excel(results_path, index=False)
model.save_pretrained(model_dir)