In [1]:
#PyTorch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch

#transformer
from transformers import BertForSequenceClassification, DistilBertTokenizer, DistilBertModel, BertTokenizer
from transformers.modeling_outputs import SequenceClassifierOutput

#dataset
from datasets import load_dataset
import pandas as pd
import numpy as np

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#utils
from sklearn import metrics, preprocessing
from tqdm.notebook import tqdm_notebook

## Load datasets

In [2]:
def transform_to_one_hot_encoding(encoder, label, number_of_labels):
    one_hot = np.zeros(number_of_labels, dtype=int)
    index = encoder.transform(label)
    one_hot[index] = 1
    return one_hot

In [3]:
#alloy_ds = load_dataset("csv", data_files={"train": "../wp/train_alloy_ds.csv", "test": "../wp/test_alloy_ds.csv"})
alloy_ds = pd.read_csv("./alloy_dataset.csv")
#alloy_ds["label"] = alloy_ds["label"].map(transform_to_one_hot_encoding)

In [4]:
#alloy_ds = alloy_ds.tail(1000) #Keep only first 100 rows for educational purposes

In [5]:
label_enc = preprocessing.LabelEncoder()
#label_enc.fit(alloy_ds["label"])
#label_enc.classes_
label_enc.fit(alloy_ds["label"])

LabelEncoder()

In [6]:
parsed_dataset = alloy_ds.copy()
parsed_dataset["label"] = alloy_ds["label"].apply(lambda x: transform_to_one_hot_encoding(label_enc, [x], len(label_enc.classes_)))
parsed_dataset

Unnamed: 0,label,text
0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",amiguitos nuevos
1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",quien hace un resumen
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",quiero que copies toda la conversación bian
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",yo si querés hago desde el comentario 1 al 12
4,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",jajajajajaja
...,...,...
70465,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",por obvios motivos todavía no estoy en condici...
70466,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",yo tengo oficina por ahora
70467,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",me gusta ña idea
70468,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",pero no tengo note del laburo


## Load model and tokenizer

In [7]:
# create the tokenizer and the model

tokenizer = BertTokenizer.from_pretrained("pytorch/", do_lower_case=False, num_labels=len(label_enc.classes_))
#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
base_model = BertForSequenceClassification.from_pretrained("pytorch/", num_labels=len(label_enc.classes_))
#base_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
e = base_model.eval()

Some weights of the model checkpoint at pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pytorch/

### Custom parameters

In [8]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-05

## Tokenize dataset

In [9]:
def tokenize_message(text):
        t = tokenizer(text, padding="max_length",add_special_tokens=True, truncation=True, max_length=MAX_LEN, return_token_type_ids=True)
        return t

## Custom Dataset class

In [10]:
class WhatsappDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)
    
    def tokenize_message(self, text):
        t = self.tokenizer(text,
                           padding="max_length",
                           add_special_tokens=True,
                           truncation=True,
                           max_length=MAX_LEN,
                           return_token_type_ids=True)
        return t

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenize_message(
            text
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


### Create proper datasets

In [11]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=parsed_dataset.sample(frac=train_size,random_state=200)
test_dataset=parsed_dataset.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(parsed_dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = WhatsappDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = WhatsappDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (70470, 2)
TRAIN Dataset: (56376, 2)
TEST Dataset: (14094, 2)


### Create dataloaders

In [12]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### Create custom model

In [13]:
class WhatsappPredictor(torch.nn.Module):
    def __init__(self):
        super(WhatsappPredictor, self).__init__()
        self.l1 = base_model
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(31002, 13)
        self.l4 = torch.nn.Softmax(dim=0)
    
    def forward(self, ids, mask, token_type_ids):
        output_1, = self.l1(
            ids,
            attention_mask = mask,
            token_type_ids=token_type_ids,
            return_dict=False)
        #index = torch.tensor([0]).to(device)
        #pool_output = torch.index_select(output_1, 1, index).squeeze() #Keeps only the output for the first character in each sentence
        #drop_out = self.l2(pool_output)
        #print("Output 1", pool_output.shape) # tensor[batch_size][max_length][vocab_size] 4x200x31000
        #linear_output = self.l3(drop_out)
        #probability = self.l4(linear_output)
        #output = self.l3(output_2)
        return output_1

model = WhatsappPredictor()
model.to(device)

WhatsappPredictor(
  (l1): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(31002, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tr

### Loss function and optimizer

In [14]:
def loss_fn(outputs, targets):
    label = targets.argmax(1)
    return torch.nn.CrossEntropyLoss()(outputs, label)

In [15]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## Train model

In [16]:
def train(epoch):
    model.train()
    for _,data in enumerate(tqdm_notebook(training_loader), 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.int)
        
        outputs = model(ids, mask, token_type_ids)
        #print(f"Model output: {outputs.shape}")
        
        #print(f"Model output: {outputs[0]}")
        #print(f"Actual target: {targets[0]}")
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [17]:
for epoch in range(EPOCHS):
    train(epoch)

  0%|          | 0/7047 [00:00<?, ?it/s]

Epoch: 0, Loss:  2.0491652488708496


  0%|          | 0/7047 [00:00<?, ?it/s]

Epoch: 1, Loss:  2.3487930297851562


  0%|          | 0/7047 [00:00<?, ?it/s]

Epoch: 2, Loss:  1.764507532119751


  0%|          | 0/7047 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Validating model

In [None]:
def validation(epoch):
    total_tests = 0
    correct_tests = 0
    
    model.eval()
    with torch.no_grad():
        for index, data in enumerate(tqdm_notebook(testing_loader), 0):
            total_tests += 1
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.int)
            outputs = model(ids, mask, token_type_ids)
            if (targets.argmax().item() == outputs.argmax().item()):
                print(f"Correct test: {test_dataset.loc[index]}")
                correct_tests += 1
    return total_tests, correct_tests

In [None]:
total_tests, correct_tests = validation(1)
print(f"Total tests: {total_tests} - Correct tests: {correct_tests} - accuracy: {correct_tests/total_tests}")
#outputs = np.array(outputs) >= 0.5
#accuracy = metrics.accuracy_score(targets, outputs)
#f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
#f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
#print(f"Accuracy Score = {accuracy}")
#print(f"F1 Score (Micro) = {f1_score_micro}")
#print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
def predict(text):
    tokens = tokenize_message(text)
    input_ids = torch.tensor([tokens["input_ids"]], dtype=torch.long).to(device, dtype = torch.long)
    attention_mask = torch.tensor([tokens["attention_mask"]], dtype=torch.long).to(device, dtype = torch.long)
    token_type_ids = torch.tensor([tokens["token_type_ids"]], dtype=torch.long).to(device, dtype = torch.long)
    
    with torch.no_grad():
        ids = input_ids.to(device, dtype = torch.long)
        mask = attention_mask.to(device, dtype = torch.long)
        token_type_ids = token_type_ids.to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids)
  
    return label_enc.inverse_transform(outputs.cpu().argmax(1))