In [1]:
#PyTorch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch

#transformer
from transformers import BertForMaskedLM, DistilBertTokenizer, DistilBertModel, BertTokenizer, BertForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput

#dataset
from datasets import load_dataset
import pandas as pd
import numpy as np

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#utils
from sklearn import metrics
from tqdm.notebook import tqdm_notebook

## Load datasets

In [2]:
id_to_label = {
    0: "Juan Jouglard",
    1: "Mopi",
    2: "Juan Dimatz",
    3: "Agus Streiten",
    4: "Agus Superi",
    5: "Mati Gonzalez",
    6: "Bian Artola",
    7: "German Gomez",
    8: "Juan Olmedo",
    9: "Mati Garcia",
    10: "Igna Suburu",
    11: "Tomi Felder",
    12: "Masa",
}

label_to_id = {
    "Juan Jouglard": 0,
    "Mopi": 1,
    "Juan Dimatz": 2,
    "Agus Streiten": 3,
    "Agus Superi": 4,
    "Mati Gonzalez": 5,
    "Bian Artola": 6,
    "German Gomez": 7,
    "Juan Olmedo": 8,
    "Mati Garcia": 9,
    "Igna Suburu": 10,
    "Tomi Felder": 11,
    "Masa": 12,
}

In [3]:
def transform_to_one_hot_encoding(label):
    one_hot = np.zeros(13, dtype=int)
    if label in label_to_id.keys():
        one_hot[label_to_id[label]] = 1
    return one_hot

In [4]:
#alloy_ds = load_dataset("csv", data_files={"train": "../wp/train_alloy_ds.csv", "test": "../wp/test_alloy_ds.csv"})
alloy_ds = pd.read_csv("../wp/alloy_dataset")
alloy_ds["label"] = alloy_ds["label"].map(transform_to_one_hot_encoding)

In [5]:
alloy_ds = alloy_ds.tail(1000) #Keep only first 100 rows for educational purposes

In [6]:
alloy_ds

Unnamed: 0,label,text
69470,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",ahora sí voy a ir
69471,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",para joderte a vos nomás
69472,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",sos puro humo bro
69473,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",hagan esa tierlist asi nos cagamos bien a piñas
69474,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",el problema es que me gusta todo
...,...,...
70465,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",por obvios motivos todavía no estoy en condici...
70466,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",yo tengo oficina por ahora
70467,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",me gusta ña idea
70468,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",pero no tengo note del laburo


## Load model and tokenizer

In [7]:
# create the tokenizer and the model

tokenizer = BertTokenizer.from_pretrained("pytorch/", do_lower_case=False, num_labels=13)
#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
base_model = BertForSequenceClassification.from_pretrained("pytorch/")
#base_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
e = base_model.eval()

Some weights of the model checkpoint at pytorch/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Custom parameters

In [8]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05

## Tokenize dataset

In [9]:
def tokenize_message(text):
        t = tokenizer(text, padding="max_length",add_special_tokens=True, truncation=True, max_length=MAX_LEN, return_token_type_ids=True)
        return t

## Custom Dataset class

In [10]:
class WhatsappDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)
    
    def tokenize_message(self, text):
        t = self.tokenizer(text,
                           padding="max_length",
                           add_special_tokens=True,
                           truncation=True,
                           max_length=MAX_LEN,
                           return_token_type_ids=True)
        return t

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenize_message(
            text
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


### Create proper datasets

In [11]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=alloy_ds.sample(frac=train_size,random_state=200)
test_dataset=alloy_ds.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(alloy_ds.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = WhatsappDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = WhatsappDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (1000, 2)
TRAIN Dataset: (800, 2)
TEST Dataset: (200, 2)


### Create dataloaders

In [12]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### Create custom model

In [13]:
class WhatsappPredictor(torch.nn.Module):
    def __init__(self):
        super(WhatsappPredictor, self).__init__()
        self.l1 = base_model
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(31002, 13)
        self.l4 = torch.nn.Softmax(dim=0)
    
    def forward(self, ids, mask, token_type_ids):
        output_1, = self.l1(
            ids,
            attention_mask = mask,
            token_type_ids=token_type_ids,
            return_dict=False)
        index = torch.tensor([0]).to(device)
        pool_output = torch.index_select(output_1, 1, index).squeeze() #Keeps only the output for the first character in each sentence
        drop_out = self.l2(pool_output)
        #print("Output 1", pool_output.shape) # tensor[batch_size][max_length][vocab_size] 4x200x31000
        linear_output = self.l3(drop_out)
        probability = self.l4(linear_output)
        #output = self.l3(output_2)
        return probability

model = WhatsappPredictor()
model.to(device)

WhatsappPredictor(
  (l1): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(31002, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
          

### Loss function and optimizer

In [14]:
def loss_fn(outputs, targets):
    label = targets.argmax(1)
    return torch.nn.CrossEntropyLoss()(outputs, label)

In [15]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## Train model

In [16]:
def train(epoch):
    model.train()
    for _,data in enumerate(tqdm_notebook(training_loader), 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        
        outputs = model(ids, mask, token_type_ids)
        #print(f"Model output: {outputs.shape}")
        
        #print(f"Model output: {outputs[0]}")
        #print(f"Actual target: {targets[0]}")
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

In [17]:
for epoch in range(EPOCHS):
    train(epoch)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0, Loss:  2.611661195755005


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 1, Loss:  2.6951305866241455


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 2, Loss:  2.5038177967071533


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 3, Loss:  2.206146001815796


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 4, Loss:  2.2506237030029297


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 5, Loss:  2.4155356884002686


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 6, Loss:  2.3714358806610107


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 7, Loss:  2.1632080078125


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 8, Loss:  2.200254440307617


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 9, Loss:  2.1568808555603027


## Validating model

In [18]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(tqdm_notebook(testing_loader), 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [19]:
outputs, targets = validation(1)
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

  0%|          | 0/50 [00:00<?, ?it/s]

Accuracy Score = 0.0
F1 Score (Micro) = 0.1402002861230329
F1 Score (Macro) = 0.13407803658367115


In [20]:
def predict(text):
    tokens = tokenize_message(text)
    input_ids = torch.tensor([tokens["input_ids"]], dtype=torch.long).to(device, dtype = torch.long)
    attention_mask = torch.tensor([tokens["attention_mask"]], dtype=torch.long).to(device, dtype = torch.long)
    token_type_ids = torch.tensor([tokens["token_type_ids"]], dtype=torch.long).to(device, dtype = torch.long)
    
    with torch.no_grad():
        ids = input_ids.to(device, dtype = torch.long)
        mask = attention_mask.to(device, dtype = torch.long)
        token_type_ids = token_type_ids.to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids)
  
    return id_to_label[outputs.argmax().item()]

In [21]:
predict("se recontra pico")

'Juan Dimatz'