In [1]:
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam
from transformers import BertTokenizer, BertForSequenceClassification, logging
import pandas as pd
import torch

In [2]:
# Disable logging warning
logging.set_verbosity_error()

In [3]:
tokenizer = BertTokenizer.from_pretrained("gaunernst/bert-tiny-uncased", do_lower_case=True)
raw_data = pd.read_csv("data.csv")

In [4]:
raw_data

Unnamed: 0,Favours,None,Hater,Freaky
0,"Hey, I was wondering if you could help me move...","Hey, what’s up?",There’s no cure for your level of stupidity.,Yo this ta actually had the sexiest mustache
1,Do you have a minute? I need a small favor: co...,Morning! How’s it going?,Watching you try to think is like watching pai...,Double cheeked up**
2,"I hate to ask, but could you pick up my dry cl...",What’s new today?,"If ignorance were a crime, you’d be serving a ...",âHeâs cheeked upâ - Moulikð
3,Quick question: could you help me with my car ...,Just checking in—how are you?,The world would be better off if you stayed qu...,the only thing sexier than him in that pic is me
4,Would you mind lending me a hand with rearrang...,Have you eaten yet? Pancakes here were a win.,You’re like a software update—completely unnec...,hes got a dumpy
...,...,...,...,...
331,,at home,,
332,,iâd invite you if you were in cali,,
333,,I think I'm supposed to say things are moving ...,,
334,,yes,,


In [5]:
# Getting max encoding length
max_length = 0

for category in raw_data:
    for sentence in raw_data[category]:
        
        if (str(sentence) != "nan"):
            
            sentence = sentence.replace("?", "").replace(".", "")
            
            encoding = tokenizer.encode(sentence)

            max_length = max(len(encoding), max_length)

print(max_length)

33


In [6]:
# Gathering data
data = []
labels = []
masks = []

labels_dict = {
    "None" : 0,
    "Favours" : 1,
    "Hater" : 2,
    "Freaky" : 3
}

for category in raw_data:

    for sentence in raw_data[category]:
        
        if (str(sentence) != "nan"):
            
            sentence = sentence.replace("?", "").replace(".", "")
            
            encoding = tokenizer.encode_plus(
                sentence,                     
                add_special_tokens = True, 
                max_length = max_length,
                padding='max_length',
                return_attention_mask = True, 
                return_tensors = 'pt',
                truncation = True
            )
            
            data.append(encoding["input_ids"])
            labels.append(labels_dict[category])
            masks.append(encoding["attention_mask"])

# Converting to tensors
data = torch.cat(data)
labels = torch.tensor(labels)
masks = torch.cat(masks)
        
print(len(data), len(labels), len(masks))

922 922 922


In [7]:
# Creating dataloader
dataset = TensorDataset(data, labels, masks)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initializing model
model = BertForSequenceClassification.from_pretrained(
    "gaunernst/bert-tiny-uncased", 
    num_labels = 4,
    output_attentions = False,
    output_hidden_states = False,
)

model.cuda()
model.train()

# Initializing optimizer
optimizer = Adam(model.parameters(),
  lr = 1e-5,
  eps = 1e-8
)

# Train for 35 epochs
for i in range(35):
    
    c_loss = 0
    correct = 0
    total = 0
    
    correct_val = 0
    total_val = 0
    
    flag = False
    
    for batch in dataloader:
        
        data_batch, labels_batch, masks_batch = batch

        # Moving batch tensors to GPU
        data_batch = data.to("cuda")
        labels_batch = labels.to("cuda")
        masks_batch = masks.to("cuda")

        model.zero_grad() 

        # Getting model output
        x = model(data_batch, token_type_ids=None, attention_mask=masks_batch, labels=labels_batch)

        # Getting batch loss & predictions
        loss_batch = x.loss
        pred_batch = torch.argmax(torch.nn.functional.softmax(x.logits, dim=1), dim=1)

        # Calculating accuracy
        correct += (pred_batch == labels_batch).sum().item()
        total += len(data_batch)

        # Calculating loss
        c_loss += x.loss.item()

        # Optimizing
        loss_batch.backward()
        optimizer.step()
        
    print("Epoch loss:", c_loss, "Epoch accuracy:", correct / total)
    print()


Epoch loss: 38.952208042144775 Epoch accuracy: 0.3622933652479617

Epoch loss: 38.207788825035095 Epoch accuracy: 0.36857655770813075

Epoch loss: 37.277395367622375 Epoch accuracy: 0.3768793477447827

Epoch loss: 36.29804742336273 Epoch accuracy: 0.4106141072630713

Epoch loss: 35.25243937969208 Epoch accuracy: 0.4867604158875009

Epoch loss: 34.105157256126404 Epoch accuracy: 0.5660857206971351

Epoch loss: 32.64750123023987 Epoch accuracy: 0.6061410726307128

Epoch loss: 31.22864079475403 Epoch accuracy: 0.6248784501458599

Epoch loss: 29.87157893180847 Epoch accuracy: 0.6484030219163737

Epoch loss: 28.53951507806778 Epoch accuracy: 0.6792953848455382

Epoch loss: 27.35513937473297 Epoch accuracy: 0.7129927444087067

Epoch loss: 26.09543615579605 Epoch accuracy: 0.749308100830279

Epoch loss: 24.932298958301544 Epoch accuracy: 0.7767970678435185

Epoch loss: 23.778303742408752 Epoch accuracy: 0.8006582392101129

Epoch loss: 22.58139204978943 Epoch accuracy: 0.8220884134939038

Epoc

In [8]:
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [9]:
model.to("cpu")
model.eval()

def predict_on_sentence(sentence):
    
    labels_dict_reverse = {
        0 : "None",
        1 : "Favour",
        2 : "Hater",
        3 : "Freaky"
    }
    
    softmax_out = torch.argmax(torch.nn.functional.softmax(
         model(torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)).logits, 
         dim=1
    ))
    
    return labels_dict_reverse[softmax_out.item()]
    

In [15]:
print(predict_on_sentence("can i borrow your charger"))
print(predict_on_sentence("bro fuck you"))
print(predict_on_sentence("imma touch you"))
print(predict_on_sentence("I'm a sigma"))

Favour
Hater
Freaky
None


In [11]:
import time

# Creating test batch
test = []

for i in range(1000):
    encoding = tokenizer.encode_plus(
        "you are so stupid",                     
        add_special_tokens = True, 
        max_length = max_length,
        padding='max_length',
        return_attention_mask = True, 
        return_tensors = 'pt',
        truncation = True
    )
    
    test.append(encoding["input_ids"])
    
test = torch.cat(test)

# Timing computation
start = time.time()

model(test)

print(time.time() - start)

0.25215673446655273
