In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm
from collections.abc import Iterable

In [3]:
def load_dataset(file_path:str):
    '''
    Imports the ColBERT dataset given a valid .CSV file.
    '''
    data = pd.read_csv(file_path)
    features = pd.read_csv(file_path, usecols=['text']).values
    labels = pd.read_csv(file_path, usecols=['humor']).values
    
    return features, labels

In [4]:
features, labels = load_dataset("data/dataset.csv")
example_datapoint = random.randint(0,200000)
print(features[example_datapoint])
print("humor: " + str(labels[example_datapoint]))

['Pope francis in egypt: a voice of reason']
humor: [False]


In [33]:
class HumorDetector(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(HumorDetector, self).__init__()
        
        #Creating layers of perceptron
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.lin = nn.Linear(embed_dim, 1)
        self.relu = nn.ReLU() 
    
    def forward(self, x):
        return nn.Sigmoid()(torch.sum(self.lin(self.relu(self.embedding(x)))))

In [13]:
#This code creaters the text_pipeline, which will transform a string of raw text into tensors that we can train on.
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

text_pipeline = lambda x: vocab(tokenizer(x))

In [65]:
test_model = HumorDetector(len(vocab), 64)

print((test_model(torch.tensor(text_pipeline((features[example_datapoint])[0])))).item())

0.35045555233955383


In [74]:
def training(model, loss_function, optimizer, features, labels, n_epochs, update_interval):
    
    losses = []

    for n in range(n_epochs):
        for i, text in enumerate(tqdm(features)):

            optimizer.zero_grad()
            my_output = model(text)
            loss = loss_function(my_output, labels[i])
            loss.backward()
            optimizer.step()
            
            if i % update_interval == 0:
                losses.append(round(loss.item(), 2)) # This will append your losses for plotting -- please use "loss" as the name for your loss
        
    return model, losses

In [75]:
#Hyperparameters
lr = 1e-4
n_epochs = 1
update_interval = 32

model = HumorDetector(len(vocab), 64) 
loss_function = nn.MSELoss()                        
optimizer = torch.optim.Adam(model.parameters(), lr=lr) 

trained_model, losses = training(model, loss_function, optimizer, features, labels, n_epochs, update_interval)

plt.plot(np.arange(len(losses)) * batch_size * update_interval, losses)
plt.title("training curve")
plt.xlabel("number of images trained on")
plt.ylabel("Reconstruction loss")
plt.show()

  0%|                                                | 0/200000 [00:00<?, ?it/s]


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not numpy.ndarray