In [13]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm
from collections.abc import Iterable

In [14]:
def load_dataset(file_path:str):
    '''
    Imports the ColBERT dataset given a valid .CSV file.

    The file is parsed a single time; only the 'text' and 'humor' columns
    are loaded.

    Args:
        file_path: path to a .CSV file containing 'text' and 'humor' columns.

    Returns:
        A tuple (features, labels) of 2-D NumPy arrays with one column each:
        features holds the 'text' column, labels holds the 'humor' column.

    Raises:
        ValueError: raised by pandas when 'text' or 'humor' is not a column
            of the file.
    '''
    # One parse instead of three; usecols keeps the missing-column error
    # behaviour and avoids loading columns that are never returned.
    data = pd.read_csv(file_path, usecols=['text', 'humor'])

    # Double brackets select a one-column DataFrame, so .values stays 2-D
    # with shape (n_rows, 1) as callers index it with [row][0].
    features = data[['text']].values
    labels = data[['humor']].values

    return features, labels

In [15]:
# Load the text features and humor labels used by the rest of the notebook.
features, labels = load_dataset("data/dataset.csv")

# Pick a random row to inspect. randrange(len(features)) always yields a valid
# index for whatever was loaded; a fixed inclusive upper bound could point one
# past the end of the data (or miss rows if the file size changes).
example_datapoint = random.randrange(len(features))
print(features[example_datapoint])
print("humor: " + str(labels[example_datapoint]))

['What do you call a polish dancer? a stripper']
humor: [ True]


In [36]:
class HumorDetector(nn.Module):
    '''
    Small network: embedding lookup -> ReLU -> linear layer with one output.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector; also the input width of
            the linear layer.
    '''
    def __init__(self, vocab_size, embed_dim):
        super().__init__()

        #Creating layers of perceptron
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        # The linear layer consumes the embedding output, so its input width
        # must follow embed_dim rather than a fixed 64.
        self.lin = nn.Linear(embed_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        '''
        Run the embedding, ReLU and linear layers on a tensor of indices.

        Args:
            x: integer tensor of indices into the embedding table.

        Returns:
            Tensor shaped like x with a trailing dimension of size 1, i.e.
            one value per input index (no pooling across indices is done).
        '''
        return self.lin(self.relu(self.embedding(x)))

In [37]:
# Build text_pipeline, which turns a raw string into a list of vocabulary
# indices (wrap the list in torch.tensor before feeding it to a model).
# The vocabulary is built from the AG_NEWS training split.
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')


def yield_tokens(data_iter):
    '''Lazily yield the token list for the text of each (_, text) pair in data_iter.'''
    yield from (tokenizer(text) for _, text in data_iter)


# "<unk>" is registered as a special token and made the default index, so
# tokens missing from the vocabulary map to it.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    '''Tokenize the string x and look up each token's index in vocab.'''
    return vocab(tokenizer(x))

In [38]:
# Smoke test: run one example sentence through a freshly initialised model.
test_model = HumorDetector(len(vocab), 64)

# Tokenize the sample once and reuse the indices for both the printout and
# the forward pass.
sample_text = features[example_datapoint][0]
sample_token_ids = text_pipeline(sample_text)
print(sample_token_ids)

print(test_model(torch.tensor(sample_token_ids)))

[183, 423, 165, 683, 5, 4120, 14355, 80, 5, 89181]
tensor([[ 0.2540],
        [ 0.3674],
        [-0.0971],
        [ 0.0126],
        [-0.4518],
        [ 0.4242],
        [ 0.2024],
        [-0.0561],
        [-0.4518],
        [-0.0514]], grad_fn=<AddmmBackward0>)
