In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

In [2]:
def load_dataset(file_path:str):
    '''
    Imports the ColBERT dataset given a valid .CSV file.
    '''
    data = pd.read_csv(file_path)
    features = pd.read_csv(file_path, usecols=['text']).values
    labels = pd.read_csv(file_path, usecols=['humor']).values
    
    return features, labels

In [3]:
features, labels = load_dataset("data/dataset.csv")
example_datapoint = random.randint(0,200000)
print(features[example_datapoint])
print("humor: " + str(labels[example_datapoint]))

["New year's travel: why smart tourists are headed to scotland (photos)"]
humor: [False]


In [4]:
class HumorDetector(nn.Module):
    def __init__(self):
        super(HumorDetector, self).__init__()
        
        #Creating layers of perceptron
        self.embedding = nn.Embedding(100,1)
        self.lin = nn.Linear(1, 1)
        self.relu = nn.ReLU() 
    
    def forward(self, x):
        return self.lin(self.relu(self.embedding(x)))

In [5]:
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])



In [6]:
test_model = HumorDetector()
#test_output = test_model(example_datapoint)
print(vocab(example_datapoint))

TypeError: lookup_indices(): incompatible function arguments. The following argument types are supported:
    1. (self: torchtext._torchtext.Vocab, arg0: list) -> List[int]

Invoked with: <torchtext._torchtext.Vocab object at 0x7f1b07138630>, 83562