In [None]:
import datasets

# Load the "imdb" dataset, requesting the "train" and "test" splits, and
# unpack the result into the two names in that same order.
train_data, test_data = datasets.load_dataset(
    path="imdb",
    split=["train", "test"],
)

In [None]:
# Display the first record of the training split as a quick sanity check.
train_data[0]

In [None]:
from nltk.tokenize import word_tokenize
import numpy as np

def tokenize_example(example, max_length):
    """Tokenize one example's text and keep at most ``max_length`` tokens.

    Args:
        example: Mapping with a ``"text"`` entry holding the raw string.
        max_length: Upper bound on the number of tokens retained; anything
            past this position is dropped.

    Returns:
        A dict with ``"tokens"`` (the truncated token list) and ``"length"``
        (how many tokens were kept).
    """
    truncated = word_tokenize(example["text"])[:max_length]
    return {"tokens": truncated, "length": len(truncated)}

# Cap on the number of tokens kept per example.
max_length = 256

# Tokenize both splits with the same cap; `fn_kwargs` is handed to
# `tokenize_example` as keyword arguments for every example.
train_data = train_data.map(tokenize_example, fn_kwargs=dict(max_length=max_length))
test_data = test_data.map(tokenize_example, fn_kwargs=dict(max_length=max_length))

In [33]:
# Fraction of the training examples held out for validation.
test_size = 0.25
# Fixed seed so the train/validation partition is the same on every
# Restart & Run All; an unseeded split gives a different partition per run.
split_seed = 1234

# NOTE: this cell rebinds `train_data`. Running it a second time without
# re-running the cells above would carve the validation set out of the
# already-reduced training split.
train_valid_data = train_data.train_test_split(test_size=test_size, seed=split_seed)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [None]:
import torchtext

# Vocabulary settings: the special tokens to register and the minimum token
# frequency handed to the vocabulary builder.
special_tokens = ["<unk>", "<pad>"]
min_freq = 5

# The vocabulary is built from the training split's tokens only.
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    specials=special_tokens,
    min_freq=min_freq,
)

In [None]:
# Look up the integer ids the vocabulary assigned to the two special tokens.
unk_index, pad_index = vocab["<unk>"], vocab["<pad>"]

# Register the <unk> id as the vocabulary's default index.
vocab.set_default_index(unk_index)

In [None]:
def numericalize_example(example, vocab):
    """Convert one example's tokens into vocabulary indices.

    Args:
        example: Mapping with a ``"tokens"`` entry (a sequence of tokens).
        vocab: Object exposing ``lookup_indices(tokens)``, which returns the
            index for each token.

    Returns:
        A dict with a single ``"ids"`` entry holding whatever
        ``vocab.lookup_indices`` returned for the example's tokens.
    """
    return {"ids": vocab.lookup_indices(example["tokens"])}

# For each split: add an "ids" column via `numericalize_example`, then expose
# only the "ids", "label" and "length" columns in torch format. The splits
# are processed independently, so they are handled in a single pass.
train_data, valid_data, test_data = (
    split.map(numericalize_example, fn_kwargs={"vocab": vocab}).with_format(
        type="torch", columns=["ids", "label", "length"]
    )
    for split in (train_data, valid_data, test_data)
)

In [None]:
import nltk
from nltk.corpus import gutenberg

# Gutenberg file to read.
corpus_name = "austen-emma.txt"

# Make sure the Gutenberg corpus package is available locally.
nltk.download('gutenberg')

# Read the whole file as one raw string.
corpus_text = gutenberg.raw(corpus_name)

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/ngtzekean/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [None]:
import gensim
import gensim.downloader

# Word whose embedding vector is printed below.
word = "king"

# Fetch the pre-trained Google News Word2Vec vectors through gensim's
# downloader (a large download: the progress output reports ~1.6 GB).
model = gensim.downloader.load('word2vec-google-news-300')

# Look up the vector for `word` and print it.
embedding = model[word]
print(embedding)

[=====---------------------------------------------] 11.8% 196.9/1662.8MB downloaded