In [1]:
import os
import random
import numpy as np
import torch
import re

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dev = (
    torch.device("mps")
    if torch.backends.mps.is_available()
    else torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

print(dev)

seed = 42

os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

mps


In [3]:
cache_dir = "./.cache/datasets"

imdb = load_dataset("imdb", cache_dir=cache_dir)
yelp = load_dataset("yelp_polarity", cache_dir=cache_dir)
amazon = load_dataset("amazon_polarity", cache_dir=cache_dir)

In [4]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
yelp

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 38000
    })
})

In [6]:
amazon

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})

In [42]:
print("text" in imdb["train"].column_names)

print(imdb["train"].column_names)

True
['text', 'label']


In [25]:
import sys


def total_size(obj, seen=None):
    """Recursively find the size of an object and its contents."""
    if seen is None:
        seen = set()
    size = sys.getsizeof(obj)
    obj_id = id(obj)

    if obj_id in seen:
        return 0

    # Mark as seen
    seen.add(obj_id)

    if isinstance(obj, dict):
        size += sum([total_size(v, seen) for v in obj.values()])
        size += sum([total_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, "__dict__"):
        size += total_size(obj.__dict__, seen)
    elif hasattr(obj, "__iter__") and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([total_size(i, seen) for i in obj])

    return size

266416012
883134283
6774809285


In [17]:
imdb_train_val = amazon["train"].train_test_split(test_size=0.2, seed=42)  # type: ignore
imdb_train_dataset = imdb_train_val["train"]
imdb_val_dataset = imdb_train_val["test"]

In [8]:
data_text = [
    title.upper() + ": " + content
    for title, content in zip(amazon["train"]["title"], amazon["train"]["content"])
]

In [9]:
data = list(
    zip(
        amazon["train"]["label"],
        data_text,
    )
)

In [10]:
len(data)

3600000

In [29]:
test = random.sample(data, 100_000)

In [37]:
len(test)

100000

In [30]:
test

[(0,
  'AVOID!: This DVD player produces great picture quality, but has two fatal defects. With most DVDs, it will randomly pause for a few seconds, and sometimes freeze completely. It is also very slow to respond to the remote. Do not buy this piece of junk.'),
 (1,
  "EXCELLENT PERFORMANCE OF 1968 BAND: This is a particularly inspired outting for the 1968 band. This recording probably comes from their July 1968 European tour, but the CD doesn't give a specific date (and the CD does not correspond with any of the performances documented in Sheridan's COUNT BASIE: A BIO-DISCOGRAPHY). Compared with the Juan-les-Pins performance of July 23, the repertoire is similar but the performances here are better -- the rhythm section in particular is very spontaneous -- and the recording quality is above average (and stereo).Many tracks are misidentified: (2) is Blues for Ilean, (5) is Boone's Blues, (6) is Whirly-Bird, and (10) is a blues in C for the rhythm section -- 4 minutes of Basie's sparkl

In [31]:
test_labels, test_text = list(zip(*test))

In [33]:
print(total_size(imdb))
print(total_size(yelp))
print(total_size(amazon))
print(total_size(test_text))

266416012
883134283
6774809125
48836258


In [36]:
test_labels = list(test_labels)

print(len([label for label in test_labels if label == 0]))
print(len([label for label in test_labels if label == 1]))

50147
49853


In [34]:
list(test_text)

['AVOID!: This DVD player produces great picture quality, but has two fatal defects. With most DVDs, it will randomly pause for a few seconds, and sometimes freeze completely. It is also very slow to respond to the remote. Do not buy this piece of junk.',
 "EXCELLENT PERFORMANCE OF 1968 BAND: This is a particularly inspired outting for the 1968 band. This recording probably comes from their July 1968 European tour, but the CD doesn't give a specific date (and the CD does not correspond with any of the performances documented in Sheridan's COUNT BASIE: A BIO-DISCOGRAPHY). Compared with the Juan-les-Pins performance of July 23, the repertoire is similar but the performances here are better -- the rhythm section in particular is very spontaneous -- and the recording quality is above average (and stereo).Many tracks are misidentified: (2) is Blues for Ilean, (5) is Boone's Blues, (6) is Whirly-Bird, and (10) is a blues in C for the rhythm section -- 4 minutes of Basie's sparkling piano, in

In [17]:
type(test_labels)

tuple

In [35]:
print([len([label for label in imdb_train_dataset["label"] if label == 0])])
print([len([label for label in imdb_train_dataset["label"] if label == 1])])
print([len([label for label in imdb_val_dataset["label"] if label == 0])])
print([len([label for label in imdb_val_dataset["label"] if label == 1])])

NameError: name 'imdb_train_dataset' is not defined

In [8]:
imdb_train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 20000
})

In [9]:
imdb_train_dataset["text"]

['Stage adaptations often have a major fault. They often come out looking like a film camera was simply placed on the stage (Such as "Night Mother"). Sidney Lumet\'s direction keeps the film alive, which is especially difficult since the picture offered him no real challenge. Still, it\'s nice to look at for what it is. The chemistry between Michael Caine and Christopher Reeve is quite brilliant. The dynamics of their relationship are surprising. Caine is fantastic as always, and Reeve gets one of his few chances to really act.<br /><br />I confess that I\'ve never seen Ira Levin\'s play, but I hear that Jay Presson Allen\'s adaptation is faithful. The script is incredibly convoluted, and keeps you guessing. "Deathtrap" is an enormously entertaining film, and is recommended for nearly all fans of stage and screen.<br /><br />7.4 out of 10',
 "'The Rookie' was a wonderful movie about the second chances life holds for us and also puts an emotional thought over the audience, making them r

In [10]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)
    # Replace single quotes that are not preceded by a backslash
    text = re.sub(r"(?<!\\)'", '"', text)
    return text

In [13]:
amazon["train"]["title"]

['Stuning even for the non-gamer',
 'The best soundtrack ever to anything.',
 'Amazing!',
 'Excellent Soundtrack',
 'Remember, Pull Your Jaw Off The Floor After Hearing it',
 'an absolute masterpiece',
 'Buyer beware',
 'Glorious story',
 'A FIVE STAR BOOK',
 'Whispers of the Wicked Saints',
 'The Worst!',
 'Great book',
 'Great Read',
 'Oh please',
 'Awful beyond belief!',
 "Don't try to fool us with fake reviews.",
 'A romantic zen baseball comedy',
 'Fashionable Compression Stockings!',
 'Jobst UltraSheer Thigh High',
 'sizes recomended in the size chart are not real',
 'mens ultrasheer',
 'Delicious cookie mix',
 'Another Abysmal Digital Copy',
 'A fascinating insight into the life of modern Japanese teens',
 'i liked this album more then i thought i would',
 'Problem with charging smaller AAAs',
 'Works, but not as advertised',
 'Disappointed',
 'Oh dear',
 "Based on the reviews here I bought one and I'm glad I did!",
 'Incorrect disc!',
 'happy with it...but',
 'should be titled 

In [14]:
amazon["train"]["content"]

['This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^',
 "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.",
 'This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of Fate" (which means all th

In [11]:
imdb_train_labels = imdb_train_dataset["label"]
imdb_val_labels = imdb_val_dataset["label"]

imdb_train_text = [
    clean_text(text) for _, text in enumerate(imdb_train_dataset["text"])
]
imdb_val_text = [clean_text(text) for _, text in enumerate(imdb_val_dataset["text"])]

In [12]:
from transformers import BertTokenizer, BertModel

model_name = "bert-base-uncased"
cache_folder = "./.cache/huggingface"
bert_tokenizer = BertTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_folder,
    device_map="auto",
)
bert_model = BertModel.from_pretrained(
    model_name,
    cache_dir=cache_folder,
    device_map="auto",
)



In [13]:
# Print the special tokens
print("CLS token:", bert_tokenizer.cls_token)
print("SEP token:", bert_tokenizer.sep_token)
print("PAD token:", bert_tokenizer.pad_token)
print("UNK token:", bert_tokenizer.unk_token)
print("MASK token:", bert_tokenizer.mask_token)

# Alternatively, print all special tokens and their IDs
print("All special tokens:", bert_tokenizer.special_tokens_map)
print("All special tokens with IDs:", bert_tokenizer.special_tokens_map_extended)

CLS token: [CLS]
SEP token: [SEP]
PAD token: [PAD]
UNK token: [UNK]
MASK token: [MASK]
All special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
All special tokens with IDs: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


In [14]:
tokens = bert_tokenizer(
    imdb_train_text, return_tensors="pt", padding=True, truncation=True, max_length=128
)

In [15]:
from pprint import pprint

pprint(tokens)

{'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]),
 'input_ids': tensor([[  101,  2754, 17241,  ...,  2963,  2008,   102],
        [  101,  1000,  1996,  ...,     0,     0,     0],
        [  101,  7929,  1010,  ...,  1012,  1045,   102],
        ...,
        [  101,  3087,  2040,  ...,  4025,  2000,   102],
        [  101,  2054,  2079,  ...,  2987,  1000,   102],
        [  101,  1045,  3342,  ...,  2876,  1000,   102]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


In [16]:
bert_tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

['[CLS]',
 'stage',
 'adaptations',
 'often',
 'have',
 'a',
 'major',
 'fault',
 '.',
 'they',
 'often',
 'come',
 'out',
 'looking',
 'like',
 'a',
 'film',
 'camera',
 'was',
 'simply',
 'placed',
 'on',
 'the',
 'stage',
 '(',
 'such',
 'as',
 '"',
 'night',
 'mother',
 '"',
 ')',
 '.',
 'sidney',
 'lu',
 '##met',
 '"',
 's',
 'direction',
 'keeps',
 'the',
 'film',
 'alive',
 ',',
 'which',
 'is',
 'especially',
 'difficult',
 'since',
 'the',
 'picture',
 'offered',
 'him',
 'no',
 'real',
 'challenge',
 '.',
 'still',
 ',',
 'it',
 '"',
 's',
 'nice',
 'to',
 'look',
 'at',
 'for',
 'what',
 'it',
 'is',
 '.',
 'the',
 'chemistry',
 'between',
 'michael',
 'caine',
 'and',
 'christopher',
 'reeve',
 'is',
 'quite',
 'brilliant',
 '.',
 'the',
 'dynamics',
 'of',
 'their',
 'relationship',
 'are',
 'surprising',
 '.',
 'caine',
 'is',
 'fantastic',
 'as',
 'always',
 ',',
 'and',
 'reeve',
 'gets',
 'one',
 'of',
 'his',
 'few',
 'chances',
 'to',
 'really',
 'act',
 '.',
 'i',
 

In [17]:
with torch.no_grad():
    # outputs = bert_model(**tokens)
    outputs = bert_model(tokens["input_ids"], attention_mask=tokens["attention_mask"])
    word_embeddings = outputs.last_hidden_state

In [None]:
torch.save(word_embeddings, "imdb_word_embeddings.pt")

In [None]:
word_embeddings = torch.load("word_embeddings.pt")

In [None]:
word_embeddings.shape

torch.Size([1, 128, 768])

In [None]:
print(len(*word_embeddings))

128


In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=8)
# X_train_pca = pca.fit_transform(word_embeddings)

ValueError: Found array with dim 3. PCA expected <= 2.

In [None]:
X_train_pca.shape

In [None]:
import math
import torch
import torch.nn as nn
from torch import Tensor, device


class PositionalEncoder(nn.Module):

    def __init__(self, d_model: int, dropout=0.1, max_len=512, device=device("cpu")):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, 1, d_model, device=device)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        seq_len = x.size(1)
        x = x + self.pe[: x.size(0)]
        return self.dropout(x)

In [None]:
# train_data = [(int(label), text) for label, text in train_data]
# test_data = [(int(label), text) for label, text in test_data]

# print("train pos: ", len([label for label, text in train_data if label == 0]))
# print("train neg: ", len([label for label, text in train_data if label == 1]))
# print("test pos: ", len([label for label, text in test_data if label == 0]))
# print("test neg: ", len([label for label, text in test_data if label == 1]))

In [None]:
# train_data[0:5]

In [None]:
# type(train_data[0][0])

In [None]:
# [float(label) for label in imdb_train_labels]