In [1]:
import os
from google.colab import userdata

# Copy the value that Colab's `userdata.get` returns for 'HF_TOKEN' into the
# process environment, so code that reads os.environ["HF_TOKEN"] can find it.
# The token value itself is never written in the notebook.
# NOTE(review): presumably this is a Hugging Face access token -- confirm.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

Archive chat link: https://chatgpt.com/c/a4666297-3f4e-4afb-938c-4e174f9b2308?model=gpt-4

In [None]:
!git clone https://github.com/nlp-with-transformers/notebooks.git
%cd notebooks
from install import *
install_requirements(is_chapter2=True)

!pip install datasets==2.11.0

Cloning into 'notebooks'...
remote: Enumerating objects: 526, done.[K
remote: Counting objects: 100% (526/526), done.[K
remote: Compressing objects: 100% (289/289), done.[K
remote: Total 526 (delta 251), reused 481 (delta 231), pack-reused 0[K
Receiving objects: 100% (526/526), 29.30 MiB | 7.75 MiB/s, done.


In [None]:
# Pull every public name from `utils` into the notebook namespace and run its
# chapter setup routine.
# NOTE(review): presumably `utils` is the module from the repository cloned
# above; the wildcard import hides which names later cells depend on, and what
# `setup_chapter` configures is not visible from this notebook.
from utils import *
setup_chapter()

In [None]:
from datasets import list_datasets

# Fetch the listing once and keep it around for later inspection.
all_datasets = list_datasets()

dataset_count = len(all_datasets)
first_ten = all_datasets[:10]
print(f"Number of datasets currently available on the Hub: {dataset_count}")
print(f"First 10 datasets: {first_ten}")

# Optional follow-up (left disabled): narrow the listing down to the datasets
# whose name ends with "emotion".
# emotion_datasets = [dataset for dataset in all_datasets if dataset.endswith("emotion")]
# print(emotion_datasets)


In [None]:
# hide_output
from datasets import load_dataset

# Dataset identifier handed to `load_dataset`.
emotions_repo_id = "jeffnyman/emotions"
emotions = load_dataset(emotions_repo_id)
print(emotions)

In [None]:
# Keep a handle on the training split and display its first example.
train_ds = emotions["train"]
train_ds[0]

In [None]:
# Column names, feature schema and the first five rows, one item per line.
print(
    train_ds.column_names,
    train_ds.features,
    train_ds[:5],
    sep="\n",
)

GPT Links Archive:
- CS EE Notes - Keras Simplifies Neural Networks: https://chatgpt.com/c/a74b5448-22eb-47d4-8329-88f6e7e808c4
- CS EE Notes - DistilBERT Last Hidden State: https://chatgpt.com/c/a74b5448-22eb-47d4-8329-88f6e7e808c4

In [None]:
# Download the raw training file with wget. `{dataset_url}` is filled in from
# the Python variable by the notebook's shell escape.
dataset_url = "https://huggingface.co/datasets/transformersbook/emotion-train-split/raw/main/train.txt"
!wget {dataset_url}

In [None]:
!head -n 1 train.txt

emotions_local = load_dataset("csv", data_files="train.txt", sep=";", names=["text", "label"])
emotions_local

# even simpler approach"
"""
dataset_url = "https://huggingface.co/datasets/transformersbook/emotion-train-split/raw/main/train.txt"
emotions_remote = load_dataset("csv", data_files=dataset_url, sep=";",
                               names=["text", "label"])
"""

In [None]:
import pandas as pd

# Ask the DatasetDict for pandas output, then materialise the whole training
# split with a full slice.
emotions.set_format(type="pandas")
train_split = emotions["train"]
df = train_split[:]
df.head()

In [None]:
def label_int2str(row):
    """Translate an integer class id into its label name.

    The lookup goes through the ``label`` feature of ``emotions["train"]``.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

# Add a human-readable label column by passing every integer label through
# label_int2str, then preview the result.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Count examples per class (ascending) and draw them as horizontal bars.
class_counts = df["label_name"].value_counts(ascending=True)
class_freq_ax = class_counts.plot.barh()
class_freq_ax.set_title("Frequency of Classes")
plt.show()

In [None]:
# Split each text on whitespace and count the resulting pieces.
words_per_text = df["text"].str.split().map(len)
df["Words Per Tweet"] = words_per_text

# One box per label name, without grid lines or outlier markers.
df.boxplot(
    column="Words Per Tweet",
    by="label_name",
    grid=False,
    showfliers=False,
    color="black",
)
# Blank out the figure-level title and the x-axis label.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# The pandas output format is no longer needed, so reset the dataset's format.
emotions.reset_format()

In [None]:
# Character-level tokenization: every character of the sentence is a token.
text = "Tokenizing text is a core task of NLP."
tokenized_text = [char for char in text]
print(tokenized_text)

In [None]:
# Numericalization: give every distinct character a unique integer id,
# numbering the characters in sorted order.
vocabulary = sorted(set(tokenized_text))
token2idx = dict(zip(vocabulary, range(len(vocabulary))))
print(token2idx)
# Translate the character sequence into its sequence of ids.
input_ids = list(map(token2idx.__getitem__, tokenized_text))
print(input_ids)

In [None]:
# Example of mapping names to unique ids: ordinal ids here, one-hot encoding
# in the next cell.
transformer_names = ["Bumblebee", "Optimus Prime", "Megatron"]
categorical_df = pd.DataFrame({
    "Name": transformer_names,
    "Label ID": list(range(len(transformer_names))),
})
print(categorical_df)

In [None]:
# One-hot encoding: build indicator columns from the values in "Name".
pd.get_dummies(categorical_df["Name"])

In [None]:
# Convert the ordinal input_ids into one-hot (nominal) vectors.

import torch
import torch.nn.functional as F

# `as_tensor` keeps this cell safe to re-run: once `input_ids` is a tensor it
# is reused as-is, whereas `torch.tensor` would copy-construct from it and
# emit a warning.
input_ids = torch.as_tensor(input_ids)
# One row per token and one column per vocabulary entry.
one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))
one_hot_encodings.shape

In [None]:
# Show the first token next to its integer id and its one-hot vector.
print(f"Token: {tokenized_text[0]}")
print(f"Tensor index: {input_ids[0]}")
print(f"One-hot: {one_hot_encodings[0]}")

In [None]:
# Word-level tokenization: split the sentence on whitespace.
# Note that this rebinds `tokenized_text`, replacing the character tokens.
tokenized_text = text.split()
print(tokenized_text)

In [None]:
# Alternatively, the same tokenizer can be obtained with
# "from transformers import DistilBertTokenizer".
from transformers import AutoTokenizer

# AutoTokenizer belongs to a larger set of "auto" classes, which make it easy
# to switch between different models by changing only the checkpoint name.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Encode the example sentence with the pretrained tokenizer and show the result.
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the encoded input ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the token strings back into a single string.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

In [None]:
# Maximum sequence length reported by the tokenizer.
print(tokenizer.model_max_length)
# Names of the fields the model expects in its forward pass.
print(tokenizer.model_input_names)

In [None]:
### Tokenising the Whole Dataset

# The whole corpus is tokenised with the `map()` method of the DatasetDict,
# which applies a function to each element of a dataset and can also be used
# to create new rows and columns.

def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch``.

    This is the function handed to ``map()``; padding and truncation are both
    switched on in the tokenizer call.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Try the tokenize function on the first two training examples.
print(tokenize(emotions["train"][:2]))

In [None]:
#hide_input
# Pair each special token with its id and order the pairs by id.
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
data = sorted(tokens2ids, key=lambda pair: pair[-1])
# Use a dedicated name for this table: binding it to `df` would overwrite the
# training DataFrame created earlier and break re-running the cells that use it.
special_tokens_df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
# Transpose so the tokens read left to right.
special_tokens_df.T

In [None]:
# hide_output
# Apply `tokenize` to every split in batched mode. `batch_size=None` is passed
# through unchanged.
# NOTE(review): per the datasets docs this should process each split as a
# single batch -- confirm.
map_options = {"batched": True, "batch_size": None}
emotions_encoded = emotions.map(tokenize, **map_options)
encoded_train_columns = emotions_encoded["train"].column_names
print(encoded_train_columns)

# Training a Text Classifier

- Using feature extraction: train a classifier on the last hidden states alone, instead of updating the whole pipeline including the pretrained model (which would be fine-tuning).



---



In [None]:
# GPT notes
#
# Reference snippet kept as a string literal, so none of it is executed: it
# builds an nn.Linear layer with 4 input features and 1 output feature, passes
# a [2, 4] tensor through it, and prints the output tensor and its shape.
# Because the string is the only expression in the cell, the notebook echoes
# it as the cell output.

"""
import torch
import torch.nn as nn

# Define the linear layer
linear_layer = nn.Linear(in_features=4, out_features=1)

# Example input tensor of shape [2, 4]
input_tensor = torch.tensor([[1.0, 2.0, 3.0, 4.0],
                             [5.0, 6.0, 7.0, 8.0]])

# Pass the input through the linear layer
output_tensor = linear_layer(input_tensor)

# Print the output tensor
print("Output Tensor:", output_tensor)
print("Output Shape:", output_tensor.shape)
"""