## 2.1 データセット

### 2.1.1 はじめてのHugging Face Datasets

In [None]:
from datasets import load_dataset

# Load the "emotion" dataset through Hugging Face Datasets.
# NOTE(review): the dataset appears to be published on the Hub as `dair-ai/emotion`;
# confirm the bare "emotion" id still resolves with the installed `datasets` version.
emotions = load_dataset("emotion")

In [None]:
# Show the object returned by load_dataset; it is indexed by split name below (e.g. 'train').
emotions

In [None]:
# Pull out the training split and display it.
train_ds = emotions['train']
train_ds

In [None]:
# Number of examples in the training split.
len(train_ds)

In [None]:
# Integer indexing: the first example of the training split.
train_ds[0]

In [None]:
# Names of the columns in the training split.
train_ds.column_names

In [None]:
# Print the split's `features`, i.e. the per-column type description.
print(train_ds.features)

In [None]:
# Slice indexing: print the first five rows of the training split.
print(train_ds[:5])

In [None]:
# Column access by name: take the `text` column, then keep its first five entries.
print(train_ds['text'][:5])

### 2.1.2 データセットからDataFrameへ

In [None]:
import pandas as pd

# Build a pandas DataFrame holding every row of the training split.
# `with_format` returns a new, pandas-formatted view of the split instead of
# switching the output format of `emotions` in place, so cells that index
# `emotions` afterwards keep getting the default output no matter which order
# they are run in.
df = emotions['train'].with_format('pandas')[:]
df.head()

In [None]:
def label_int2str(row):
    """Convert an integer class id into its label name.

    Args:
        row: A single value of the ``label`` column (the function is applied
            element-wise to ``df["label"]``, so despite the parameter name it
            receives one label id, not a whole row).

    Returns:
        The result of ``int2str`` on the ``label`` feature of the training
        split for that id.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

# Add a human-readable `label_name` column by converting each integer in `label`.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

### 2.1.3 クラス分布の確認

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: number of training examples per emotion label.
# NOTE(review): newer seaborn releases reportedly deprecate passing `palette`
# without `hue` — confirm against the installed version.
fig, ax = plt.subplots()
sns.countplot(data=df, y="label_name", palette='Set2', ax=ax)
ax.set_title("Frequency of Classes")
plt.show()

クラス不均衡への対策:  
・少数派のクラスを無作為にオーバーサンプリングする  
・多数派のクラスを無作為にアンダーサンプリングする  
・表現が不十分なクラスに関して，より多くのラベル付きデータを収集する  

### 2.1.4 ツイートの長さはどのくらい?

In [None]:
# Word count per tweet (whitespace-separated tokens), stored as a new column on `df`.
df["Words Per Tweet"] = df["text"].str.split().apply(len)

# Box plot of the word counts, one box per emotion label.
fig, ax = plt.subplots()
sns.boxplot(
    data=df,
    x="label_name",
    y="Words Per Tweet",
    showfliers=False,  # leave the outlier markers out of the plot
    palette='Set2',
    ax=ax,
)
ax.set_title("Words Per Tweet by Label")
plt.show()

In [None]:
# Make sure `emotions` returns its default (non-pandas) output format for the cells that follow.
emotions.reset_format()

## 2.2 テキストからトークンへ

### 2.2.1 文字トークン化

In [None]:
text = "Tokenizing text is a core task of NLP."
# Character-level tokenization: every character, including spaces and
# punctuation, becomes its own token.
tokenized_text = [char for char in text]
print(tokenized_text)

In [None]:
# Vocabulary: the unique tokens in sorted order, each mapped to its position.
unique_tokens = sorted(set(tokenized_text))
token2idx = dict(zip(unique_tokens, range(len(unique_tokens))))
print(token2idx)

In [None]:
# Numericalize: replace every token with its integer id from `token2idx`.
input_ids = [token2idx[token] for token in tokenized_text]
print(input_ids)

In [None]:
# Toy table: three names, each paired with an integer label id (0, 1, 2).
categorical_df = pd.DataFrame(
    [("Bumblebee", 0), ("Optimus Prime", 1), ("Megatron", 2)],
    columns=["Name", "Label ID"],
)
categorical_df

In [None]:
# One-hot encode the `Name` column: one indicator column per distinct name.
# `dtype=int` pins the indicators to 0/1 integers; recent pandas versions
# otherwise default to boolean True/False columns, which hides the one-hot
# vectors this cell is meant to illustrate.
pd.get_dummies(categorical_df["Name"], dtype=int)

In [None]:
import torch
import torch.nn.functional as F

# Convert the token ids to a tensor and one-hot encode them, using one class
# per entry in `token2idx`.
# `torch.as_tensor` (instead of `torch.tensor`) keeps this cell safe to re-run:
# the first run converts the Python list, later runs receive the tensor this
# cell already bound to `input_ids` and pass it through without the
# copy-construct warning that `torch.tensor(<tensor>)` emits.
input_ids = torch.as_tensor(input_ids)
one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))
one_hot_encodings.shape

In [None]:
# Inspect the first token next to its integer id and its one-hot vector.
print(
    f"Token: {tokenized_text[0]}",
    f"Tensor index: {input_ids[0]}",
    f"One-hot: {one_hot_encodings[0]}",
    sep="\n",
)

### 2.2.2 単語トークン化

In [None]:
# Word-level tokenization: split the sentence on whitespace.
# NOTE: this rebinds `tokenized_text`, which held the character tokens until now.
tokenized_text = text.split()
print(tokenized_text)

### 2.2.3 サブワードトークン化

In [None]:
from transformers import AutoTokenizer
# Load the pretrained tokenizer that belongs to the checkpoint named in `model_ckpt`.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Encode the example sentence with the pretrained tokenizer and print the result.
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the encoded input ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the token strings back into a single piece of text.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

In [None]:
# The tokenizer's `model_max_length` setting
# (presumably the longest input, in tokens, that the model accepts — confirm in the transformers docs).
tokenizer.model_max_length

In [None]:
# Names of the model inputs as listed by the tokenizer's `model_input_names` attribute.
tokenizer.model_input_names

### 2.2.4 データセット全体のトークン化

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode,
            for example a slice of a dataset split.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch["text"]
    encoding = tokenizer(texts, padding=True, truncation=True)
    return encoding

In [None]:
# Sanity check: run `tokenize` on the first two training examples and print the result.
print(tokenize(emotions["train"][:2]))

In [None]:
# Tabulate the tokenizer's special tokens next to their ids, ordered by id.
# The table gets its own name so that `df`, the emotions training DataFrame
# built earlier in the notebook, is not silently replaced by an unrelated frame.
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
special_tokens_df = pd.DataFrame(
    sorted(tokens2ids, key=lambda pair: pair[-1]),  # sort by the id (last element of each pair)
    columns=["Special Token", "Special Token ID"],
)
# Transpose so each special token is shown as a column.
special_tokens_df.T

In [None]:
# Apply `tokenize` across `emotions`. `batched=True` hands `tokenize` batches of
# examples instead of single examples; with `batch_size=None` the datasets docs
# describe each split as being passed as one batch, so `padding=True` pads
# against the longest text of the whole split.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
# Column names of the training split after tokenization.
print(emotions_encoded["train"].column_names)