In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

from huggingface_hub import login

# Fetch the token securely from Colab secrets
hf_token = os.getenv("HF_TOKEN")

if hf_token is None:
    raise ValueError("HF_TOKEN not found. Please add it via Colab secrets.")

login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
import re
import unicodedata

def normalize_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", text)                # Normalize Unicode
    text = text.replace("’", "'")                             # Fix smart apostrophe
    text = text.replace("“", '"').replace("”", '"')           # Fix smart quotes
    text = text.replace("–", "-").replace("—", "-")           # Fix dashes
    text = re.sub(r"[^\x00-\x7F]+", " ", text)                # Remove any remaining non-ASCII
    text = re.sub(r"\s+", " ", text).strip()                  # Collapse whitespace
    return text

In [3]:
import pandas as pd

def load_corpus(file_path: str) -> list:
    df = pd.read_csv(file_path, encoding="latin-1")
    text_data = df["v2"].dropna().astype(str).tolist()
    text_data = [normalize_text(t) for t in text_data]
    return text_data

corpus = load_corpus("data/spam.csv")
print(f"✅ Loaded {len(corpus)} samples")
print(f"Example: {corpus[0]}")

✅ Loaded 5572 samples
Example: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [4]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
import os

def train_tokenizer(corpus: list, output_dir: str, vocab_size: int = 10000) -> ByteLevelBPETokenizer:
    """
    Trains a Byte-Level BPE tokenizer on a given corpus.

    Args:
        corpus (list): List of text strings.
        output_dir (str): Path to save tokenizer files.
        vocab_size (int): Vocabulary size to train on.

    Returns:
        ByteLevelBPETokenizer: Trained tokenizer.
    """
    os.makedirs(output_dir, exist_ok=True)

    tokenizer = ByteLevelBPETokenizer()
    tokenizer._tokenizer.pre_tokenizer = ByteLevel()


    tokenizer.train_from_iterator(
        corpus,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    )
    tokenizer.save_model(output_dir)
    tokenizer.save(f"{output_dir}/tokenizer.json")  # ✅ Required for Hugging Face compatibility
    return tokenizer

tokenizer = train_tokenizer(corpus, output_dir="my_tokenizer")







---

### 🧠 Step 1: Initialize the tokenizer

```python
tokenizer = ByteLevelBPETokenizer()
tokenizer._tokenizer.pre_tokenizer = ByteLevel()
```

* `ByteLevelBPETokenizer()` initializes a tokenizer that:

  * Operates at the byte level (good for handling rare or multilingual text).
  * Uses Byte Pair Encoding (BPE) to learn subword units.

* `tokenizer._tokenizer.pre_tokenizer = ByteLevel()` sets the **pre-tokenizer** to `ByteLevel`, which:

  * Encodes the input text into bytes first.
  * Handles spaces explicitly by marking them with a special character like `Ġ` (important for preserving word boundaries).

⚠️ Note: This low-level `. _tokenizer.pre_tokenizer` usage is necessary because `ByteLevelBPETokenizer` uses a legacy API; the higher-level abstraction is `tokenizer.pre_tokenizer`.

---

### 🧪 Step 2: Train the tokenizer

```python
tokenizer.train_from_iterator(
    corpus,
    vocab_size=vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)
```

This tells the tokenizer to:

* Train using the given `corpus` (an iterator of texts).
* Limit the vocab size to `vocab_size`.
* Ignore tokens that occur less than `min_frequency=2`.
* Show progress during training.
* Add 5 **special tokens** often used in NLP:

  * `<s>` = start of sentence
  * `</s>` = end of sentence
  * `<pad>` = padding
  * `<unk>` = unknown token
  * `<mask>` = for masked language modeling

---

### 💾 Step 3: Save the tokenizer model

```python
tokenizer.save_model(output_dir)
tokenizer.save(f"{output_dir}/tokenizer.json")
```

* `save_model()` saves the `vocab.json` and `merges.txt` files — required for legacy loading.
* `save("tokenizer.json")` saves a **Hugging Face compatible** single-file JSON format, usable by `PreTrainedTokenizerFast`.

---

### ✅ Final Output

```python
return tokenizer
```

Returns the trained `ByteLevelBPETokenizer` object so it can be reused if needed.

---

### 🔄 Summary of Output Files in `output_dir`

| File             | Purpose                                |
| ---------------- | -------------------------------------- |
| `vocab.json`     | Maps tokens to IDs                     |
| `merges.txt`     | Contains merge rules learned by BPE    |
| `tokenizer.json` | Hugging Face-compatible tokenizer file |

---

In [5]:
from transformers import PreTrainedTokenizerFast

def load_fast_tokenizer(tokenizer_dir: str) -> PreTrainedTokenizerFast:
    """
    Loads tokenizer files into a PreTrainedTokenizerFast object.

    Args:
        tokenizer_dir (str): Directory where tokenizer files are saved.

    Returns:
        PreTrainedTokenizerFast: Hugging Face compatible tokenizer.
    """
    return PreTrainedTokenizerFast(
        tokenizer_file=f"{tokenizer_dir}/tokenizer.json",
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
        mask_token="<mask>"
    )

fast_tokenizer = load_fast_tokenizer("my_tokenizer")

The `fast_tokenizer` here refers to a **Hugging Face-compatible tokenizer** that is created by wrapping the trained tokenizer files (`tokenizer.json`, `vocab.json`, `merges.txt`) using the class:

```python
from transformers import PreTrainedTokenizerFast
```

---

### 💡 What is `PreTrainedTokenizerFast`?

`PreTrainedTokenizerFast` is a **fast implementation of Hugging Face tokenizers** using the [🤗 `tokenizers` Rust-based library](https://github.com/huggingface/tokenizers), which is:

* ⚡ **Much faster** than Python-based tokenizers.
* ✅ Fully compatible with `transformers` models.
* 🧠 Capable of handling everything: tokenization, detokenization, ID mapping, padding, truncation, etc.
* 🧩 Works with any tokenizer trained and saved as a `tokenizer.json`.

---

### 🔄 What the code does

```python
fast_tokenizer = load_fast_tokenizer("my_tokenizer")
```

This loads the custom tokenizer you trained earlier with `ByteLevelBPETokenizer` and wraps it into a Hugging Face-compatible format.

The important part is here:

```python
PreTrainedTokenizerFast(
    tokenizer_file=f"{tokenizer_dir}/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>"
)
```

This:

* Loads `tokenizer.json` (which contains vocab, merges, config).
* Defines all special tokens to work seamlessly with Transformer models.

Now, you can use `fast_tokenizer` just like any pretrained tokenizer (e.g., `BertTokenizerFast`, `GPT2TokenizerFast`, etc.):

```python
text = "Hello world!"
tokens = fast_tokenizer.tokenize(text)
ids = fast_tokenizer.convert_tokens_to_ids(tokens)
decoded = fast_tokenizer.decode(ids)

print(tokens)  # ['H', 'ello', 'Ġworld', '!']
print(ids)     # [id1, id2, id3, id4]
print(decoded) # 'Hello world!'
```

---

### ✅ Summary

| Concept                   | What it does                                                               |
| ------------------------- | -------------------------------------------------------------------------- |
| `PreTrainedTokenizerFast` | A wrapper to load and use a tokenizer trained with `tokenizers`            |
| `tokenizer.json`          | The tokenizer model (vocab, merges, config)                                |
| `fast_tokenizer`          | The fully functional tokenizer you can now use in your pipeline            |
| Benefit                   | Fast, efficient, and works with Hugging Face `transformers` out-of-the-box |

In [6]:
from huggingface_hub import HfApi

HF_USERNAME = HfApi().whoami()["name"]
REPO_NAME = "spam-tokenizer-bpe"
TOKENIZER_REPO = f"{HF_USERNAME}/{REPO_NAME}"

fast_tokenizer.save_pretrained("my_tokenizer")  # includes config files
fast_tokenizer.push_to_hub(REPO_NAME)

print(f"✅ Tokenizer pushed: https://huggingface.co/{TOKENIZER_REPO}")

No files have been modified since last commit. Skipping to prevent empty commit.


✅ Tokenizer pushed: https://huggingface.co/mushfiqurrobin/spam-tokenizer-bpe


In [7]:
text = "Congratulations! You’ve won a free ticket to Bahamas."
tokens = fast_tokenizer.tokenize(text)
ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Text:", text)
print("Tokens:", tokens)
print("Token IDs:", ids)

Text: Congratulations! You’ve won a free ticket to Bahamas.
Tokens: ['C', 'ongrat', 'ulations', '!', 'ĠYou', 'â', 'Ģ', 'Ļ', 've', 'Ġwon', 'Ġa', 'Ġfree', 'Ġticket', 'Ġto', 'ĠBahamas', '.']
Token IDs: [39, 1322, 2254, 5, 410, 163, 227, 252, 294, 689, 262, 645, 4175, 276, 9676, 18]


In [8]:
from transformers import PreTrainedTokenizerFast

# Load the tokenizer from the directory
fast_tokenizer = PreTrainedTokenizerFast.from_pretrained("my_tokenizer")

# New text to tokenize
text = "Hello! This is a different sentence to tokenize."

# Tokenize and get token IDs
tokens = fast_tokenizer.tokenize(text)
# print([t.replace("Ġ", "") for t in tokens])
token_ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokens: ['H', 'ello', '!', 'ĠThis', 'Ġis', 'Ġa', 'Ġdifferent', 'Ġsentence', 'Ġto', 'Ġtok', 'en', 'ize', '.']
Token IDs: [44, 7284, 5, 839, 327, 262, 2933, 6447, 276, 5917, 307, 2300, 18]


The `Ġ` character you're seeing in the token strings (e.g., `'ĠThis'`, `'Ġdifferent'`, `'Ġsentence'`) is **not an actual character in the final tokenized text**—it's just a **display artifact** from how some tokenizers (especially Byte-Level BPE tokenizers and GPT-style tokenizers) represent **word boundaries**.

### 🔍 What `Ġ` Means

* `Ġ` (Unicode U+0120) is used by some tokenizers to indicate that the token starts with a **space**.
* It helps distinguish between:

  * `"Ġtoken"` → the word "token" preceded by a space.
  * `"token"` → the word "token" without a preceding space (e.g., part of a longer word like `"notoken"`).

### ✅ Why It's Useful

This distinction allows the tokenizer to:

* **Model spaces explicitly**, which is important for text generation and reconstruction.
* **Preserve the structure of the input text** more accurately during training.

### 🧼 If You Want to Remove `Ġ` from Display

Just strip it during display if it's confusing:

```python
print([t.replace("Ġ", "") for t in tokens])
```

### ⚠️ But: Don't Remove It Before Encoding

The tokenizer uses `Ġ` internally to tokenize and reconstruct text correctly. You should **not** remove or alter it in any preprocessing unless you're just cleaning up **display/output**.

---

# creating a tokenizer on a HF dataset

In [9]:
from datasets import load_dataset
import os

dataset = load_dataset("SoorajK1/questions_and_answers")
os.makedirs("data", exist_ok=True)

dataset["train"].to_csv("data/questions_and_answers.csv", index=False)
print("Dataset saved to: data/questions_and_answers.csv")

Creating CSV from Arrow format: 100%|██████████| 30/30 [00:00<00:00, 66.45ba/s]

Dataset saved to: data/questions_and_answers.csv





In [10]:
import re
import unicodedata
import pandas as pd

def normalize_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("’", "'").replace("“", '"').replace("”", '"')
    text = text.replace("–", "-").replace("—", "-")
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [11]:
pd.read_csv("data/questions_and_answers.csv").columns

Index(['Unnamed: 0', 'question', 'answer', 'content_row'], dtype='object')

In [12]:
def load_corpus_from_csv(path: str) -> list:
    df = pd.read_csv(path)
    # Combine questions and answers
    combined_text = (df["question"].astype(str) + "\n" + df["answer"].astype(str)).tolist()
    combined_text = [normalize_text(t) for t in combined_text if t.strip()]
    return combined_text

corpus = load_corpus_from_csv("data/questions_and_answers.csv")
print(f"Loaded {len(corpus)} samples from Q&A dataset")
print(f"Example: {corpus[0]}")

Loaded 29438 samples from Q&A dataset
Example: What is the purpose of the Android App mentioned? The Android App mentioned in the text is designed to collect data using smartphones. It serves as a tool that allows farmers to input and store information related to their farming activities.


In [13]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import ByteLevel

def train_tokenizer(corpus: list, output_dir: str, vocab_size: int = 20000) -> ByteLevelBPETokenizer:
    os.makedirs(output_dir, exist_ok=True)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer._tokenizer.pre_tokenizer = ByteLevel()

    tokenizer.train_from_iterator(
        corpus,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    )
    tokenizer.save_model(output_dir)
    tokenizer.save(f"{output_dir}/tokenizer.json")
    return tokenizer

tokenizer = train_tokenizer(corpus, "my_qa_tokenizer")
print("✅ Tokenizer trained and saved to 'my_qa_tokenizer'")




✅ Tokenizer trained and saved to 'my_qa_tokenizer'


In [14]:
from transformers import PreTrainedTokenizerFast

fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="my_qa_tokenizer/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>"
)

sample_text = "Yes, the Android App can be used offline, without requiring an internet connection for data collection. \
    Farmers can input data into the app even in remote areas where internet access may be limited. \
    The data collected offline can be synced and uploaded to the app's server once an internet connection becomes available."
tokens = fast_tokenizer.tokenize(sample_text)
ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Sample text:", sample_text)
print("Tokens:", [t.replace("Ġ", "") for t in tokens])
print("Token IDs:", ids)

Sample text: Yes, the Android App can be used offline, without requiring an internet connection for data collection.     Farmers can input data into the app even in remote areas where internet access may be limited.     The data collected offline can be synced and uploaded to the app's server once an internet connection becomes available.
Tokens: ['Y', 'es', ',', 'the', 'Android', 'App', 'can', 'be', 'used', 'off', 'line', ',', 'without', 'requiring', 'an', 'internet', 'connection', 'for', 'data', 'collection', '.', '', '', '', '', 'Farmers', 'can', 'input', 'data', 'into', 'the', 'app', 'even', 'in', 'remote', 'areas', 'where', 'internet', 'access', 'may', 'be', 'limited', '.', '', '', '', '', 'The', 'data', 'collected', 'off', 'line', 'can', 'be', 'syn', 'ced', 'and', 'uploaded', 'to', 'the', 'app', "'s", 'ser', 'ver', 'once', 'an', 'internet', 'connection', 'becomes', 'available', '.']
Token IDs: [61, 275, 16, 266, 8822, 2619, 342, 312, 524, 1078, 3138, 16, 1365, 8573, 286, 13133, 9

In [15]:
from huggingface_hub import HfApi

HF_USERNAME = HfApi().whoami()["name"]
REPO_NAME = "qa-tokenizer-bpe"
TOKENIZER_REPO = f"{HF_USERNAME}/{REPO_NAME}"

fast_tokenizer.save_pretrained("my_qa_tokenizer")  # includes config files
fast_tokenizer.push_to_hub(REPO_NAME)

print(f"✅ Tokenizer pushed: https://huggingface.co/{TOKENIZER_REPO}")

No files have been modified since last commit. Skipping to prevent empty commit.


✅ Tokenizer pushed: https://huggingface.co/mushfiqurrobin/qa-tokenizer-bpe


In [16]:
from transformers import PreTrainedTokenizerFast

# Load the tokenizer from the directory
fast_tokenizer = PreTrainedTokenizerFast.from_pretrained("my_qa_tokenizer")

text = "Building a tokenizer from scratch might seem daunting at first, but with the right tools and a step-by-step approach, it's quite manageable."

tokens = fast_tokenizer.tokenize(text)
token_ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Tokens:", [t.replace("Ġ", "") for t in tokens])
print("Token IDs:", token_ids)

Tokens: ['B', 'uilding', 'a', 'to', 'ken', 'izer', 'from', 'scrat', 'ch', 'might', 'seem', 'd', 'a', 'unting', 'at', 'first', ',', 'but', 'with', 'the', 'right', 'tools', 'and', 'a', 'step', '-', 'by', '-', 'step', 'approach', ',', 'it', "'s", 'quite', 'manageable', '.']
Token IDs: [38, 10743, 264, 295, 3743, 7545, 453, 14266, 359, 5625, 15894, 294, 69, 12043, 477, 1273, 16, 1418, 417, 266, 2379, 5852, 292, 264, 2011, 17, 3723, 17, 13507, 1926, 16, 376, 821, 12520, 15040, 18]


# fine tuning a model with the tokenizer

In [17]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="my_qa_tokenizer/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>"
)

In [18]:
import re
import unicodedata
import pandas as pd

def normalize_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("’", "'").replace("“", '"').replace("”", '"')
    text = text.replace("–", "-").replace("—", "-")
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.strip()


def load_corpus_from_csv(path: str) -> list:
    df = pd.read_csv(path)
    # Combine questions and answers
    combined_text = (df["question"].astype(str) + "\n" + df["answer"].astype(str)).tolist()
    combined_text = [normalize_text(t) for t in combined_text if t.strip()]
    return combined_text

corpus = load_corpus_from_csv("data/questions_and_answers.csv")
print(f"Loaded {len(corpus)} samples from Q&A dataset")
print(f"Example: {corpus[0]}")

Loaded 29438 samples from Q&A dataset
Example: What is the purpose of the Android App mentioned? The Android App mentioned in the text is designed to collect data using smartphones. It serves as a tool that allows farmers to input and store information related to their farming activities.


In [19]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples,
        truncation=True,
        max_length=128,
        padding="max_length"
    )

tokenized_dataset = list(map(tokenize_function, corpus))
print(tokenized_dataset[0])

{'input_ids': [59, 302, 298, 266, 655, 282, 266, 8822, 2619, 1396, 35, 320, 8822, 2619, 1396, 287, 266, 1017, 298, 3429, 295, 1761, 1537, 770, 10322, 18356, 18, 485, 3102, 366, 264, 8696, 397, 1186, 537, 295, 1492, 292, 4553, 1267, 1892, 295, 497, 551, 1191, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [20]:
# Convert to HuggingFace Dataset
from datasets import Dataset

dataset_dict = {
    "input_ids": [x["input_ids"] for x in tokenized_dataset],
    "attention_mask": [x["attention_mask"] for x in tokenized_dataset],
}

train_dataset = Dataset.from_dict(dataset_dict)

### 🧠 Context:

You're preparing a dataset for training a model using Hugging Face’s `transformers` library — and more specifically, using their `Trainer` API. For that, your data must be in a `datasets.Dataset` format (from the `datasets` library, also known as 🤗 Datasets).

---

### 🔍 Line-by-line explanation:

```python
from datasets import Dataset
```

* This imports the `Dataset` class from the 🤗 `datasets` library.

---

```python
dataset_dict = {
    "input_ids": [x["input_ids"] for x in tokenized_dataset],
    "attention_mask": [x["attention_mask"] for x in tokenized_dataset],
}
```

* `tokenized_dataset` is likely a list of tokenized examples, where each example is a dictionary with keys like `"input_ids"` and `"attention_mask"` — output from a tokenizer.
* This line is **constructing a dictionary** where:

  * Each key (e.g. `"input_ids"`) maps to a list of values — one per example.
  * You're restructuring the data into **columnar format**, i.e., `{"input_ids": [...], "attention_mask": [...]}` instead of a list of dicts.

---

```python
train_dataset = Dataset.from_dict(dataset_dict)
```

* This converts the `dataset_dict` into a Hugging Face `Dataset` object.
* Now `train_dataset` is a proper `datasets.Dataset`, which you can use with `Trainer`, apply `.map()` for transformations, shuffle it, etc.

---

### ✅ Why do this?

Hugging Face’s `Trainer` expects a `datasets.Dataset` object for training — not just a list of Python dicts. This transformation allows seamless integration with Hugging Face’s training utilities.

---

### 🧪 Example Input/Output

If `tokenized_dataset` looked like:

```python
[
  {'input_ids': [101, 102], 'attention_mask': [1, 1]},
  {'input_ids': [101, 103], 'attention_mask': [1, 1]}
]
```

Then `dataset_dict` becomes:

```python
{
  'input_ids': [[101, 102], [101, 103]],
  'attention_mask': [[1, 1], [1, 1]]
}
```

Which is then converted to a `Dataset` object you can train on.

---

In [None]:
train_dataset[0]

In [22]:
# Define GPT-2 config and model
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_positions=128,
    n_ctx=128,
    n_embd=256,
    n_layer=4,
    n_head=4,
)

model = GPT2LMHeadModel(config)

This cell is **defining and initializing a custom GPT-2 model configuration and model**, tailored for a smaller, resource-friendly version of GPT-2. Here's a breakdown:

---

### 📦 Imports

```python
from transformers import GPT2Config, GPT2LMHeadModel
```

* `GPT2Config`: lets you **customize model hyperparameters** (like number of layers, embedding size, etc).
* `GPT2LMHeadModel`: the GPT-2 model class **with a language modeling (LM) head**, used for **causal language modeling** tasks (predicting next token given previous tokens).

---

### ⚙️ Define Custom GPT-2 Configuration

```python
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,        # Size of the vocabulary (must match tokenizer)
    bos_token_id=tokenizer.bos_token_id,    # Beginning of sequence token ID
    eos_token_id=tokenizer.eos_token_id,    # End of sequence token ID
    n_positions=128,                        # Max sequence length
    n_ctx=128,                              # Max context window (same as above)
    n_embd=256,                             # Embedding size (how big each token vector is)
    n_layer=4,                              # Number of transformer layers (depth)
    n_head=4                                # Number of attention heads
)
```

✅ You're **defining a "mini GPT-2"** — small enough to train quickly on limited hardware (ideal for fine-tuning on custom or educational datasets).

---

### 🧠 Instantiate the Model

```python
model = GPT2LMHeadModel(config)
```

* This creates a **GPT-2 model** initialized **from scratch** (i.e., not loaded from pre-trained weights) using your custom config.
* It's a **decoder-only transformer** for **causal language modeling**, suitable for tasks like text generation, next-token prediction, etc.

---

### Summary

| Component     | Value Set Here | What It Means                        |
| ------------- | -------------- | ------------------------------------ |
| `n_positions` | 128            | Max sequence length (tokens)         |
| `n_embd`      | 256            | Dimensionality of token embeddings   |
| `n_layer`     | 4              | Number of transformer blocks         |
| `n_head`      | 4              | Multi-head attention heads           |
| `vocab_size`  | From tokenizer | Vocabulary size the model can handle |

---

### ✅ This is ideal when:

* You're training GPT-2 **from scratch** on a small dataset.
* You want a **smaller, faster, cheaper** model for prototyping or testing.

Probable parameter counts:

---

### 🔧 Model Configuration Recap

* `n_layer = 4`
* `n_head = 4`
* `n_embd = 256`
* `n_ctx = 128`
* `vocab_size = 20,000`

---

### 📦 Parameter Count by Component

#### 1. **Embeddings**

* **Token Embeddings**: `vocab_size × n_embd = 20,000 × 256 = 5,120,000`
* **Positional Embeddings**: `n_ctx × n_embd = 128 × 256 = 32,768`

✅ **Embeddings Total = \~5.15M**

---

#### 2. **Transformer Blocks (x4 layers)**

Each layer has:

* **Self-Attention**: 4 weight matrices (Q, K, V, Out), each `n_embd × n_embd = 256 × 256 = 65,536`

  * Total = 4 × 65,536 = 262,144
* **Feedforward MLP**:

  * First layer: `256 × 1024 = 262,144`
  * Second layer: `1024 × 256 = 262,144`
  * Total = 524,288

➡️ **Per Layer Total** ≈ 262K + 524K = **786,432**
➡️ **4 Layers Total** = 4 × 786,432 = **3,145,728**

---

#### 3. **Final LayerNorm**: \~256 parameters (very small)

---

#### 4. **LM Head**: Tied with token embeddings, no extra params.

---

### 📊 Final Total Parameter Count

| Component   | Parameters        |
| ----------- | ----------------- |
| Embeddings  | 5,152,768         |
| Transformer | 3,145,728         |
| LayerNorm   | \~256             |
| **Total**   | **\~8.3 million** |

---

### 🧠 Summary

> ✅ **Your GPT-2 model has \~8.3 million parameters** with a vocab size of 20,000 and 4 transformer layers.

In [23]:
import torch
import transformers
import accelerate

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)

torch: 2.6.0+cu124
transformers: 4.52.4
accelerate: 1.7.0


In [24]:
# Define training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-custom",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=20,
    save_steps=50,
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

This cell defines a set of **training configurations** for fine-tuning your custom GPT-2 model using Hugging Face's `Trainer` API. It uses the `TrainingArguments` class from the `transformers` library.

---

## 🔧 Breakdown of Each Argument

```python
training_args = TrainingArguments(
    output_dir="./gpt2-custom",
```

* 📁 Where to save the model checkpoints, logs, etc.

```python
    overwrite_output_dir=True,
```

* ✅ If this directory already exists, it will be **overwritten**. Useful when you're re-training.

```python
    per_device_train_batch_size=8,
```

* 🧠 Number of examples processed **per GPU/CPU** in a single forward/backward pass.
* If you're using 1 GPU: batch size = 8
* If using 2 GPUs: effective batch size = 8 × 2 = 16

```python
    num_train_epochs=3,
```

* 🔁 Train the entire dataset **3 times** (epochs)

```python
    logging_dir="./logs",
```

* 📓 Directory to save logs for TensorBoard or manual inspection

```python
    logging_steps=20,
```

* 🪵 Log training metrics (loss, learning rate, etc.) every 20 steps

```python
    save_steps=50,
```

* 💾 Save a model checkpoint every 50 steps

```python
    save_total_limit=2,
```

* 💼 Keep only the **2 most recent checkpoints** to save disk space
* Older checkpoints are automatically deleted

```python
    fp16=True,
```

* 🧮 Enables **mixed-precision (FP16)** training, reducing memory usage and speeding up training on supported GPUs

```python
    report_to="none"
)
```

* 🛑 Disables reporting to external logging systems (like WandB, Comet, etc.)

---

## ✅ Why This Is Useful

This config:

* Is compact and memory-efficient (good for small models or laptops)
* Saves space by limiting checkpoint storage
* Enables fast training with FP16
* Prepares you to use `Trainer(...)` without manually handling loops, device setup, or logging

---

In [25]:
# Trainer API to fine-tune
from transformers import Trainer, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # CLM = not MLM
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,9.3421
40,8.92
60,8.6563
80,8.3291
100,8.0477
120,7.8312
140,7.6712
160,7.3876
180,7.2484
200,7.1067


TrainOutput(global_step=11040, training_loss=5.080801551929419, metrics={'train_runtime': 4867.237, 'train_samples_per_second': 18.145, 'train_steps_per_second': 2.268, 'total_flos': 214297094651904.0, 'train_loss': 5.080801551929419, 'epoch': 3.0})

This cell fine-tunes your **custom GPT-2 model** using the Hugging Face `Trainer` API — a high-level training loop abstraction that takes care of all the boilerplate (optimizer, scheduling, GPU management, etc.).

---

## 🔍 Explanation Line-by-Line

### 1. **Importing Necessary Tools**

```python
from transformers import Trainer, DataCollatorForLanguageModeling
```

* `Trainer`: Core API for training Hugging Face models.
* `DataCollatorForLanguageModeling`: Prepares batches of tokenized inputs for language modeling tasks (like GPT-2).

---

### 2. **Creating a Data Collator**

```python
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # CLM = not MLM
)
```

* Tells the trainer **how to batch** sequences.
* `mlm=False` → you're doing **causal language modeling (CLM)**, **not** masked language modeling (MLM).

  * MLM is used for BERT.
  * CLM is used for GPT-style models.

---

### 3. **Defining the Trainer**

```python
trainer = Trainer(
    model=model,  # Your GPT-2 variant
    args=training_args,  # Defined earlier
    train_dataset=train_dataset,  # Tokenized & formatted data
    tokenizer=tokenizer,  # Needed for decoding/evaluation
    data_collator=data_collator  # Prepares padded, shifted batches
)
```

* Sets up the full training pipeline using:

  * Your model
  * Your training config (`TrainingArguments`)
  * Your tokenized training data
  * A collator to create batches on the fly

---

### 4. **Training the Model**

```python
trainer.train()
```

* Begins training:

  * Loads data in batches
  * Runs forward and backward passes
  * Applies optimizer updates
  * Logs metrics
  * Saves checkpoints

---

## ✅ Summary

This cell:

* ✅ Automatically manages your training loop
* ✅ Uses GPT-2 style language modeling (next token prediction)
* ✅ Simplifies fine-tuning with minimal custom code

In [26]:
# Save to Hugging Face Hub or local
trainer.save_model("gpt2-custom")
tokenizer.save_pretrained("gpt2-custom")

('gpt2-custom/tokenizer_config.json',
 'gpt2-custom/special_tokens_map.json',
 'gpt2-custom/tokenizer.json')

This cell **saves your fine-tuned GPT-2 model and tokenizer** to a local directory (`gpt2-custom`) so you can reload and use it later.

---

### 🔍 Line-by-Line Explanation

#### ✅ Save the fine-tuned model

```python
trainer.save_model("gpt2-custom")
```

* Saves:

  * The **model weights** (`pytorch_model.bin`)
  * The **model config** (`config.json`)
* Location: local directory named `"gpt2-custom"`

You can later reload it via:

```python
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained("gpt2-custom")
```

---

#### ✅ Save the tokenizer

```python
tokenizer.save_pretrained("gpt2-custom")
```

* Saves:

  * The tokenizer files (e.g., `tokenizer.json`, `vocab.json`, `merges.txt`)
  * A config file (`tokenizer_config.json`)

You can reload it using:

```python
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-custom")
```

---

### 🧠 Why this matters:

Saving both the **model** and **tokenizer** together ensures:

* You can **reload** the model later for inference or further training.
* Anyone else (or yourself in a future session) can reuse the model without retraining.

---

# generation

In [41]:
from transformers import pipeline
pipe = pipeline("text-generation", model="gpt2-custom", tokenizer=tokenizer, device=-1)
pipe("Once upon a time", max_new_tokens=10)

Device set to use cpu


[{'generated_text': "Once upon a time for a business's support to be applied to the"}]


---

### 🧠 Why use `pipeline`?

* It's **quick** and **convenient** for inference/testing.
* Abstracts away the need to manually tokenize and decode text.

---

In [47]:
output = pipe(
    "Once upon a time",
    max_length=10,     # max tokens generated (including prompt length)
    temperature=0.7,   # randomness: lower is more conservative, higher more creative
    top_p=0.9,         # nucleus sampling, keep tokens with cumulative prob up to 0.9
    num_return_sequences=1,  # number of generated sequences
    max_new_tokens=64,
)

print(output[0]['generated_text'])

Both `max_new_tokens` (=64) and `max_length`(=10) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Once upon a time to be used for the second kilogram of a rate of a rate of 1.0 kg per liter of 1.5 kg of water. This method should be applied to control the seed rate of 3. This method is to be applied as per hectare. Additionally, it is the use of 1.5 kg per hectare


# Gradio web app

In [50]:
import gradio as gr
from transformers import pipeline

pipe = pipeline("text-generation", model="gpt2-custom", tokenizer="gpt2-custom", device=-1)

def generate_text(prompt, max_length=64, temperature=0.7, top_p=0.9):
    outputs = pipe(
        prompt,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        num_return_sequences=1,
        max_new_tokens=30
    )
    return outputs[0]['generated_text']

# Define Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter prompt here..."),
        gr.Slider(10, 200, value=50, label="Max Length"),
        gr.Slider(0.1, 1.0, value=0.7, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, label="Top-p (nucleus sampling)"),
    ],
    outputs="text",
    title="Custom GPT-2 Text Generation",
    description="Generate text with your fine-tuned GPT-2 model. Adjust parameters to control creativity."
)

iface.launch()

Device set to use cpu


* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=30) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=30) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=30) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the

training the model for more epochs will improve the result. 

What you’re seeing is a **classic symptom of a small, undertrained language model** — here's why it's happening and what you can do about it.

---

### 🔍 Why This Is Happening

You're using a **tiny custom GPT-2 model** trained:

* From scratch (no pretrained weights),
* With a **small number of layers** (4 layers, 256 embedding dim),
* For **just 3 epochs**, likely on limited or domain-specific data (e.g., agriculture-related or technical instructions).

Because of this:

* The model **hasn't learned general language patterns**.
* It **memorizes patterns** or phrases from training data (e.g., "per hectare", "kg per liter", "seed rate").
* It **regurgitates phrases** even on unrelated prompts due to **overfitting** and **lack of generalization**.

So even if you say `"Once upon a time"` or `"Explain quantum physics"`, it still babbles about kilograms and hectares.

---


### ✅ How to Fix or Improve It

#### **1. Use Pretrained GPT-2 + Fine-tune**

Instead of training from scratch, load `gpt2` (124M) from Hugging Face and fine-tune on your data:

```python
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")  # Load pretrained weights
model.resize_token_embeddings(len(tokenizer))    # Resize for custom tokenizer
```

This way, the model starts with **general language understanding** and just learns your domain.

---

#### **2. Increase Dataset Size + Training Time**

If you **must train from scratch**, then:

* Increase dataset size (ideally **millions of tokens**)
* Train for **at least 10+ epochs**
* Use **FP32** (not FP16) unless you're sure the model is stable
* Add more randomness or augmentation in the data

---

#### **3. Prompt the Model Differently**

Sometimes, providing a more structured or forced prompt helps:

```python
pipe("Write a short story about a dragon and a mountain. Once upon a time", max_new_tokens=50)
```

This forces the model away from its memorized phrasing (though with limited success if it's severely overfit).

---

#### **4. Inspect Training Data Bias**

Check if your training data contains lots of agricultural-style instructional text. That would explain the output you’re getting. The model is just **repeating what it saw most often**.

---

### 🧠 TL;DR

* Your model outputs agriculture-like gibberish because it's **small, undertrained, and overfit**.
* The best fix is to **start from pretrained GPT-2**, then fine-tune.
* Alternatively, collect more **diverse and larger** data, and train for longer.


# folder structure explanation for Trainer

The folder structure you're showing is the **output of a Hugging Face `Trainer` fine-tuning run** for a GPT-2 model. Let me break down the structure and explain what each file or folder does:

---

## 📁 `gpt2-custom/`

This is your **main output directory**, passed via:

```python
TrainingArguments(output_dir="./gpt2-custom")
```

### ✅ Top-level files

These represent the **final saved model state** (after training completes or the last checkpoint):

| File                                       | Purpose                                                                                            |
| ------------------------------------------ | -------------------------------------------------------------------------------------------------- |
| `config.json`                              | GPT-2 model architecture (layers, heads, vocab size, etc.).                                        |
| `pytorch_model.bin` or `model.safetensors` | Final trained model weights. If you chose `safetensors`, it's saved securely and more efficiently. |
| `generation_config.json`                   | Settings for text generation (e.g., temperature, top\_k, max tokens).                              |
| `tokenizer_config.json`                    | Tokenizer configuration (like special tokens).                                                     |
| `tokenizer.json`                           | Full serialized tokenizer with merges/vocab.                                                       |
| `special_tokens_map.json`                  | Maps token names (`<bos>`, `<eos>`, `<pad>`) to actual IDs.                                        |
| `training_args.bin`                        | Your full `TrainingArguments` object saved as a binary.                                            |
| `trainer_state.json`                       | Training progress: steps done, logs, metrics.                                                      |

---

## 📁 `checkpoint-11040/`

This is a **checkpoint saved mid-training** — in your case, at step 11,040 — because you specified:

```python
save_steps=50
```

So every 50 steps, Hugging Face Trainer saves a checkpoint.

### ✅ Files in `checkpoint-11040/`

| File                                       | Purpose                                                             |
| ------------------------------------------ | ------------------------------------------------------------------- |
| `config.json`                              | Copy of model architecture at this step.                            |
| `model.safetensors` or `pytorch_model.bin` | Model weights at this checkpoint.                                   |
| `optimizer.pt`                             | Optimizer state (AdamW, etc.) so training can resume exactly.       |
| `scheduler.pt`                             | Learning rate scheduler state.                                      |
| `scaler.pt`                                | For mixed-precision (FP16) training — stores gradient scaling info. |
| `rng_state.pth`                            | RNG seed state (PyTorch, NumPy, Python) for exact reproducibility.  |
| `trainer_state.json`                       | Metrics and status of this specific checkpoint.                     |

---

### 🔁 Why this is useful

* You can resume training from `checkpoint-11040/` if interrupted.
* You can load a **specific earlier checkpoint** for evaluation or debugging.
* The top-level `gpt2-custom/` contains the **final model** unless you stopped mid-run.

---

### ✅ Example: Load from a checkpoint

```python
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2-custom/checkpoint-11040")
```

Or load from the final trained model:

```python
model = GPT2LMHeadModel.from_pretrained("gpt2-custom")
```

---