# 📰 10 — Text Classification (Sentiment/Topic Analysis)

**Bag-of-Words MLP for on-device text classification — realistic for command parsing on ESP32**

| Property | Value |
|----------|-------|
| **Task** | News topic classification |
| **Categories** | World, Sports, Business, Sci/Tech |
| **Input** | 200-dim bag-of-words vector |
| **Architecture** | Dense(200→128) → ReLU → Dense(128→64) → ReLU → Dense(64→4) |
| **MCU Memory** | ~34KB Flash + 8KB Arena |

### Why Bag-of-Words on MCU?
Full transformer models need MB of RAM. BoW + MLP needs <50KB total,
making it viable for command parsing, alert classification, and simple NLP on edge devices.


## Setup

In [None]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
from nano_rust_py.utils import quantize_to_i8, quantize_weights, calibrate_model
import nano_rust_py

# Human-readable topic names; listed in the same order as the integer class
# labels (0..3) used by the rest of the notebook.
CATEGORIES = ['World', 'Sports', 'Business', 'Sci/Tech']
N_CLASSES = len(CATEGORIES)

# Width of the bag-of-words input vector (and of the first Dense layer's input).
VOCAB_SIZE = 200

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')


## Step 1: Generate Text Data

We use synthetic text with category-specific vocabulary distributions.
Each category has 24 distinctive words, plus a shared list of 29 common-word entries (28 unique — `been` is listed twice).
In production, replace with real AG News / custom dataset.


In [None]:
# Seed NumPy's global RNG so the synthetic corpus is reproducible run-to-run.
np.random.seed(42)

# Topic-specific vocabulary keyed by integer class label; 24 words per class.
WORD_POOLS = {
    0: ('war peace president election government minister country treaty '
        'united nations policy crisis diplomacy summit conflict border '
        'refugee military security alliance vote parliament democracy law').split(),
    1: ('game team player score win match champion league season goal '
        'coach tournament final record olympic medal race training '
        'stadium football basketball tennis soccer athlete').split(),
    2: ('market stock price company profit revenue growth economy trade '
        'investment bank finance quarter earnings share billion dollar '
        'ceo merger acquisition startup venture inflation tax').split(),
    3: ('software computer data internet technology research science algorithm '
        'network digital system device robot artificial intelligence quantum '
        'chip cloud cyber innovation machine learning neural genome').split(),
}

# Filler words mixed into every class. 'been' appears twice on purpose of
# fidelity: removing the duplicate would change the sampling distribution.
COMMON = ('the is was are been have had will said new year first '
          'also would could after more about between has their from '
          'other been made world time just most').split()

def generate_text(n_per_class, n_words_range=(20, 50)):
    """Generate a synthetic labelled corpus from the per-class word pools.

    For every class label in ``range(N_CLASSES)`` this produces
    ``n_per_class`` documents. Each document's word count comes from
    ``np.random.randint(*n_words_range)`` (so the upper bound is exclusive);
    60% of the words (rounded down) are sampled with replacement from that
    class's ``WORD_POOLS`` entry and the rest from ``COMMON``, then shuffled.

    Args:
        n_per_class: Number of documents to generate for each class.
        n_words_range: Arguments forwarded to ``np.random.randint``.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists: space-joined documents
        and their integer class labels, grouped by class in ascending order.
    """
    texts, labels = [], []
    for class_id in range(N_CLASSES):
        topic_words = WORD_POOLS[class_id]
        for _ in range(n_per_class):
            doc_len = np.random.randint(*n_words_range)
            n_topic = int(doc_len * 0.6)
            # Draw order (topic words, then filler, then shuffle) is kept
            # fixed so the seeded RNG stream yields the same corpus.
            topical = np.random.choice(topic_words, n_topic, replace=True)
            filler = np.random.choice(COMMON, doc_len - n_topic, replace=True)
            tokens = list(topical) + list(filler)
            np.random.shuffle(tokens)
            texts.append(' '.join(tokens))
            labels.append(class_id)
    return texts, labels

# Training corpus is drawn first, then the test corpus, from the same RNG stream.
train_texts, train_labels = generate_text(1000)
test_texts, test_labels = generate_text(200)

# Build vocabulary: the VOCAB_SIZE most frequent lower-cased training tokens,
# each mapped to its rank (which becomes its column in the BoW vector).
word_counts = Counter(
    token for text in train_texts for token in text.lower().split()
)
vocab = [word for word, _count in word_counts.most_common(VOCAB_SIZE)]
word2idx = dict(zip(vocab, range(len(vocab))))
print(f'Vocab: {len(vocab)} words | Train: {len(train_texts)} | Test: {len(test_texts)}')


In [None]:
def text_to_bow(text):
    """Convert a document into a normalised bag-of-words vector.

    The text is lower-cased and split on whitespace; every token found in
    ``word2idx`` increments its column, and unknown tokens are ignored.

    Args:
        text: The document as a single string.

    Returns:
        A float32 array of length ``VOCAB_SIZE`` holding relative term
        frequencies (entries sum to 1), or all zeros when no token is in
        the vocabulary.
    """
    bow = np.zeros(VOCAB_SIZE, dtype=np.float32)
    for token in text.lower().split():
        column = word2idx.get(token)
        if column is not None:
            bow[column] += 1
    total = bow.sum()
    if total > 0:
        # Normalise counts so documents of different lengths are comparable.
        bow /= total
    return bow

# Vectorise both corpora into (n_docs, VOCAB_SIZE) float32 matrices.
X_train = np.array([text_to_bow(t) for t in train_texts])
X_test = np.array([text_to_bow(t) for t in test_texts])
y_train = np.array(train_labels, dtype=np.int64)
y_test = np.array(test_labels, dtype=np.int64)

# generate_text emits documents grouped by class, so shuffle the training
# set once up front (features and labels with the same permutation).
idx = np.random.permutation(len(X_train))
X_train, y_train = X_train[idx], y_train[idx]

train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
# Pinned host memory only helps host-to-GPU copies; requesting it on a
# CPU-only machine is pointless and makes PyTorch emit a warning.
train_loader = DataLoader(
    train_ds,
    batch_size=128,
    shuffle=True,
    pin_memory=(device.type == 'cuda'),
)
print('BoW features ready.')


## Step 2: Train MLP (20 epochs)

In [None]:
# Three-layer MLP: VOCAB_SIZE -> 128 -> 64 -> N_CLASSES with ReLU in between.
# In nn.Sequential the Linear layers sit at indices 0, 2 and 4.
model = nn.Sequential(
    nn.Linear(VOCAB_SIZE, 128), nn.ReLU(),
    nn.Linear(128, 64),         nn.ReLU(),
    nn.Linear(64, N_CLASSES),
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.002)
criterion = nn.CrossEntropyLoss()
EPOCHS = 20

t0 = time.time()
for epoch in range(EPOCHS):
    model.train()
    # Running training accuracy for this epoch, measured on each batch
    # before that batch's optimizer step is applied.
    correct, total = 0, 0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        out = model(features)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        correct += out.argmax(1).eq(labels).sum().item()
        total += labels.size(0)
    # Report every fifth epoch. The separator was a mis-encoded em dash
    # ("‚Äî") in the original message; it is restored to the real character.
    if (epoch + 1) % 5 == 0:
        print(f'  Epoch {epoch+1}/{EPOCHS} — Acc: {100.*correct/total:.1f}%')
train_time = time.time() - t0


## Step 3: Evaluate & Quantize

In [None]:
# Float32 reference accuracy: one forward pass over the whole test matrix.
model.eval()
with torch.no_grad():
    test_logits = model(torch.from_numpy(X_test).to(device))
pt_pred = test_logits.argmax(1).cpu().numpy()
correct_pt = (pt_pred == y_test).sum()
pt_acc = 100. * correct_pt / len(y_test)
print(f'PyTorch Test Accuracy: {pt_acc:.2f}%')


In [None]:
# nn.Module.cpu() moves the parameters in place and returns the same module,
# so `model` and `model_cpu` refer to one CPU-resident network from here on.
model_cpu = model.cpu().eval()
q_weights = quantize_weights(model_cpu)

# One input scale for every sample: the largest absolute feature value seen
# in train or test maps to 127.
global_max = float(max(np.abs(X_train).max(), np.abs(X_test).max()))
global_scale = global_max / 127.0
print(f'Global scale: {global_scale:.6f}')

def quantize_global(data):
    """Quantize float features to int8 using the shared ``global_scale``.

    Values are divided by ``global_scale``, rounded, clipped to
    [-128, 127] and cast to ``np.int8``.
    """
    scaled = np.round(data / global_scale)
    return np.clip(scaled, -128, 127).astype(np.int8)

# NOTE(review): calibration sees only the first test sample — confirm whether
# calibrate_model is meant to receive a larger, representative batch.
cal_input = torch.from_numpy(X_test[:1])
requant = calibrate_model(model_cpu, cal_input, q_weights, global_scale)


## Step 4: NANO-RUST Test

In [None]:
def build_nano():
    """Assemble the quantized NANO-RUST model mirroring the PyTorch MLP.

    Creates a ``PySequentialModel`` with a ``VOCAB_SIZE`` input and an
    8192-byte arena, then adds one requantizing dense layer per entry of
    ``q_weights`` / ``requant`` under keys '0', '2' and '4', with a ReLU
    after every dense layer except the last.

    Returns:
        The populated ``nano_rust_py.PySequentialModel``.
    """
    nano = nano_rust_py.PySequentialModel(input_shape=[VOCAB_SIZE], arena_size=8192)
    dense_keys = ('0', '2', '4')
    last = len(dense_keys) - 1
    for position, key in enumerate(dense_keys):
        # requant[key] unpacks as (m, s, bc); they are forwarded to the
        # layer constructor in the order (weights, bc, m, s).
        m, s, bc = requant[key]
        flat_weights = q_weights[key]['weights'].flatten().tolist()
        nano.add_dense_with_requant(flat_weights, bc, m, s)
        if position != last:
            nano.add_relu()
    return nano

# Run every test sample through both the int8 NANO-RUST model and the
# float PyTorch model, and tally accuracy, agreement and output error.
N_TEST = len(X_test)
correct_nano = 0
match_count = 0
max_diffs = []
t0 = time.time()
for sample, target in zip(X_test, y_test):
    label = int(target)

    # NOTE(review): the model is rebuilt for every sample, and that cost is
    # included in infer_time — confirm whether forward() can be reused.
    q_feat = quantize_global(sample)
    nano_out = build_nano().forward(q_feat.tolist())
    nano_cls = int(np.argmax(nano_out))

    with torch.no_grad():
        pt_out = model_cpu(torch.from_numpy(sample[np.newaxis, :])).numpy().flatten()
    pt_cls = int(np.argmax(pt_out))

    # Compare outputs in the int8 domain; widen to int32 first so the
    # subtraction cannot wrap around.
    q_pt, _ = quantize_to_i8(pt_out)
    nano_i32 = np.array(nano_out, dtype=np.int8).astype(np.int32)
    diff = np.abs(q_pt.astype(np.int32) - nano_i32)
    max_diffs.append(int(np.max(diff)))

    correct_nano += int(nano_cls == label)
    match_count += int(nano_cls == pt_cls)
infer_time = time.time() - t0


## 📊 Results

In [None]:
# Summary metrics: quantized accuracy, and how often the quantized model
# picks the same class as the float PyTorch model.
nano_acc = 100. * correct_nano / N_TEST
agreement = 100. * match_count / N_TEST
# Byte size of the quantized weight arrays only; biases are not counted.
total_w = sum(q['weights'].nbytes for q in q_weights.values())
print('=' * 60)
print('       TEXT CLASSIFICATION RESULTS')
print('=' * 60)
print(f'Categories: {", ".join(CATEGORIES)}')
print(f'Vocab size: {VOCAB_SIZE}')
print(f'PyTorch Accuracy:     {pt_acc:.2f}%')
print(f'NANO-RUST Accuracy:   {nano_acc:.2f}% (n={N_TEST})')
print(f'Classification Match: {agreement:.1f}%')
print(f'Max Diff (median):    {int(np.median(max_diffs))}')
print(f'Memory: {total_w:,} bytes ({total_w/1024:.1f}KB) + 8KB arena')
# Weights plus the 8192-byte arena against a 520 KB budget
# (presumably the ESP32's SRAM size — TODO confirm).
print(f'Fits ESP32? {"YES" if total_w + 8192 < 520*1024 else "NO"}')
print('=' * 60)
# The pass/fail markers were mis-encoded ("‚úÖ" / "‚ùå") in the original
# strings; they are restored to the intended check-mark and cross emoji.
print(f'{"✅ PASS" if agreement > 85 else "❌ FAIL"}: {agreement:.1f}% agreement')


## 📝 Key Takeaways

- Bag-of-Words + MLP is a viable NLP approach for MCUs
- Total model size ~34KB — fits comfortably on ESP32
- Can be used for: command parsing, alert classification, spam filtering
- For more complex NLP, consider DistilBERT → knowledge distillation → small MLP
