In [6]:
from src.training.dataset import load_jsonl, build_label_maps, NaicsTextDataset
from transformers import AutoTokenizer

# Load the training rows from the JSONL file; the bare name on the last line
# makes the notebook display them.
# NOTE(review): this displays every row -- consider `rows[:5]` for large files.
rows = load_jsonl('data/train.jsonl')
rows


[{'business_name': 'ACME Plumbing',
  'text': 'plumbing leak repair drain cleaning water heater installation',
  'label': '238220'},
 {'business_name': 'Bright Spark Electric',
  'text': 'electrical contractor wiring panel upgrade lighting installation',
  'label': '238210'},
 {'business_name': 'CoolAir HVAC',
  'text': 'heating air conditioning hvac furnace installation maintenance',
  'label': '238220'},
 {'business_name': 'RoofPro',
  'text': 'roofing contractor shingle replacement gutter repair storm damage',
  'label': '238160'},
 {'business_name': 'FixIt Industrial',
  'text': 'industrial equipment repair maintenance machine service technician',
  'label': '811310'},
 {'business_name': 'PipeSupply Wholesale',
  'text': 'plumbing supplies wholesale pipes valves fittings distributor',
  'label': '423720'},
 {'business_name': 'Drain Masters',
  'text': 'drain cleaning sewer line unclogging plumbing emergency service',
  'label': '238220'},
 {'business_name': 'PowerGrid Wiring',
  't

In [7]:
# Derive the label -> index and index -> label lookups from the loaded rows,
# then print both, one per line, for inspection.
l2i, i2l = build_label_maps(rows)
print(l2i, i2l, sep="\n")

{'238160': 0, '238210': 1, '238220': 2, '423720': 3, '811310': 4}
{0: '238160', 1: '238210', 2: '238220', 3: '423720', 4: '811310'}


In [8]:
# Fetch the tokenizer for the DistilBERT checkpoint and display its configuration.
tok = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tok

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [9]:
# Wrap the rows in the project's dataset class.
# NOTE(review): the last argument (128) is presumably the max sequence length;
# samples come back with 128 token ids -- confirm against NaicsTextDataset.
ds = NaicsTextDataset(rows, l2i, tok, 128)
ds

<src.training.dataset.NaicsTextDataset at 0x159f338e0>

In [10]:
# Pull the first sample out of the dataset and display it.
x = ds[0]
x

{'input_ids': tensor([  101,  9353,  4168, 27902, 27902, 17271,  7192, 12475,  9344,  2300,
          3684,  2121,  8272,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [12]:
# Summarise the sample: container type, the two tensor shapes, and the encoded label.
print(
    type(x),
    x['input_ids'].shape,
    x['attention_mask'].shape,
    x['labels'],
)

<class 'dict'> torch.Size([128]) torch.Size([128]) tensor(2)


In [13]:
from torch.utils.data import DataLoader

# Batch the dataset four samples at a time, in order, and take the first batch.
dl = DataLoader(ds, batch_size=4, shuffle=False)
batch = next(iter(dl))

# Expected shapes, one per line: [4, 128], [4, 128], [4].
print(
    batch["input_ids"].shape,
    batch["attention_mask"].shape,
    batch["labels"].shape,
    sep="\n",
)


torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4])


In [14]:
import json
from typing import List, Dict
from transformers import AutoTokenizer

# Assumes the project provides build_label_maps(rows) -> (l2i, i2l)
# Assumes NaicsTextDataset has already been defined

def read_jsonl_head(path: str, n: int) -> List[Dict]:
    """Read at most the first ``n`` non-blank records of a JSON Lines file.

    Args:
        path: Location of a UTF-8 encoded file with one JSON value per line.
        n: Maximum number of records to return. Zero or a negative value
            yields an empty list.

    Returns:
        The parsed records in file order. The list is shorter than ``n`` when
        the file has fewer non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows: List[Dict] = []
    with open(path, "r", encoding="utf-8") as f:
        # The loop below only checks the limit *after* appending, so without
        # this guard a request for n <= 0 records would still return one.
        if n <= 0:
            return rows
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are skipped and do not count towards the limit.
                continue
            rows.append(json.loads(line))
            if len(rows) >= n:
                break
    return rows

# Smoke-test subset: only the first five records are wrapped in the dataset below.
rows = read_jsonl_head("data/train.jsonl", 5)

# Build the label maps from the full training file rather than the 5-row head:
# the head does not contain every label ('423720' first appears in a later row),
# and a later cell sizes the classifier head with len(l2i).
l2i, i2l = build_label_maps(load_jsonl("data/train.jsonl"))

tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
ds = NaicsTextDataset(rows, l2i, tok, 128)

sample = ds[0]  # index once instead of calling ds[0] for every print
print(sample.keys())
print(sample["input_ids"].shape)
print(sample["labels"], sample["labels"].dtype)


dict_keys(['input_ids', 'attention_mask', 'labels'])
torch.Size([128])
tensor(2) torch.int64


In [15]:
import torch

# Sanity-check one sample: the token ids must be a flat (1-D) tensor and the
# label must be stored as int64.
x = ds[0]
print(x["input_ids"].shape)
assert x["input_ids"].dim() == 1
assert x["labels"].dtype == torch.int64
print("sanity check passed")


torch.Size([128])
sanity check passed


In [17]:
from torch.utils.data import DataLoader

# Batch the dataset four samples at a time, in order, and take the first batch.
dl = DataLoader(ds, batch_size=4, shuffle=False)
batch = next(iter(dl))

# Expected shapes, one per line: [4, 128], [4, 128], [4]; the loader's repr follows.
print(
    batch["input_ids"].shape,
    batch["attention_mask"].shape,
    batch["labels"].shape,
    dl,
    sep="\n",
)


torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4])
<torch.utils.data.dataloader.DataLoader object at 0x159f30fd0>


In [18]:
from transformers import AutoModelForSequenceClassification

# The classifier is sized to the number of entries in the label map.
num_labels = len(l2i)

# Load the pretrained DistilBERT checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=i2l,      # optional: makes output more readable (label names shown at prediction time)
    label2id=l2i,      # optional: same as above
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
##training


%python src.training.train configs/train.yaml

UsageError: Line magic function `%python` not found (But cell magic `%%python` exists, did you mean that instead?).


In [None]:
from src.naics_inference.inference import InferenceModel

# Load the inference wrapper from the saved artifacts directory.
m = InferenceModel('artifacts/distilbert_naics')
# predict() takes the business name and the description text as two separate
# positional arguments; passing one merged string leaves `text` unfilled.
print(m.predict('ACME Plumbing', 'leak repair drain cleaning water heater', topk=3))


In [15]:
import importlib
import src.naics_inference.inference as inf

# Re-execute the module so edits made on disk take effect without restarting
# the kernel, then re-import the class so the name points at the fresh definition.
inf = importlib.reload(inf)
from src.naics_inference.inference import InferenceModel

m = InferenceModel("artifacts/distilbert_naics")
print(m.predict("ACME Plumbing", "leak repair drain cleaning water heater", topk=3))




default_max_length: 128
label_list len: 5 sample: ['238160', '238210', '238220']
<class 'list'> 5 ['238160', '238210', '238220']
input_ids shape: torch.Size([1, 128])
mask tail: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
logits shape: (5,) top_idx: [2, 1, 0]
[Prediction(code='238220', prob=0.32129210233688354), Prediction(code='238210', prob=0.18525734543800354), Prediction(code='238160', prob=0.17992505431175232)]


In [10]:
import inspect
# Show the parameters predict() accepts, as a check on how it must be called.
print(inspect.signature(m.predict))


(business_name: str, text: str, topk: int = 3, max_length: int = 128) -> List[src.naics_inference.inference.Prediction]


In [11]:
# Call predict() with the business name and the description as separate arguments.
print(m.predict("ACME Plumbing", "leak repair drain cleaning water heater", topk=3))


input_ids shape: torch.Size([1, 128])
mask tail: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
logits shape: (5,) top_idx: [2, 1, 0]
[Prediction(code='238220', prob=0.32129210233688354), Prediction(code='238210', prob=0.18525731563568115), Prediction(code='238160', prob=0.17992505431175232)]
