# Dataset

In [1]:
import os
currentPath = os.getcwd().replace('\\','/')    # 获取当前路径

print(currentPath)

/home/Arthur/aicup


In [3]:
from datasets import load_dataset, Features, Value

dataset = load_dataset("csv", data_files="PublicDataset_phase3/merged_ALL_20231130.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

In [4]:
from datasets import load_dataset, Features, Value

valid_dataset = load_dataset("csv", data_files="/home/Arthur/aicup/PublicDataset_phase3/merged_Validation_1130.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['fid', 'idx', 'content', 'label'],
        num_rows: 114014
    })
})

For demonstration purpose, we only use the randomly sampled 20000 instances.

# Data loader

In [None]:
!pip install islab-opendeid

# Model

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig

plm = "EleutherAI/pythia-1.4b-deduped"
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(plm)

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)


model = AutoModelForCausalLM.from_pretrained(plm, config = config, quantization_config=bnb_config, device_map={"":0})


<|pad|>: 50278


In [4]:
print(model)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear8bitLt(in_features=2048, out_features=6144, bias=True)
          (dense): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear8bitLt(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear8bitLt(in_features=8192, out_feat

In [6]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, 
    lora_alpha=32, 
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="TOKEN_CLS"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 12582912 || all params: 1427230720 || trainable%: 0.8816312473991591


In [9]:
print_trainable_parameters(model)

trainable params: 12582912 || all params: 1427230720 || trainable%: 0.8816312473991591


In [10]:
print(model)

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50304, 2048)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=6144, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
  

In [10]:
from peft import PeftModel

#### 載入模型用

In [6]:
from peft import PeftModel
model = PeftModel.from_pretrained(model, "/home/Arthur/aicup/quant_model")

In [14]:
model = model.merge_and_unload()



---

In [11]:
def merge_columns(example):
    template = "<|endoftext|> __CONTENT__\n\n####\n\n__LABEL__ <|END|>"
    IGNORED_PAD_IDX = -100
    texts = template.replace("__LABEL__", example['label']).replace("__CONTENT__", example['content'])
    
    encoded_seq = tokenizer(texts, padding="max_length", max_length=100, truncation=True, return_tensors="pt")
    encoded_seq["input_ids"] = encoded_seq["input_ids"][0]
    encoded_seq["attention_mask"] = encoded_seq["attention_mask"][0]
    encoded_label = torch.tensor(encoded_seq['input_ids'])
    encoded_label[encoded_label == tokenizer.pad_token_id] = IGNORED_PAD_IDX
    
    
    encoded_seq["labels"] = encoded_label.tolist()
    
    return encoded_seq

data = dataset['train'].map(merge_columns)
data_valid = valid_dataset['train'].map(merge_columns)

  encoded_label = torch.tensor(encoded_seq['input_ids'])
Map: 100%|██████████| 113169/113169 [00:15<00:00, 7358.28 examples/s]
Map: 100%|██████████| 25739/25739 [00:03<00:00, 6844.61 examples/s]


In [30]:
#tokenizer.padding_side = 'left'

results = tokenizer(["Lab No: 14H02780", "“STOCKDALE” 653 MONAGHAN RD"], padding=True)
print(results['input_ids'])
print()

[[50278, 50278, 50278, 50278, 50278, 50278, 21663, 1621, 27, 1638, 41, 16604, 1438], [1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]]



In [12]:
tmp_data = data.remove_columns(["fid","idx","content","label"])
print(tmp_data["labels"][0])

[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 0, 33590, 1621, 27, 50276, 2693, 39, 520, 2082, 2504, 43, 50279, 1838, 20872, 27, 15630, 39, 520, 2082, 2504, 43, 209, 50277]


In [13]:
tmp_data_valid = data_valid.remove_columns(["fid","idx","content","label"])
print(tmp_data_valid["labels"][0])

[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 0, 33590, 1621, 27, 50276, 2055, 58, 18040, 18040, 45, 50279, 1838, 20872, 27, 2055, 58, 18040, 18040, 45, 209, 50277]


In [17]:
print(len(tmp_data['labels'][0]))

100


In [25]:
model.train()

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50304, 2048)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=6144, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
  

In [26]:
import transformers

# needed for gpt-neo-x tokenizer
#tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tmp_data,
    eval_dataset=tmp_data_valid,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=300,
        #num_train_epochs=8,
        evaluation_strategy="steps",
        eval_steps=100,
        eval_accumulation_steps=4,
        lr_scheduler_type="cosine",
        learning_rate=2e-5,
        fp16=True,
        logging_steps=10,
        logging_dir='./logs',
        output_dir="outputs",
        optim="paged_adamw_8bit",
        #load_best_model_at_end=True,
        save_strategy="steps",
        save_total_limit=2
    ),
    #data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss,Validation Loss
100,1.5596,1.715227
200,1.5197,1.667066
300,1.5681,1.664412


TrainOutput(global_step=300, training_loss=1.5479183260599771, metrics={'train_runtime': 471.4175, 'train_samples_per_second': 10.182, 'train_steps_per_second': 0.636, 'total_flos': 3813719408640000.0, 'train_loss': 1.5479183260599771, 'epoch': 0.04})

In [16]:
best_ckpt_path = trainer.state_best_model_checkpoint

AttributeError: 'Trainer' object has no attribute 'state_best_model_checkpoint'

### 先別用

In [27]:
model_name = "quant_model_1130"
model_dir = f"/home/Arthur/aicup/models/{model_name}"
#torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_best.pt'))
#torch.save(model, os.path.join(model_dir , 'GPT_best_model.pt'))
model.save_pretrained("quant_model_1130")
tokenizer.save_pretrained("quant_model_1130")

('quant_model_1130/tokenizer_config.json',
 'quant_model_1130/special_tokens_map.json',
 'quant_model_1130/tokenizer.json')

: 

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
#model.load_state_dict(torch.load("/home/Lewis/Presentation/test/outputs/checkpoint-2000/optimizer.pt"))
#model = torch.load("/home/Lewis/Presentation/test/checkpoint-6500/optimizer.pt")
model.eval()

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50304, 2048)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=6144, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
  

---

In [20]:
print(model)

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50304, 2048)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=6144, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
  

In [21]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files="PublicDataset_phase3/opendid_valid.tsv", delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'labels': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'labels'])
valid_list= list(valid_data['train'])
valid_list

[{'fid': '1001',
  'idx': 0,
  'content': 'Episode No:  88Y206206L',
  'labels': None},
 {'fid': '1001', 'idx': 24, 'content': '8892062.BPL', 'labels': None},
 {'fid': '1001',
  'idx': 37,
  'content': 'Vatterott, Jerrie CLARENCE',
  'labels': None},
 {'fid': '1001',
  'idx': 65,
  'content': 'Lab No:  88Y20620,88Y20620',
  'labels': None},
 {'fid': '1001', 'idx': 92, 'content': 'Exeter', 'labels': None},
 {'fid': '1001',
  'idx': 99,
  'content': 'DECEPTION BAY  Northern Territory  6845',
  'labels': None},
 {'fid': '1001',
  'idx': 139,
  'content': 'Specimen: Fluid,Tissue',
  'labels': None},
 {'fid': '1001', 'idx': 162, 'content': 'D.O.B:  15/11/2004', 'labels': None},
 {'fid': '1001', 'idx': 181, 'content': 'Sex:  F', 'labels': None},
 {'fid': '1001',
  'idx': 189,
  'content': 'Collected: 20/5/2064 at :',
  'labels': None},
 {'fid': '1001',
  'idx': 215,
  'content': 'Location:  PARKES 8 - GUNNEDAH DISTRICT HOSPITAL',
  'labels': None},
 {'fid': '1001',
  'idx': 264,
  'content':

In [18]:
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50304, 2560)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2560, out_features=7680, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
             

In [22]:
from tqdm import tqdm
from islab.aicup import aicup_predict
import io
BATCH_SIZE = 16

with open("./answer/answer_1130.txt",'w',encoding='utf8') as f:
#with io.open("answer.txt",'w',encoding='utf8') as f:
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = aicup_predict(model, tokenizer, input=seeds)
            for o in outputs:
                f.write(o)
                f.write('\n')

100%|██████████| 2773/2773 [1:13:17<00:00,  1.59s/it]  


In [11]:
inputs = tokenizer("Select the privacy information for me",return_tensors="pt")

In [14]:
inputs = tokenizer("D.O.B:  15/11/2004",return_tensors="pt")


In [15]:
output_ids = model.generate(**inputs)

In [16]:
tokenizer.batch_decode(output_ids)

['D.O.B:  15/11/200415-DATEDATEDATEDATEDATEDATE']

In [57]:
tokenizer.batch_decode([[50277, 50277, 50277, 50277, 50277, 50277, 21663, 1621, 27, 1638, 41, 16604, 1438], [1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]]
)

['<|END|><|END|><|END|><|END|><|END|><|END|>Lab No: 14H02780',
 '“STOCKDALE” 653 MONAGHAN RD']

In [43]:
inputs = tokenizer(valid_list[2]["content"],return_tensors="pt")

In [44]:
tokenizer(valid_list[2]["content"])

{'input_ids': [55, 2569, 1519, 13, 5633, 6595, 8951, 1277, 17740], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}