In [1]:
import csv
import os
from typing import List

import pandas as pd
import torch
from torch.distributed import destroy_process_group, init_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from transformers import AutoTokenizer

from ex_params import (
    BASELINE_MODELS,
    CHECKPOINTS_PATH,
    DATASETS_PATH,
    MODEL_PATH,
    PAD_TOKENS,
    PREDICTIONS_PATH,
    SEED,
    TRAINING_HISTORY_PATH,
)
from ex_utils import TextDataset, collate_fn, collate_fn_longest, evaluate_test
from models import BaselineClassifier, FineTuneClassifier, FineTuneClassifierPhi


from torch.utils.data import Sampler

torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
torch.set_float32_matmul_precision("high")

In [2]:
def path2model(path: str):
    if "baseline" in path:
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
        tokenizer.pad_token = "<|finetune_right_pad_id|>"
        tokenizer.padding_side = "left"

        model_size = path.split("_")[1]
        model_config = BASELINE_MODELS[model_size]
        model = BaselineClassifier(
            d_model=model_config["d_model"],
            num_layers=model_config["num_layers"],
            nhead=model_config["num_heads"],
            max_seq_length=model_config["max_len"],
            vocab_size=len(tokenizer),
            pad_token_id=tokenizer.pad_token_id,
            num_labels=1,
        )
        state_dict = torch.load(path, map_location="cpu")

        new_state_dict = {}
        for k, v in state_dict.items():
            new_k = k.replace("_orig_mod.", "") if k.startswith("_orig_mod.") else k
            new_state_dict[new_k] = v

        model.load_state_dict(new_state_dict)

    elif "finetune" in path:
        base_model = path.split("_")[2]
        folder = base_model2folder(base_model)
        base_model_path = os.path.join(MODEL_PATH, folder, base_model)

        tokenizer = AutoTokenizer.from_pretrained(
            base_model_path, trust_remote_code=True
        )
        if base_model in PAD_TOKENS.keys():
            tokenizer.pad_token = PAD_TOKENS[base_model]
        tokenizer.padding_side = "left"

        if "phi" in path.lower():
            model = FineTuneClassifierPhi.from_classifier_head(
                base_model_path=base_model_path,
                path=path,
                num_labels=1,
            )
        else:
            model = FineTuneClassifier.from_classifier_head(
                base_model_path=base_model_path,
                path=path,
                num_labels=1,
            )
    else:
        raise ValueError("Unknown model type")

    return model, tokenizer

In [3]:
path  = "../../../checkpoints/baseline/baseline_mini_master-large.pt"

In [4]:
model, tokenizer = path2model(path)

  state_dict = torch.load(path, map_location="cpu")


In [26]:
from tqdm import tqdm
device = "cuda"
model.to(device)

BaselineClassifier(
  (token_embedding): Embedding(128256, 324, padding_idx=128004)
  (pos_embedding): Embedding(8192, 324)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=324, out_features=324, bias=True)
        )
        (linear1): Linear(in_features=324, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=324, bias=True)
        (norm1): LayerNorm((324,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((324,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features=648, out_features=1, bias=True)
)

In [56]:
texts = ["""We sat at a small café by the gate, talking about movies, travel, and life like old friends. As he boarded his flight, he turned back and said, “Stay inspired — maybe one day I’ll be in your movie.”"""]
labels = [0]

In [57]:
ds = TextDataset(texts, labels)

In [58]:
loader = DataLoader(
    ds,
    batch_size=1,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, tokenizer),
)

In [59]:
with torch.no_grad():
    for batch in tqdm(loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            outputs = model(input_ids)

            mask = labels.view(-1) != -100
            labels = labels.view(-1)[mask].float()
            outputs = outputs.view(-1)[mask]


        logits = torch.sigmoid(outputs).squeeze().float().cpu().numpy()
        labels = labels.squeeze().cpu().numpy()

Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]


In [60]:
logits

array([0.65234375, 0.640625  , 0.66796875, 0.6328125 , 0.65234375,
       0.63671875, 0.65625   , 0.66796875, 0.6484375 , 0.62109375,
       0.65234375, 0.671875  , 0.68359375, 0.6484375 , 0.66796875,
       0.6484375 , 0.65625   , 0.63671875, 0.65234375, 0.671875  ,
       0.6640625 , 0.66796875, 0.6796875 , 0.6875    , 0.66015625,
       0.6640625 , 0.72265625, 0.69140625, 0.69921875, 0.66796875,
       0.6953125 , 0.69921875, 0.69921875, 0.71875   , 0.68359375,
       0.67578125, 0.69140625, 0.70703125, 0.65234375, 0.69921875,
       0.63671875, 0.69921875, 0.69921875, 0.6875    , 0.69140625,
       0.6640625 , 0.68359375, 0.66796875, 0.6796875 ], dtype=float32)