In [None]:
import warnings

import pandas as pd
import torch
import wandb
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer
)

# Silence every Python warning for the rest of the session so the output stays readable.
warnings.simplefilter("ignore")

# Start wandb in disabled mode so nothing is logged or uploaded from this notebook.
wandb.init(mode="disabled")

# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.pandas()

In [None]:
# Alternative source (parquet master data), kept for reference:
# df = pd.read_parquet("../dataset/207.masterdata.parquet")

# Load the classified item/account CSV; malformed rows are skipped instead of raising.
df = pd.read_csv(
    "../dataset/item_account_classified_group_207_new.csv",
    on_bad_lines="skip",
)

In [None]:
# Number of rows that were loaded (displayed as the cell output).
df.shape[0]

In [None]:
# Optional: uncomment to work on a random 1,000-row subsample (e.g. for a quick trial run).
# df = df.sample(n=1000).reset_index(drop=True)
# df

In [None]:
# Load the fine-tuned sequence-classification checkpoint and its tokenizer.
peft_model_id = '../outputs/llm/mistral/checkpoint-67390'

# The end-of-sequence token is reused as the padding token so that batches can be padded.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
tokenizer.pad_token = tokenizer.eos_token

# Three output classes; the model config must point at the same pad token id as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    peft_model_id,
    num_labels=3,
)
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Maps the predicted class index to its label name.
cls_label = dict(enumerate(('beverage', 'food', 'other')))

# Number of item names sent through the model per forward pass.
batch_size = 16

# Switch to evaluation mode, then move the model to the chosen device.
# eval() returns the model itself, so the cell output is unchanged.
model.eval().to(device)

In [None]:
# Classify every item name in mini-batches and store the predicted label in df['mistral'].
#
# Fixes over the previous version of this cell:
#   * the progress-bar total uses ceiling division, so a final partial batch is counted;
#   * predictions are collected in row order and assigned once, positionally, so the
#     result no longer depends on df having a pristine 0..n-1 index (label-based
#     .loc slicing wrote to the wrong rows after filtering/sampling without a reset);
#   * missing / non-string item names are coerced to text so the tokenizer does not
#     raise on NaN values (a missing name is classified as the empty string).
item_names = df['item_name'].fillna('').astype(str).tolist()
num_batches = (len(item_names) + batch_size - 1) // batch_size

predicted_labels = []
for start in tqdm(range(0, len(item_names), batch_size), total=num_batches):
    batch = item_names[start:start + batch_size]
    inputs = tokenizer(batch, padding=True, return_tensors='pt')
    tokenized_inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**tokenized_inputs).logits
    # Softmax is monotonic, so the argmax of the logits is the argmax of the probabilities.
    class_ids = logits.argmax(dim=-1).tolist()
    predicted_labels.extend(cls_label[class_id] for class_id in class_ids)

# A plain list is assigned by position, one label per row, whatever the index looks like.
df['mistral'] = predicted_labels

In [None]:
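# NOTE: predictions are stored in the 'mistral' column; no 'prediction' column is created
# in this notebook. 'status' is presumably the ground-truth label column — confirm before enabling.
# accuracy = (df['mistral'] == df['status']).mean()
# accuracy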

In [None]:
# Write the frame, including the new 'mistral' prediction column, to Excel without the index.
df.to_excel(
    "../dataset/item_account_classified_group_207_new_mistral.xlsx",
    index=False,
)