In [5]:
from pathlib import Path

base_path = Path(".").absolute().parents[0]

In [6]:
import sys

sys.path.insert(0, str(base_path / "src"))

# Import package

In [11]:
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoTokenizer

from attribute_extraction.models.attribute_classification import MultiAttributeClassifier
from attribute_extraction.models.inference_utils import predict_attribute_dataset
from attribute_extraction.models.mapper import Mapper

# Setting up variables

In [8]:
attribute_code_col = "attribute_code"
attribute_lov_col = "lov_code"
context_col = "context"

pred_label_col = "predicted_labels"
pred_score_col = "predicted_scores"

training_path = "../outputs_train_workflow/"
test_set_uri = "../data/test_formatted.csv"
prediction_attribute_classification_uri = "../outputs_predict_workflow/"

model_path = f"{training_path}model.ckpt"
mapper_path = f"{training_path}mapper.json"

model_name = "distilbert-base-multilingual-cased"
lov_attribute_codes = ["02419", "01746", "00562", "15344", "99999"]
batch_size = 128 
projection_dim = 256

# Read data and pre-processing

In [9]:

data_test = pd.read_csv(test_set_uri).reset_index(drop=True)

data_test['attribute_code'] = data_test['attribute_code'].apply(lambda x: str(x).zfill(5))
data_test['lov_code'] = data_test['lov_code'].apply(lambda x: str(x).zfill(5))

data_test = data_test.fillna('')

data_test[context_col] = data_test.title + " " + data_test.description_clean

# Model Initialisation

In [12]:
mapper = Mapper.load(mapper_path)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = MultiAttributeClassifier(
    vocab_size=tokenizer.vocab_size,
    class_config=mapper.mappings,
    projection_dim=projection_dim,
)

model.load_state_dict(
    torch.load(
        model_path,
        map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )["state_dict"]
)


TypeError: __init__() missing 1 required positional argument: 'model_name_or_path'

# Predictions

In [None]:
dataset_list = []

for attribute_code in lov_attribute_codes:

    attribute_data = data_test[data_test[attribute_code_col] == attribute_code].reset_index(
        drop=True
    )

    predictions = predict_attribute_dataset(
        model=model,
        tokenizer=tokenizer,
        dataframe=attribute_data,
        context_col_name=context_col,
        batch_size=batch_size,
        attribute_code_col_name=attribute_code_col,
        attribute_code=attribute_code,
    )

    attribute_data[pred_label_col] = [prediction[0]["label"] for prediction in predictions]
    attribute_data[pred_score_col] = [prediction[0]["score"] for prediction in predictions]

    attribute_data["is_top1"] = [
        int(label in [x["label"] for x in prediction[:1]])
        for label, prediction in zip(attribute_data[attribute_lov_col], predictions)
    ]
    attribute_data["is_top3"] = [
        int(label in [x["label"] for x in prediction[:3]])
        for label, prediction in zip(attribute_data[attribute_lov_col], predictions)
    ]
    attribute_data["is_top5"] = [
        int(label in [x["label"] for x in prediction])
        for label, prediction in zip(attribute_data[attribute_lov_col], predictions)
    ]

    dataset_list.append(attribute_data)


# Saving Predictions

In [None]:
prediction_data = pd.concat(dataset_list)

prediction_data.to_csv(f"{prediction_attribute_classification_uri}predictions.csv", index=False)