# Load training set and train paragraph vectors
Note: the paragraph vector model has been trained and is downloaded in the `prepare_feature_extraction()` function.

Retraining is therefore optional rather than required.

In [1]:
import numpy as np
import pandas as pd
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from tqdm import tqdm
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from peft import PeftModel
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from huggingface_hub import notebook_login

from utils import (generate_prompt, 
                   generate_test_prompt, 
                    find_all_linear_names, 
                    predict, 
                    evaluate, 
                    load_pretrained_model, 
                    generate_train_val_data,
                    initiate_trainer,
                    initiate_base_model)

import bitsandbytes as bnb
import os
import warnings

os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

In [2]:
# Tunable settings for building the evaluation sample.
TEST_SAMPLE_SEED = 42        # fixed seed so every run evaluates the same columns
TEST_SAMPLES_PER_TYPE = 5    # number of test columns drawn for each type label
MAX_JOINED_CHARS = 512       # columns whose comma-joined values are longer are excluded

# Load the raw test columns and their labels.
test_values = pd.read_parquet('raw/test_values.parquet', engine='pyarrow')
test_labels = pd.read_parquet('raw/test_labels.parquet', engine='pyarrow')

# Attach the label to each column; `.values` aligns by position, not by index.
test_columns = test_values.assign(type=test_labels.type.values)

# Keep only columns that fit the length limit. The earlier version blanked the
# over-long rows to NaN and then called `dropna()` without keeping its result,
# so the rows were only removed as a side effect of `groupby` skipping NaN keys.
# An explicit boolean mask makes the filter visible and leaves the raw frame intact.
fits_limit = test_columns['values'].apply(
    lambda col: len(",".join(col)) <= MAX_JOINED_CHARS
)
test_columns = test_columns[fits_limit]

# Draw a fixed-size, seeded sample per type. Like the original, this raises if a
# type has fewer than TEST_SAMPLES_PER_TYPE remaining columns.
test_sample = test_columns.groupby('type').sample(
    n=TEST_SAMPLES_PER_TYPE, random_state=TEST_SAMPLE_SEED
)

# Generate test prompts and extract true labels
y_true = test_sample.loc[:, 'type']
X_test = pd.DataFrame(test_sample.apply(generate_test_prompt, axis=1), columns=["prediction"])

In [3]:
# Authenticate with the Hugging Face Hub through the interactive login widget.
# NOTE(review): presumably needed because the Llama 3.1 checkpoint loaded in the
# next cell is access-restricted — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Hub identifier of the checkpoint that is fine-tuned further down.
base_model_name = "meta-llama/Meta-Llama-3.1-8B"
# Load the model and its tokenizer through the project helper in utils.
# NOTE(review): any quantization or device placement is decided inside
# initiate_base_model and is not visible here — confirm in utils.
model, tokenizer = initiate_base_model(base_model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Collect module names from the base model; the result is passed to
# initiate_trainer in the training cell.
# NOTE(review): judging by the helper's name these are the model's linear layers
# (likely the adapter targets) — confirm in utils.
modules = find_all_linear_names(model)

In [6]:
# Directory that receives the fine-tuned model and the tokenizer files.
output_dir="Llama-3.1-8b-fine-tuned-model_sherlock/"

# Build the training and validation sets.
# NOTE(review): the sample sizes are presumably per type label (the Map progress
# bars in the output show 1950 and 390 examples) — confirm in utils.
train_data, eval_data = generate_train_val_data(val_sample=5, train_sample=25)

# Assemble the trainer from the base model, data, tokenizer and target modules,
# then run fine-tuning.
trainer = initiate_trainer(
    model,
    train_data,
    eval_data,
    tokenizer,
    modules,
    output_dir,
)
trainer.train()

# Persist the results so the "Test Model" section can reload them from disk.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Prediction and evaluation are intentionally not run in this cell; they live in
# the "Test Model" section, which reloads the saved model first.

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map:   0%|          | 0/1950 [00:00<?, ? examples/s]

Map:   0%|          | 0/390 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
49,0.2851,0.282022
98,0.3511,0.270969
147,0.3185,0.264865
196,0.2576,0.261324


NameError: name 'cta_types' is not defined

# Test Model

In [4]:
# Directory written by the training cell (trainer.save_model / tokenizer.save_pretrained).
fine_tuned_model = "Llama-3.1-8b-fine-tuned-model_sherlock/"
# Rebinds `model` and `tokenizer` to whatever load_pretrained_model returns for
# that directory, replacing the objects bound in earlier cells.
model, tokenizer = load_pretrained_model(fine_tuned_model)

# X_test (prompts) and y_true (labels) come from the test-data cell near the top
# of the notebook, so that cell must have been run in the current kernel.
y_pred = predict(X_test, model, tokenizer)
# Reports overall and per-label accuracy (see the output below).
evaluate(y_true, y_pred)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 390/390 [01:42<00:00,  3.80it/s]

Accuracy: 0.649
Accuracy for label address: 0.800
Accuracy for label affiliate: 1.000
Accuracy for label affiliation: 1.000
Accuracy for label age: 0.800
Accuracy for label album: 0.800
Accuracy for label area: 0.800
Accuracy for label artist: 0.400
Accuracy for label birth Date: 1.000
Accuracy for label birth Place: 1.000
Accuracy for label brand: 0.200
Accuracy for label capacity: 0.600
Accuracy for label category: 0.800
Accuracy for label city: 0.800
Accuracy for label class: 0.200
Accuracy for label classification: 0.000
Accuracy for label club: 0.600
Accuracy for label code: 1.000
Accuracy for label collection: 1.000
Accuracy for label command: 1.000
Accuracy for label company: 1.000
Accuracy for label component: 0.800
Accuracy for label continent: 1.000
Accuracy for label country: 0.800
Accuracy for label county: 0.800
Accuracy for label creator: 0.400
Accuracy for label credit: 0.400
Accuracy for label currency: 1.000
Accuracy for label day: 0.000
Accuracy for label depth: 0.000




In [None]:
# Save the currently bound model and tokenizer under a second directory name.
# NOTE(review): this path ("...-model_sherlock-CTA/") differs from the directory
# used for the training output and for loading in the test cell
# ("...-model_sherlock/") — confirm which one is meant to be kept.
model_dir = "Llama-3.1-8b-fine-tuned-model_sherlock-CTA/"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)