In [30]:
from src.utils import load_model
from src.corpus_keeper import CorpusBuilder, CorpusSearcher
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from src.transform import Normalizer, LengthScaler, ToArray
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearnex import patch_sklearn

patch_sklearn()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [56]:
import re
from sklearn.model_selection import train_test_split

data = pd.read_csv("data/prompts.csv")

data["prompts"] = data.prompts.apply(lambda x: re.sub(r"(\[.*?\]|\{.*?\})", " ", x))

train, test = train_test_split(data, test_size=0.2)

In [57]:
import catboost as ct 

In [58]:
model = ct.CatBoostClassifier()

In [59]:
pipe = Pipeline(
    [
        ("normalizer", Normalizer(use_stemming=False, remove_stopwords=False)),
        ("vectorizer", CountVectorizer(min_df=2, binary=True, ngram_range=(1, 2))),
        ("scaler_1", LengthScaler()),
        ("scaler_2", StandardScaler()),
        ("model", model),
    ]
)

In [60]:
pipe.fit(train.prompts, train.jailbreak)

Learning rate set to 0.010583
0:	learn: 0.6838290	total: 143ms	remaining: 2m 22s
1:	learn: 0.6730418	total: 272ms	remaining: 2m 15s
2:	learn: 0.6656690	total: 393ms	remaining: 2m 10s
3:	learn: 0.6551524	total: 516ms	remaining: 2m 8s
4:	learn: 0.6456578	total: 643ms	remaining: 2m 7s
5:	learn: 0.6379847	total: 760ms	remaining: 2m 5s
6:	learn: 0.6291935	total: 880ms	remaining: 2m 4s
7:	learn: 0.6219681	total: 1s	remaining: 2m 4s
8:	learn: 0.6137411	total: 1.13s	remaining: 2m 3s
9:	learn: 0.6073129	total: 1.25s	remaining: 2m 3s
10:	learn: 0.6001633	total: 1.37s	remaining: 2m 2s
11:	learn: 0.5911424	total: 1.49s	remaining: 2m 2s
12:	learn: 0.5836826	total: 1.6s	remaining: 2m 1s
13:	learn: 0.5778708	total: 1.73s	remaining: 2m 1s
14:	learn: 0.5702765	total: 1.84s	remaining: 2m 1s
15:	learn: 0.5643492	total: 1.96s	remaining: 2m
16:	learn: 0.5586972	total: 2.08s	remaining: 2m
17:	learn: 0.5538073	total: 2.2s	remaining: 1m 59s
18:	learn: 0.5471353	total: 2.32s	remaining: 1m 59s
19:	learn: 0.5408

In [61]:
preds = pipe.predict(test.prompts)

In [62]:
from sklearn.metrics import f1_score
f1_score(preds, test.jailbreak)

0.9576271186440678

In [63]:
from src import utils

In [64]:
utils.save_model(pipe, "catboost_model.pt")