In [35]:
import gensim
import pandas as pd
import gensim.downloader
import src.utils as utils
from nltk.corpus import stopwords
import numpy as np

# English stop words from NLTK, held in a set for fast membership tests when
# the vocabulary is filtered later in the notebook.
stop_words = set(stopwords.words("english"))

from pymorphy3 import MorphAnalyzer

# pymorphy3 morphological analyzer.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
analyzer = MorphAnalyzer()
from sklearn.feature_extraction.text import CountVectorizer
from src.transform import Normalizer, LengthScaler
from sklearn.pipeline import Pipeline

import faiss

%load_ext autoreload
%autoreload 2

In [2]:
# Requires FAISS. To install it into the kernel's environment, run: %pip install faiss-cpu

In [3]:
def build_flat(dim, build_data, nlist=1024):
    """Build, train and populate an inverted-file (IVF) flat FAISS index.

    Args:
        dim: Dimensionality of the vectors.
        build_data: 2-D array-like of shape ``(n_vectors, dim)``. The same
            rows are used to train the coarse quantizer and are then added
            to the index.
        nlist: Number of inverted lists (coarse clusters). Defaults to 1024,
            the value that used to be hard-coded.

    Returns:
        The trained ``faiss.IndexIVFFlat`` containing every row of
        ``build_data``.

    Raises:
        ValueError: If ``build_data`` is not 2-D or its second axis does not
            match ``dim``.
    """
    # FAISS operates on C-contiguous float32 matrices; convert once so that
    # train() and add() see exactly the same buffer.
    vectors = np.ascontiguousarray(build_data, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] != dim:
        raise ValueError(
            f"build_data must have shape (n, {dim}), got {vectors.shape}"
        )

    # NOTE(review): the coarse quantizer uses L2 while the index ranks results
    # by inner product. The two agree for unit-normalised vectors; confirm the
    # inputs are normalised or that the mismatch is intended.
    quantizer = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
    index.train(vectors)
    index.add(vectors)

    return index


def search_flat(index, query_data, k):
    """Look up the ``k`` nearest neighbours of every query vector.

    Delegates to ``index.search`` and hands its two results back as a
    ``(distances, labels)`` tuple.
    """
    scores, neighbour_ids = index.search(query_data, k)
    return (scores, neighbour_ids)

In [4]:
# Load the jailbreak prompts and replace the raw "jailbreak" text column with
# the output of the project's Normalizer.
data = pd.read_json("data/jailbreak.json")
normalizer = Normalizer()
data["jailbreak"] = normalizer.transform(data.jailbreak)

In [5]:
# Character-level bag of 1- and 2-grams.
char_vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 2))

# Fit the vocabulary on the normalised prompts and display the resulting
# (n_documents, n_features) shape. The sparse result already exposes `.shape`,
# so the matrix is not densified just to inspect its size.
char_vectorizer.fit_transform(data.jailbreak).shape

(104, 494)

In [53]:
# One-off bootstrap, kept for reference: download the pre-trained
# 'glove-wiki-gigaword-50' vectors through gensim and cache them locally with
# utils.save_model_compressed. Uncomment to regenerate the cached file.
# w2v_model = gensim.downloader.load('glove-wiki-gigaword-50')
# utils.save_model_compressed(w2v_model, "w2v_model", 9)

# Load the locally cached embedding model.
w2v_model = utils.load_model("w2v_model")

In [7]:
# Raw vocabulary: the keys of the embedding model's token -> index mapping.
corpus = w2v_model.key_to_index.keys()

In [8]:
import string

# Reference alphabet: the 26 lowercase ASCII letters.
etalon = set(string.ascii_lowercase)


def in_english(line):
    """Tell whether ``line`` consists solely of lowercase ASCII letters.

    Args:
        line: The string to check.

    Returns:
        ``True`` if every character of ``line`` is in ``etalon``, otherwise
        ``False``. An empty string yields ``True`` (nothing violates the
        condition) instead of raising, which the previous ``np.vectorize``
        based implementation did on zero-length input.
    """
    # A plain generator short-circuits on the first foreign character and
    # avoids building a NumPy array per word.
    return all(ch in etalon for ch in line)


# Keep only vocabulary entries made purely of lowercase ASCII letters that are
# not English stop words; `corpus` becomes a list from here on.
corpus = [x for x in corpus if (in_english(x) and x not in stop_words)]

In [9]:
# Number of vocabulary words left after filtering.
len(corpus)

317607

In [10]:
# Chain the character n-gram counter with the project's LengthScaler.
lscaler = LengthScaler()

pipe = Pipeline([("vectorizer", char_vectorizer), ("lscaler", lscaler)])

# NOTE(review): pipe.fit() is never called in the visible cells. transform()
# relies on char_vectorizer having been fitted on the jailbreak texts in an
# earlier cell, and presumably on LengthScaler needing no fitting — confirm.
# The feature space is therefore the n-gram vocabulary of the jailbreak data,
# not of `corpus`.
X = pipe.transform(corpus)

In [11]:
# Sanity check: vectorise two sample words and display the output shape.
dog = pipe.transform(["dog", "transformer"])

dog.shape

(2, 494)

In [12]:
# Take the dimensionality from the data itself rather than hard-coding 494, so
# this cell keeps working when the vectorizer's vocabulary size changes.
index = build_flat(X.shape[1], X)

In [49]:
# Persist the trained index.
# NOTE(review): this writes "index.pt", whereas CorpusSearcher.__init__ in
# this notebook loads "index2.pt" — confirm which file name is intended.
utils.save_model_compressed(index, "index.pt", 9)

In [13]:
# Persist the vectorising pipeline and the filtered vocabulary; CorpusSearcher
# in this notebook loads both artefacts by these file names.
utils.save_model(pipe, "char_grams.pt")
utils.save_model(corpus, "corpus.pt")

In [44]:
from Levenshtein import distance
from typing import Sequence


class CorpusSearcher:
    """Find the closest vocabulary word for each query string.

    A query is vectorised with the saved vectorizer, its nearest vocabulary
    vectors are fetched from the saved index, and among those candidates the
    one with the smallest Levenshtein distance to the query is returned.
    """

    def __init__(
        self,
        corpus_path: str = "corpus.pt",
        vectorizer_path: str = "char_grams.pt",
        index_path: str = "index2.pt",
    ):
        """Load the persisted vocabulary, vectorizer and index.

        Args:
            corpus_path: File holding the vocabulary (sequence of words).
            vectorizer_path: File holding the fitted vectorizer/pipeline.
            index_path: File holding the nearest-neighbour index.
        """
        self.corpus = np.array(utils.load_model(corpus_path))
        self.vectorizer = utils.load_model(vectorizer_path)
        # NOTE(review): the default stays "index2.pt" for backward
        # compatibility, but the notebook cell that persists the index writes
        # "index.pt" — confirm which file is intended.
        self.index = utils.load_model(index_path)

    def _find_k_neib(self, data: Sequence[str], k: int = 25) -> Sequence[Sequence[int]]:
        """Return, per query, the index labels of its ``k`` nearest vectors.

        Rows may contain ``-1`` where the index found fewer than ``k``
        neighbours; callers must filter those out.
        """
        vectors = self.vectorizer.transform(data)
        _, labels = self.index.search(vectors, k)
        return labels

    def _best_candidate(self, x, candidates: Sequence[str]):
        """Return the candidate with the smallest edit distance to ``x``.

        Ties are resolved in favour of the earliest candidate.
        """
        distances = [distance(x, candidate) for candidate in candidates]
        return candidates[int(np.argmin(distances))]

    def find(self, data: Sequence[str], k: int = 25):
        """Map every query word to its best match in the vocabulary.

        Args:
            data: Query words.
            k: Number of nearest neighbours to consider per query.

        Returns:
            A list with one string per query, in input order. When the index
            yields no neighbour at all for a query, the query itself is
            returned unchanged.
        """
        data = np.array(data)
        if data.size == 0:
            return []

        result = []
        for x, idx in zip(data, self._find_k_neib(data, k)):
            idx = np.asarray(idx)
            # The index pads missing neighbours with -1; indexing the corpus
            # with -1 would silently pick its last word as a candidate.
            idx = idx[idx >= 0]
            if idx.size == 0:
                result.append(str(x))
                continue
            result.append(str(self._best_candidate(x, self.corpus[idx])))
        return result

In [16]:
# Sanity check of the Levenshtein distance helper on two short strings.
distance("aaa", "bba")

2

In [50]:
# NOTE: this rebinds the name CorpusSearcher, so the cells that follow use the
# implementation from src.searcher rather than the class of the same name
# defined earlier in this notebook.
from src.searcher import CorpusSearcher

In [51]:
# Instantiate the searcher with its default settings.
searcher = CorpusSearcher()

In [52]:
# Smoke test: look up two misspelled words.
searcher.find(["transfrm", "dogy"])

['transform', 'doggy']

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte