In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=ec3a0e98db21d94872bc3dd15bf74dc0901df5c31cb61fb8c197a18c193e5060
  Stored in directory: 

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.22-py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.109.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.26.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.5/60.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.3.1-py2.

In [1]:
from sentence_transformers import SentenceTransformer
import chromadb

class EmbeddingFunction:
    def __init__(self):
        self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    def __call__(self, input):
        return self.model.encode(input).tolist()


class DB:

    def __init__(self, distance_function, root_path):
        self.ef = EmbeddingFunction()
        self.client = chromadb.PersistentClient(path=root_path)
        self.distance_function = distance_function
        assert distance_function in ["l2", "ip", "cosine"], "Distance function should be 'l2' or 'ip' or 'cosine'"
        self.collection = self.client.get_or_create_collection("lab5_" + self.distance_function,
                                                               metadata={"hnsw:space": self.distance_function},
                                                               embedding_function=self.ef)

    def add(self, items):
        old_batch = 0
        new_batch = 1000
        while True:
            if new_batch > len(items["fragments"]):
                break
            self.collection.add(
                documents=items["fragments"][old_batch:new_batch],
                metadatas=items["metadata"][old_batch:new_batch],
                ids=items["ids"][old_batch:new_batch])
            old_batch = new_batch
            new_batch += 1000
        self.collection.add(
            documents=items["fragments"][old_batch:],
            metadatas=items["metadata"][old_batch:],
            ids=items["ids"][old_batch:])

    def query(self, query, n_results):
        return self.collection.query(query_embeddings=self.ef(query), n_results=n_results)

    def clear(self):
        self.client.delete_collection("lab5_" + self.distance_function)
        self.collection = self.client.get_or_create_collection("lab5_" + self.distance_function,
                                                               metadata={"hnsw:space": self.distance_function},
                                                               embedding_function=self.ef)

In [2]:
import re
import os

def split_to_sent(text):
    sentences = re.split(
        r"(((?<!\w\.\w.)(?<!\s\w\.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s(?=[A-Z]))|((?<![\,\-\:])\n(?=[A-Z]|\" )))", text)[::4]
    return sentences

In [3]:
def split_document(lines, fragment_limit=100):
    sentences = split_to_sent(lines)
    result = []
    fragment = ""
    length = 0
    for s in sentences:
        fragment += s + " "
        length += len(s.split(" "))
        if length > fragment_limit:
            result.append(fragment)
            fragment = ""
            length = 0
    return result

In [4]:
import pandas as pd

def split_dataset(dataset_path, fragment_limit=100):
    result_fragments = []
    metadata = []
    result_ids = []
    t = "train.csv"
    filepath = os.path.join(dataset_path, t)
    df = pd.read_csv(filepath, names=['label', 'Title', 'Description'])
    df['text'] = (df['Title'] + '. ' + df['Description'])
    df.drop(columns=['Title', 'Description'], axis=1, inplace=True)
    for index, row in df.iterrows():
      fragments_raw = split_document(row['text'], fragment_limit)
      counter = 0
      for fragment in fragments_raw:
          result_fragments.append(fragment.replace("\n", " "))
          metadata.append({"document": index, "topic": row["label"]})
          result_ids.append(f"{index}_{row['label']}_{counter}")
          counter += 1
    return result_fragments, result_ids, metadata

In [9]:
data_dir = os.path.realpath("./dataset/raw")

In [10]:
fragments, ids, metadata = split_dataset(data_dir, fragment_limit=20)

In [11]:
database_l2 = DB("l2", "./dataset/raw/DB")
database_ip = DB("ip", "./dataset/raw/DB")
database_cosine = DB("cosine", "./dataset/raw/DB")

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
database_l2.clear()
database_l2.add({"fragments": fragments, "metadata": metadata, "ids": ids})

In [None]:
database_ip.clear()
database_ip.add({"fragments": fragments, "metadata": metadata, "ids": ids})

In [None]:
database_cosine.clear()
database_cosine.add({"fragments": fragments, "metadata": metadata, "ids": ids})

In [None]:
database_l2.query("What Iraq problem is?", 5)

{'ids': [['6349_1_0', '32638_1_0', '60919_3_0', '96503_1_0', '32887_1_0']],
 'distances': [[6.023730278015137,
   6.300642967224121,
   6.8550286293029785,
   7.1842851638793945,
   7.284687042236328]],
 'metadatas': [[{'document': 6349, 'topic': 1},
   {'document': 32638, 'topic': 1},
   {'document': 60919, 'topic': 3},
   {'document': 96503, 'topic': 1},
   {'document': 32887, 'topic': 1}]],
 'embeddings': None,
 'documents': [['Cleaning up Washington #39;s  #39;Iraqi problem #39;: invest authority in &lt;b&gt;...&lt;/b&gt;. Is there a  quot;Sunni problem quot; in Iraq, as the United States would like us to believe? ',
   'Also from this section. News reports on Iraq paint a picture of a country trapped in a cycle of violence and destruction. ',
   'From Economic Power to Economic Powder. JEDDAH, 6 December 2004 - The current situation with Iraq has all the ingredients of a long-term liability to the balance sheet of America, which some day it might be a disaster to the state of worl

In [None]:
database_ip.query("What Iraq problem is?", 5)

{'ids': [['32638_1_0', '6349_1_0', '60919_3_0', '24894_1_0', '52166_1_0']],
 'distances': [[-5.541466236114502,
   -4.811572074890137,
   -4.536117076873779,
   -4.471153259277344,
   -4.452374458312988]],
 'metadatas': [[{'document': 32638, 'topic': 1},
   {'document': 6349, 'topic': 1},
   {'document': 60919, 'topic': 3},
   {'document': 24894, 'topic': 1},
   {'document': 52166, 'topic': 1}]],
 'embeddings': None,
 'documents': [['Also from this section. News reports on Iraq paint a picture of a country trapped in a cycle of violence and destruction. ',
   'Cleaning up Washington #39;s  #39;Iraqi problem #39;: invest authority in &lt;b&gt;...&lt;/b&gt;. Is there a  quot;Sunni problem quot; in Iraq, as the United States would like us to believe? ',
   'From Economic Power to Economic Powder. JEDDAH, 6 December 2004 - The current situation with Iraq has all the ingredients of a long-term liability to the balance sheet of America, which some day it might be a disaster to the state of w

In [None]:
database_cosine.query("What Iraq problem is?", 5)

{'ids': [['32638_1_0', '6349_1_0', '60919_3_0', '96503_1_0', '32887_1_0']],
 'distances': [[0.32206976413726807,
   0.3275832533836365,
   0.37182891368865967,
   0.3911219835281372,
   0.39623111486434937]],
 'metadatas': [[{'document': 32638, 'topic': 1},
   {'document': 6349, 'topic': 1},
   {'document': 60919, 'topic': 3},
   {'document': 96503, 'topic': 1},
   {'document': 32887, 'topic': 1}]],
 'embeddings': None,
 'documents': [['Also from this section. News reports on Iraq paint a picture of a country trapped in a cycle of violence and destruction. ',
   'Cleaning up Washington #39;s  #39;Iraqi problem #39;: invest authority in &lt;b&gt;...&lt;/b&gt;. Is there a  quot;Sunni problem quot; in Iraq, as the United States would like us to believe? ',
   'From Economic Power to Economic Powder. JEDDAH, 6 December 2004 - The current situation with Iraq has all the ingredients of a long-term liability to the balance sheet of America, which some day it might be a disaster to the state o

Работа мульти-языковой модели

In [None]:
database_l2.query("В чем проблема Ирака?", 5)

{'ids': [['6349_1_0', '32638_1_0', '60919_3_0', '36322_1_0', '36975_1_0']],
 'distances': [[4.579774856567383,
   5.624168872833252,
   5.922056198120117,
   6.118265151977539,
   6.1514811515808105]],
 'metadatas': [[{'document': 6349, 'topic': 1},
   {'document': 32638, 'topic': 1},
   {'document': 60919, 'topic': 3},
   {'document': 36322, 'topic': 1},
   {'document': 36975, 'topic': 1}]],
 'embeddings': None,
 'documents': [['Cleaning up Washington #39;s  #39;Iraqi problem #39;: invest authority in &lt;b&gt;...&lt;/b&gt;. Is there a  quot;Sunni problem quot; in Iraq, as the United States would like us to believe? ',
   'Also from this section. News reports on Iraq paint a picture of a country trapped in a cycle of violence and destruction. ',
   'From Economic Power to Economic Powder. JEDDAH, 6 December 2004 - The current situation with Iraq has all the ingredients of a long-term liability to the balance sheet of America, which some day it might be a disaster to the state of world