In [1]:
!pip install sentence_transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install chromadb


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from sentence_transformers import SentenceTransformer
import chromadb


class EmbeddingFunction:
    def __init__(self):
        self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    def __call__(self, input):
        return self.model.encode(input).tolist()


class DB:

    def __init__(self, distance_function, root_path):
        self.ef = EmbeddingFunction()
        self.client = chromadb.PersistentClient(path=root_path)
        self.distance_function = distance_function
        assert distance_function in ["l2", "ip", "cosine"], "Distance function should be 'l2' or 'ip' or 'cosine'"
        self.collection = self.client.get_or_create_collection("lab5_" + self.distance_function,
                                                               metadata={"hnsw:space": self.distance_function},
                                                               embedding_function=self.ef)

    def add(self, items):
        old_batch = 0
        new_batch = 1000
        while True:
            if new_batch > len(items["fragments"]):
                break
            self.collection.add(
                documents=items["fragments"][old_batch:new_batch],
                metadatas=items["metadata"][old_batch:new_batch],
                ids=items["ids"][old_batch:new_batch])
            old_batch = new_batch
            new_batch += 1000
        self.collection.add(
            documents=items["fragments"][old_batch:],
            metadatas=items["metadata"][old_batch:],
            ids=items["ids"][old_batch:])

    def query(self, query, n_results):
        return self.collection.query(query_embeddings=self.ef(query), n_results=n_results)

    def clear(self):
        self.client.delete_collection("lab5_" + self.distance_function)
        self.collection = self.client.get_or_create_collection("lab5_" + self.distance_function,
                                                               metadata={"hnsw:space": self.distance_function},
                                                               embedding_function=self.ef)

In [4]:
from common.utils import split_to_sentences

def split_document(lines, fragment_limit=100):
    sentences = split_to_sentences(lines)
    result = []
    fragment = ""
    length = 0
    for s in sentences:
        fragment += s + " "
        length += len(s.split(" "))
        if length > fragment_limit:
            result.append(fragment)
            fragment = ""
            length = 0
    return result

In [5]:
import os
import pandas as pd

def split_dataset(dataset_path, fragment_limit=100):
    result_fragments = []
    metadata = []
    result_ids = []
    t = "train.csv"
    filepath = os.path.join(dataset_path, t)
    df = pd.read_csv(filepath, names=['label', 'Title', 'Description'])
    df['text'] = (df['Title'] + '. ' + df['Description'])
    df.drop(columns=['Title', 'Description'], axis=1, inplace=True)
    for index, row in df.iterrows():
      fragments_raw = split_document(row['text'], fragment_limit)
      counter = 0
      for fragment in fragments_raw:
          result_fragments.append(fragment.replace("\n", " "))
          metadata.append({"document": index, "topic": row["label"]})
          result_ids.append(f"{index}_{row['label']}_{counter}")
          counter += 1
    return result_fragments, result_ids, metadata

In [8]:
data_dir = os.path.realpath("../dataset/raw")

In [9]:
fragments, ids, metadata = split_dataset(data_dir, fragment_limit=20)

In [14]:
database_l2 = DB("l2", "./db")
database_ip = DB("ip", "./db")
database_cosine = DB("cosine", "./db")

In [15]:
database_l2.clear()
database_l2.add({"fragments": fragments, "metadata": metadata, "ids": ids})

In [16]:
database_ip.clear()
database_ip.add({"fragments": fragments, "metadata": metadata, "ids": ids})

In [17]:
database_cosine.clear()
database_cosine.add({"fragments": fragments, "metadata": metadata, "ids": ids})

In [24]:
database_l2.query("Who loves Christianity?", 5)

{'ids': [['16355_3_0', '5290_4_0', '48807_1_0', '9635_1_0', '48568_1_0']],
 'distances': [[10.19619083404541,
   11.246757507324219,
   11.313432693481445,
   11.380290985107422,
   11.522608757019043]],
 'metadatas': [[{'document': 16355, 'topic': 3},
   {'document': 5290, 'topic': 4},
   {'document': 48807, 'topic': 1},
   {'document': 9635, 'topic': 1},
   {'document': 48568, 'topic': 1}]],
 'embeddings': None,
 'documents': [["Holy DVD!. &lt;i&gt;The Passion of the Christ&lt;/i&gt; is a hot DVD, even if the film studios were cast as doubting Thomas' understudies. ",
   "Constantine and the rise of Christianity. The history of how Christianity became an accepted mainstream religion is an interesting one. If you have never heard the story of the Roman Emperor Constantine and his effect on the world's current religious landscape, read on to learn how one mans actions during his rise to power changed the world forever.       ",
   'Pope puts 5 faithful on path to sainthood. VATICAN CIT

In [25]:
database_ip.query("Who loves Christianity?", 5)

{'ids': [['63915_1_0', '13511_1_0', '16355_3_0', '49570_1_0', '9635_1_0']],
 'distances': [[-2.834643840789795,
   -2.7705090045928955,
   -2.7313406467437744,
   -2.6550183296203613,
   -2.5799100399017334]],
 'metadatas': [[{'document': 63915, 'topic': 1},
   {'document': 13511, 'topic': 1},
   {'document': 16355, 'topic': 3},
   {'document': 49570, 'topic': 1},
   {'document': 9635, 'topic': 1}]],
 'embeddings': None,
 'documents': [['Anglican leaders welcome report. A report criticising US Anglicans for ordaining a gay bishop is welcomed by those who opposed his appointment. ',
   'Moscow faithful hail return of cherished icon. The religious faithful in Moscow are flocking to a Kremlin cathedral for a homecoming of sorts -- the return of a beloved icon, or holy picture. ',
   "Holy DVD!. &lt;i&gt;The Passion of the Christ&lt;/i&gt; is a hot DVD, even if the film studios were cast as doubting Thomas' understudies. ",
   'The passion of the beatification. The 19th century mystic nun 

In [26]:
database_cosine.query("Who loves Christianity?", 5)

{'ids': [['16355_3_0', '63915_1_0', '9635_1_0', '13511_1_0', '48222_1_0']],
 'distances': [[0.5703648924827576,
   0.6005172729492188,
   0.610811710357666,
   0.6144230365753174,
   0.6151124238967896]],
 'metadatas': [[{'document': 16355, 'topic': 3},
   {'document': 63915, 'topic': 1},
   {'document': 9635, 'topic': 1},
   {'document': 13511, 'topic': 1},
   {'document': 48222, 'topic': 1}]],
 'embeddings': None,
 'documents': [["Holy DVD!. &lt;i&gt;The Passion of the Christ&lt;/i&gt; is a hot DVD, even if the film studios were cast as doubting Thomas' understudies. ",
   'Anglican leaders welcome report. A report criticising US Anglicans for ordaining a gay bishop is welcomed by those who opposed his appointment. ',
   "Churches 'snap up Passion DVDs'. Makers of The Passion of the Christ are tempting US churches with bulk orders, says  The New York Times. ",
   'Moscow faithful hail return of cherished icon. The religious faithful in Moscow are flocking to a Kremlin cathedral for a

In [27]:
database_l2.query("Кто любит христианство?", 5)

{'ids': [['16355_3_0', '5290_4_0', '48807_1_0', '9635_1_0', '63915_1_0']],
 'distances': [[8.51008415222168,
   9.104658126831055,
   9.421624183654785,
   9.546403884887695,
   9.55567455291748]],
 'metadatas': [[{'document': 16355, 'topic': 3},
   {'document': 5290, 'topic': 4},
   {'document': 48807, 'topic': 1},
   {'document': 9635, 'topic': 1},
   {'document': 63915, 'topic': 1}]],
 'embeddings': None,
 'documents': [["Holy DVD!. &lt;i&gt;The Passion of the Christ&lt;/i&gt; is a hot DVD, even if the film studios were cast as doubting Thomas' understudies. ",
   "Constantine and the rise of Christianity. The history of how Christianity became an accepted mainstream religion is an interesting one. If you have never heard the story of the Roman Emperor Constantine and his effect on the world's current religious landscape, read on to learn how one mans actions during his rise to power changed the world forever.       ",
   'Pope puts 5 faithful on path to sainthood. VATICAN CITYPope 

In [28]:
database_ip.query("Кто любит христианство?", 5)

{'ids': [['63915_1_0', '13511_1_0', '16355_3_0', '48222_1_0', '49570_1_0']],
 'distances': [[-2.661653518676758,
   -2.3969850540161133,
   -2.3824825286865234,
   -2.351529359817505,
   -2.3380744457244873]],
 'metadatas': [[{'document': 63915, 'topic': 1},
   {'document': 13511, 'topic': 1},
   {'document': 16355, 'topic': 3},
   {'document': 48222, 'topic': 1},
   {'document': 49570, 'topic': 1}]],
 'embeddings': None,
 'documents': [['Anglican leaders welcome report. A report criticising US Anglicans for ordaining a gay bishop is welcomed by those who opposed his appointment. ',
   'Moscow faithful hail return of cherished icon. The religious faithful in Moscow are flocking to a Kremlin cathedral for a homecoming of sorts -- the return of a beloved icon, or holy picture. ',
   "Holy DVD!. &lt;i&gt;The Passion of the Christ&lt;/i&gt; is a hot DVD, even if the film studios were cast as doubting Thomas' understudies. ",
   'Sermon urges unity for island.  quot;BE not dismayed, quot; s

In [29]:
database_cosine.query("Кто любит христианство?", 5)

{'ids': [['16355_3_0', '63915_1_0', '48222_1_0', '9635_1_0', '13511_1_0']],
 'distances': [[0.5565173625946045,
   0.5656334161758423,
   0.5740344524383545,
   0.5908731818199158,
   0.6044411063194275]],
 'metadatas': [[{'document': 16355, 'topic': 3},
   {'document': 63915, 'topic': 1},
   {'document': 48222, 'topic': 1},
   {'document': 9635, 'topic': 1},
   {'document': 13511, 'topic': 1}]],
 'embeddings': None,
 'documents': [["Holy DVD!. &lt;i&gt;The Passion of the Christ&lt;/i&gt; is a hot DVD, even if the film studios were cast as doubting Thomas' understudies. ",
   'Anglican leaders welcome report. A report criticising US Anglicans for ordaining a gay bishop is welcomed by those who opposed his appointment. ',
   'Sermon urges unity for island.  quot;BE not dismayed, quot; sang the Pitcairn Islanders, all wearing their best clothes, in a white, wooden church high above Bounty Bay. ',
   "Churches 'snap up Passion DVDs'. Makers of The Passion of the Christ are tempting US chu