In [17]:
import json
import re

import numpy as np
import torch

import nltk
from nltk.tokenize import sent_tokenize

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

import chromadb

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
# Checkpoint identifier passed to from_pretrained. It has its own name so that
# `embedding_model` only ever refers to the loaded model object, never to a string.
EMBEDDING_MODEL_NAME = "facebook/contriever"
embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [28]:
# Prefer GPU index 1 (the device this notebook was written against), but fall back
# to the first GPU or the CPU so the cell does not crash on smaller machines.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
embedding_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [22]:
import torch

In [29]:
# Mean pooling
def mean_pooling(token_embeddings, mask):
    """Average token embeddings along dim 1, ignoring positions where `mask` is zero.

    Args:
        token_embeddings: Tensor reduced along dim 1; presumably laid out as
            (batch, sequence, hidden) -- the callers pass the model's first output.
        mask: Tensor with one entry per token position (callers pass the
            tokenizer's ``attention_mask``). Zero entries are excluded.

    Returns:
        Tensor with dim 1 reduced away: the sum of the kept token vectors divided
        by ``mask.sum(dim=1)``. Rows whose mask sums to zero yield zeros.
    """
    keep = mask[..., None].bool()
    token_embeddings = token_embeddings.masked_fill(~keep, 0.)
    token_counts = mask.sum(dim=1)[..., None]
    # A row whose mask sums to zero would compute 0 / 0 = NaN. Its summed
    # embedding is already all zeros, so dividing by 1 returns a clean zero vector.
    token_counts = token_counts.masked_fill(token_counts == 0, 1)
    sentence_embeddings = token_embeddings.sum(dim=1) / token_counts
    return sentence_embeddings

In [30]:
# Smoke test: embed a single sentence end to end.
text = "My name is Ashwin."
# Send the tokenized batch to whichever device currently holds the model rather than
# to a hard-coded GPU index, so inputs and weights always live on the same device.
inputs = embedding_tokenizer([text], padding=True, truncation=True, return_tensors="pt").to(embedding_model.device)
with torch.no_grad():
    outputs = embedding_model(**inputs)
embeddings = mean_pooling(outputs[0], inputs['attention_mask']).cpu().detach().numpy()
# Move the model back to the CPU; the returned module is this cell's displayed output.
embedding_model.to("cpu")

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [34]:
# Drop size-1 axes (the batch above holds a single sentence) and display the
# embedding as a plain Python list.
# NOTE(review): this prints every component; slice first if a preview is enough.
embeddings.squeeze().tolist()

[0.008037235587835312,
 -0.0846322774887085,
 -0.01108681969344616,
 -0.015425469726324081,
 0.08720726519823074,
 0.012152932584285736,
 0.019691146910190582,
 -0.06804359704256058,
 0.05082300305366516,
 0.046522244811058044,
 -0.00964499544352293,
 -0.05573597550392151,
 -0.05264371633529663,
 -0.004860157147049904,
 -0.04033152014017105,
 0.01992078125476837,
 -0.06293031573295593,
 -0.08614285290241241,
 -0.04751504957675934,
 -0.03737380728125572,
 -0.002864319831132889,
 0.051561832427978516,
 -0.025599537417292595,
 -0.0011596139520406723,
 -0.04967088997364044,
 0.060287389904260635,
 -0.03348660469055176,
 0.048760365694761276,
 0.01405520923435688,
 -0.10338598489761353,
 -0.03138543665409088,
 -0.02268635854125023,
 0.021454308182001114,
 -0.02400192990899086,
 0.05053939297795296,
 -0.02882087416946888,
 0.01668699085712433,
 -0.04159032180905342,
 -0.017467493191361427,
 -0.10009855031967163,
 -0.007026635110378265,
 -0.04294290766119957,
 0.0015502250753343105,
 0.014490

In [35]:
def divide_to_chapter(text):
    """Split `text` into chapters at lines that begin with a chapter heading.

    A heading is the word "chapter" (any case) at the start of a line, followed by
    whitespace and a chapter number written as a word, digits, or a hyphenated
    compound ("CHAPTER ONE", "Chapter 12", "CHAPTER TWENTY-ONE").

    Args:
        text: Full text of the book.

    Returns:
        A list of strings, one per heading found, each formatted as
        ``"<heading>\\n<chapter body stripped of surrounding whitespace>"``.
        Anything before the first heading is discarded; an empty list is returned
        when no heading is found.
    """
    # One compiled pattern and one scan replace the former findall + split pair,
    # which duplicated the regex and walked the text twice.
    heading_re = re.compile(
        r'^chapter\s+[a-z0-9]+(?:-[a-z0-9]+)*',
        re.IGNORECASE | re.MULTILINE,
    )
    headings = list(heading_re.finditer(text))

    chapters = []
    for index, heading in enumerate(headings):
        # A chapter body runs from the end of its heading to the next heading,
        # or to the end of the text for the last chapter.
        if index + 1 < len(headings):
            body_end = headings[index + 1].start()
        else:
            body_end = len(text)
        body = text[heading.end():body_end]
        chapters.append(f"{heading.group(0)}\n{body.strip()}")
    return chapters

def read_text_file(file_path):
    """Return the entire contents of the file at `file_path`, decoded as UTF-8."""
    with open(file_path, encoding='utf-8') as source:
        return source.read()

In [36]:
# Read the book from disk and split it at its chapter headings.
# Note: `text` is rebound here; it previously held the sample sentence.
book_path = 'hp/harry_potter_1.txt'
text = read_text_file(book_path)
chapters = divide_to_chapter(text)

In [37]:
# Work on the first chapter returned by divide_to_chapter.
chapter = chapters[0]

In [38]:
# Remove the chapter heading and surrounding whitespace.
# The pattern matches a "CHAPTER <word>" line, the line after it, and the blank
# line that follows. With re.MULTILINE, `^` anchors at every line start and
# re.sub replaces every match, not only the one at the top of the chapter.
chapter = re.sub(r"^(CHAPTER \w+\n)(.*\n\n)", "", chapter, flags=re.MULTILINE)
chapter = chapter.strip()

In [41]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [51]:
# Configure the splitter. Lengths are measured with len(), so chunk_size and
# chunk_overlap are counted in characters. Separators are treated as literal
# strings, not regular expressions (is_separator_regex=False).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False
)

In [52]:
# Split the chapter text into chunks with the splitter configured above.
chunks = text_splitter.split_text(chapter)

In [53]:
# Number of chunks the splitter produced.
len(chunks)

35

In [54]:
# Display the chunks from the character-based splitter.
# NOTE(review): this dumps every chunk; a slice such as chunks[:3] keeps the output short.
chunks

['Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.',
 'The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for seve

In [39]:
# Build chunks from sentences: tokenize the chapter into sentences, then join
# consecutive, non-overlapping runs of `chunk_size` sentences with single spaces.
sentences = sent_tokenize(chapter)
chunk_size = 5
chunk_starts = range(0, len(sentences), chunk_size)
chunks = [" ".join(sentences[start : start + chunk_size]) for start in chunk_starts]

In [40]:
# Display the sentence-based chunks (this rebinding replaces the splitter's chunks).
# NOTE(review): this dumps every chunk; a slice such as chunks[:3] keeps the output short.
chunks

['Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.',
 'The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several

In [None]:
# get embeddings
# Tokenize all chunks as one padded, truncated batch and place it on the device that
# currently holds the model. An earlier cell moves the model to the CPU, so a
# hard-coded GPU index here would put inputs and weights on different devices.
inputs = embedding_tokenizer(chunks, padding=True, truncation=True, return_tensors="pt").to(embedding_model.device)

In [None]:
# Inference only: disable autograd so no graph is built for the whole batch,
# matching the single-sentence cell earlier in the notebook.
with torch.no_grad():
    outputs = embedding_model(**inputs)

In [None]:
# Mean pooling
# NOTE(review): this repeats the definition from earlier in the notebook and
# shadows it; keep a single copy when the notebook is cleaned up.
def mean_pooling(token_embeddings, mask):
    """Average token embeddings along dim 1, ignoring positions where `mask` is zero.

    Args:
        token_embeddings: Tensor reduced along dim 1; presumably laid out as
            (batch, sequence, hidden) -- the callers pass the model's first output.
        mask: Tensor with one entry per token position (callers pass the
            tokenizer's ``attention_mask``). Zero entries are excluded.

    Returns:
        Tensor with dim 1 reduced away: the sum of the kept token vectors divided
        by ``mask.sum(dim=1)``. Rows whose mask sums to zero yield zeros.
    """
    keep = mask[..., None].bool()
    token_embeddings = token_embeddings.masked_fill(~keep, 0.)
    token_counts = mask.sum(dim=1)[..., None]
    # A row whose mask sums to zero would compute 0 / 0 = NaN. Its summed
    # embedding is already all zeros, so dividing by 1 returns a clean zero vector.
    token_counts = token_counts.masked_fill(token_counts == 0, 1)
    sentence_embeddings = token_embeddings.sum(dim=1) / token_counts
    return sentence_embeddings

In [None]:
# Pool the model's first output with the attention mask, then move the result
# to the CPU, detach it from autograd and convert it to a NumPy array.
embeddings = mean_pooling(outputs[0], inputs['attention_mask']).cpu().detach().numpy()

In [None]:
# Shape of the pooled array -- presumably (number of chunks, hidden size); confirm on run.
embeddings.shape

In [None]:
# Open the persistent Chroma store under `directory` and fetch the "hp1" collection.
directory = 'hp_vdbs/hp'
client = chromadb.PersistentClient(path=directory)
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once a collection named "hp1" already exists in the store.
collection = client.get_or_create_collection("hp1")

In [None]:
# Chapter number; used below to build the record ids ("ch<chapter_id>_doc<n>").
chapter_id = 1

In [None]:
# Store every chunk together with its embedding. upsert (rather than add) lets the
# cell be re-run without tripping over ids that were already written.
collection.upsert(
    documents=chunks,
    embeddings=embeddings.tolist(),
    # Take the chapter number from chapter_id so metadata and ids cannot drift apart.
    # Each record gets its own dict instead of one dict object repeated n times.
    metadatas=[{"chapter": chapter_id} for _ in chunks],
    ids=[f"ch{chapter_id}_doc{position}" for position, _ in enumerate(chunks, start=1)],
)

In [None]:
# Number of records currently stored in the collection.
collection.count()

In [None]:
# Sanity check: query with the first chunk's own embedding and ask for the three
# nearest records. The filter uses chapter_id (not a literal 1) so it stays in
# step with the chapter that was just stored.
results = collection.query(
    query_embeddings=embeddings[0].tolist(),
    n_results=3,
    where={"chapter": chapter_id}
)

In [None]:
# Show only the "documents" field of the query result.
results['documents']

In [None]:
# Open a second client on the same directory and look up the existing "hp1"
# collection through it.
# NOTE(review): `c` / `cc` are terse; later cells depend on these names.
c = chromadb.PersistentClient(path=directory)
cc = c.get_collection("hp1")

In [None]:
# Repeat the nearest-neighbour query through the re-opened collection handle.
# The filter uses chapter_id (not a literal 1) so it follows the stored chapter.
results = cc.query(
    query_embeddings=embeddings[0].tolist(),
    n_results=3,
    where={"chapter": chapter_id}
)

In [None]:
# Display the full query result returned through the re-opened handle.
results

In [None]:
# Delete the "hp1" collection from the store. Handles obtained earlier
# (`collection`, `cc`) refer to it and should not be used afterwards.
c.delete_collection("hp1")

: 

In [12]:
import chromadb

In [13]:
# Open the persistent store again and fetch the existing "book" collection.
# get_collection does not create it; NOTE(review): nothing in this notebook
# creates "book", so it is presumably built elsewhere -- confirm.
directory = 'hp_vdbs/hp'
client = chromadb.PersistentClient(path=directory)
collection = client.get_collection("book")

In [14]:
# Number of records stored in the "book" collection.
collection.count()

1150