# Building Robust Large Text Q&A Systems with BERT and ChatGPT 3.5 Turbo

In this notebook we create the knowledge base: we split the source text into chunks, compute BERT embeddings for them, and store the embeddings on disk.

In [2]:
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import numpy as np
import openai
import os

  from .autonotebook import tqdm as notebook_tqdm


# Extracting Text

For this example we take the text of the well-known book *Harry Potter and the Chamber of Secrets*.
You can find it in the `data` folder.

In [7]:
# Read the whole book into memory as a single UTF-8 string named `text`.
book_path = r'C:\Users\User\Desktop\text qna\data\Harry_Potter_and_the_Chamber_of_Secrets.txt'
with open(book_path, mode='r', encoding='utf-8') as book_file:
    text = book_file.read()

A. Making Sentences

In [8]:
def make_sentences(text):
    """Split ``text`` into sentence-like chunks on the '.' character.

    Each fragment is stripped of surrounding whitespace, and fragments that
    are empty or whitespace-only are dropped. A plain ``text.split('.')``
    always yields at least one such fragment (the piece after the final
    period), which would otherwise be embedded and stored as a "sentence".

    Args:
        text: The full text to split.

    Returns:
        A list of non-empty sentence strings, in their original order.
        An empty list if ``text`` contains no non-blank fragment.
    """
    # Only '.' is treated as a sentence boundary; '?' and '!' are not.
    stripped_pieces = (piece.strip() for piece in text.split('.'))
    return [piece for piece in stripped_pieces if piece]

B. Making Paragraphs

In [None]:
def make_paragraphs(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The full text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces).
        An empty list if ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the window step zero or negative.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    paragraphs = []
    for start in range(0, len(words), step):
        paragraphs.append(' '.join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would hold
        # only words already present in this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return paragraphs

C. Making Embeddings

In [None]:
def _embed_chunks(chunks, tokenizer, model, desc):
    """Return one mean-pooled BERT embedding (a list of floats) per chunk, in order."""
    embeddings = []
    for chunk in tqdm(chunks, desc=desc):
        # Inputs longer than 512 tokens are truncated; shorter ones are padded to 512.
        # NOTE(review): the mean below is taken over all 512 positions, including
        # padded ones. Left unchanged so the stored vectors stay comparable with
        # any code that embeds queries the same way - confirm before changing.
        encoded_input = tokenizer(chunk, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Average the token vectors over the sequence dimension, then drop the batch dimension.
        embedding = torch.mean(model_output.last_hidden_state, dim=1).squeeze().numpy()
        embeddings.append(embedding.tolist())
    return embeddings


def make_embeddings(text):
    """Embed ``text`` at sentence and paragraph granularity with BERT.

    The text is chunked with ``make_sentences`` and ``make_paragraphs``, and
    each chunk is embedded with the pre-trained ``bert-base-uncased`` model by
    averaging the last hidden state over the token dimension.

    Args:
        text: The full text to build the knowledge base from.

    Returns:
        A tuple ``(sentence_embedding_dict, paragraph_embedding_dict)``. Each
        dictionary maps a chunk string to its embedding as a list of floats.
        Chunks with identical text share a single dictionary entry.
    """
    # Making chunks of text
    sentences = make_sentences(text)
    paragraphs = make_paragraphs(text)

    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Making Sentence Embeddings
    sentence_embeddings = _embed_chunks(sentences, tokenizer, model, "Creating Sentence Embeddings")
    sentence_embedding_dict = dict(zip(sentences, sentence_embeddings))

    # Making Paragraph Embeddings
    paragraph_embeddings = _embed_chunks(paragraphs, tokenizer, model, "Creating Paragraph Embeddings")
    # Pair each paragraph with its own embedding (previously paired with the sentence embeddings).
    paragraph_embedding_dict = dict(zip(paragraphs, paragraph_embeddings))

    return sentence_embedding_dict, paragraph_embedding_dict

In [9]:
# Making embeddings for the loaded book text.
# Each returned dictionary is keyed by chunk text (sentence or paragraph) and
# holds a list of floats per key. This is the slow step: in the run recorded
# below, the sentence pass alone took over three hours.
sentence_embeddings, paragraphs_embeddings = make_embeddings(text)

Downloading model.safetensors: 100%|██████████| 440M/440M [02:27<00:00, 2.99MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Creating Sentence Embeddings: 100%|██████████| 6085/6085 [3:14:40<00:00,  1.92s/it]  
Creating Paragraph Embeddings: 100%|██████████| 200/200 [04:28<00:00,  1.34s/it]


D. Storing the embeddings to disk

In [11]:
import json

# Destination JSON file -> embedding dictionary to write to it.
embedding_outputs = {
    r'C:\Users\User\Desktop\text qna\embeddings\sentence_embeddings.json': sentence_embeddings,
    r'C:\Users\User\Desktop\text qna\embeddings\paragraph_embeddings.json': paragraphs_embeddings,
}

for output_path, embedding_dict in embedding_outputs.items():
    # Create the target folder first so that the long-running embedding results
    # are not lost to a FileNotFoundError when the folder does not exist yet.
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, "w") as json_file:
        json.dump(embedding_dict, json_file)