In [3]:
from pdfUtils.pdf_utilities import extract_pdf, clean_text

In [4]:
# Location of the PDF to index, given relative to the notebook's working directory.
file_path = './Rome.pdf'

In [6]:
# Read the PDF through the project helper; the result is fed to the text splitters as `text`.
# NOTE(review): `clean_text` is imported next to `extract_pdf` but is never applied in the
# cells shown here — confirm whether the extracted text should be cleaned first.
text = extract_pdf(file_path)


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

In [8]:
# Coarse, character-based splitter. Chunks are capped at 500 characters and neighbouring
# chunks share 100 characters. Separators are listed from most to least preferred:
# blank line, newline, sentence end, single space, and finally the empty string.
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""],
)

In [9]:
# First pass: cut the whole document into overlapping character-based chunks.
character_split_texts = character_splitter.split_text(text)

In [10]:
# Token-based splitter used for the second pass. `model_name` is pinned to the checkpoint
# that embeds the chunks later in this notebook, so the 256-token budget is measured with
# the same tokenizer the embedding model uses. Without it the splitter falls back to its
# own library default checkpoint, whose tokenizer may count tokens differently.
sentence_splitter = SentenceTransformersTokenTextSplitter(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    tokens_per_chunk=256,
    chunk_overlap=0
)

  from tqdm.autonotebook import tqdm, trange







In [11]:
# Second pass: re-split every character chunk by token count and flatten the results into
# one list of strings. A comprehension is used so that no loop variable leaks into the
# notebook namespace; a plain `for text in ...` loop would rebind `text` and silently
# replace the full document text with the last chunk.
token_split_texts = [
    token_chunk
    for character_chunk in character_split_texts
    for token_chunk in sentence_splitter.split_text(character_chunk)
]

In [12]:
# Sanity check: the first token-level chunk should be a plain string.
print(type(token_split_texts[0]))

<class 'str'>


In [13]:
# Sample question; it is embedded further down to smoke-test the embedding function.
query = "In which country is Rome located?"

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForQuestionAnswering

import torch

# Load the pre-trained sentence-embedding checkpoint and its tokenizer.
# Both are created from the same `model_name`, so token ids match the model's vocabulary.
# `generate_embeddings` reads these two module-level objects.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def generate_embeddings(chunks, batch_size=32):
    """Embed one or more texts as L2-normalized sentence vectors.

    Uses the module-level ``tokenizer`` and ``model``. Each text is encoded, run through
    the model without gradient tracking, and reduced to a single vector by averaging the
    token states of its *real* tokens only; padding positions are excluded through the
    attention mask. The vectors are then scaled to unit length so that an inner-product
    search over them ranks by cosine similarity.

    Args:
        chunks: A list (or other iterable) of strings, or a single string.
        batch_size: Maximum number of texts per forward pass. Keeps memory bounded when
            embedding a whole document. Must be at least 1.

    Returns:
        A NumPy array with one row per input text, in input order.

    Raises:
        ValueError: If ``chunks`` is empty or ``batch_size`` is smaller than 1.
    """
    # A bare string would otherwise be sliced character by character below.
    texts = [chunks] if isinstance(chunks, str) else list(chunks)
    if not texts:
        raise ValueError("generate_embeddings() needs at least one text to embed")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    pooled_batches = []
    for offset in range(0, len(texts), batch_size):
        batch = texts[offset:offset + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**inputs)

        token_states = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension so padded positions contribute 0.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        real_token_counts = mask.sum(dim=1).clamp(min=1e-9)
        pooled = summed / real_token_counts

        pooled_batches.append(torch.nn.functional.normalize(pooled, p=2, dim=1))

    return torch.cat(pooled_batches, dim=0).numpy()

In [15]:
import faiss

# Width of the vectors stored in the index; it must equal the number of columns in the
# arrays passed to `index.add` / `index.search`.
embedding_dim = 384
# Flat (exhaustive) inner-product index. Inner-product scores equal cosine similarity only
# when the vectors added and queried are L2-normalized.
index = faiss.IndexFlatIP(embedding_dim)

In [16]:
# `chunks` is the list that search results are mapped back into (see `get_best_match`),
# so its order must stay aligned with the rows added to the index.
chunks = token_split_texts

In [17]:
# Embed every chunk; row i of `embeddings` is the vector for chunks[i].
embeddings = generate_embeddings(chunks)

In [18]:
# Store the chunk vectors in the index.
# NOTE(review): `add` appends to whatever the index already holds, so re-running this cell
# without re-creating `index` stores the same vectors twice and shifts row ids out of
# alignment with `chunks`.
index.add(embeddings)

In [19]:
# Padding is required because generate_embeddings pads its batches. Register a padding
# token only when the tokenizer truly lacks one: the chunk embeddings above were already
# produced with padding enabled, so an unconditional call here would be redundant and
# would mutate the shared tokenizer after the index has been built.
# NOTE(review): if this branch ever adds a brand-new token, the model's embedding table
# would presumably need to be resized to match — confirm before relying on it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Generate embeddings for the query
query_embeddings = generate_embeddings([query])


In [20]:
def get_best_match(query):
    """Return the stored chunk whose embedding is closest to ``query``.

    The query is embedded with ``generate_embeddings`` and looked up in the module-level
    faiss ``index``; the resulting row id is mapped back to text through ``chunks``.

    Args:
        query: The question or search phrase, as a string.

    Returns:
        The best-matching chunk from ``chunks``.

    Raises:
        ValueError: If the index returns no neighbour (for example because nothing has
            been added to it yet).
    """
    query_embeddings = generate_embeddings([query])
    _scores, neighbour_ids = index.search(query_embeddings, 1)
    best_id = int(neighbour_ids[0][0])

    # faiss reports "no result" as label -1. Used directly as a list index, that would
    # silently return the LAST chunk instead of signalling the problem.
    if best_id < 0:
        raise ValueError("index returned no match; has anything been added to it?")

    return chunks[best_id]

In [22]:
# Retrieval demo: rebind `query` to a new question and print the closest stored chunk.
query = "Which emperor made Christianity the official religion of the Roman Empire?"
print(get_best_match(query))

roman legal practices 5. religion and philosophy christianity ' s rise christianity emerged during the late roman republic and gained prominence under emperor constantine in the early 4th century ad when it was declared the state religion. the establishment of christianity significantly altered rome ' s cultural landscape. the construction of monumental churches like st. peter ' s basilica signified rome ' s new role as a center for christian worship 1 3. philosophical thought


In [1]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Load the SQuAD-fine-tuned BERT checkpoint and its tokenizer for extractive QA.
# `answer_question` reads these two module-level objects.
# NOTE(review): `tokenizer` and `model` are also the names used by the embedding cells;
# running both parts in one kernel would make `generate_embeddings` pick up these QA
# objects instead — consider distinct names.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

def answer_question(question, context):
    """Extract the span of ``context`` that best answers ``question``.

    Uses the module-level QA ``tokenizer`` and ``model``. The question/context pair is
    encoded together, the model scores every token as a possible answer start and end,
    and the tokens between the highest-scoring start and end are joined back into text.

    Args:
        question: The question to answer.
        context: The passage in which to look for the answer.

    Returns:
        The extracted answer text with surrounding whitespace removed, or an empty
        string when the best end position lies before the best start position.
    """
    # Truncate only the context (the second sequence) so the pair never exceeds the
    # number of positions the model supports; the question is always kept whole.
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        truncation='only_second',
        max_length=model.config.max_position_embeddings,
        return_tensors='pt',
    )

    # Pass every encoded field, not just ids and mask: the segment ids produced by the
    # tokenizer are what tell the model which tokens are question and which are context.
    with torch.no_grad():
        outputs = model(**inputs)

    # The batch holds a single pair, so the flattened argmax is a position in the sequence.
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))

    # Start and end are chosen independently and can cross; that means "no answer".
    if end_index < start_index:
        return ""

    token_ids = inputs['input_ids'][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    answer = tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])

    return answer.strip()

# Demo: run the extractive QA helper on a short hand-written passage.
question = "What is Hugging Face creating?"
context = "Hugging Face is creating a tool that democratizes AI. Their mission is to make AI accessible to everyone."

answer = answer_question(question, context)
print(f"Answer: {answer}")



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Answer: a tool that democratizes ai


In [4]:
# Example usage
context = "OpenAI is a research organization that focuses on artificial intelligence. It has developed several models, including GPT-3 and GPT-4."
question = "What models have OpenAI developed?"
# Argument order matters: answer_question takes the question first, then the context.
answer = answer_question(question, context)
print(f"Answer: {answer}")

Answer: gpt - 3 and gpt - 4
