based on  https://youtu.be/inAY6M6UUkk?si=g7fm5aaTU1VWJTIa (modified to run locally, though)

In [2]:
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from sklearn.neighbors import NearestNeighbors
import json
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Tests (to peruse 👀)

## Load and split documents into chunks

In [105]:
# Folder holding the raw .txt documents. Resolved against the notebook's working directory
# (the same convention the "WHOLE LOOP" section uses) instead of one machine's absolute path.
savedir = Path.cwd() / "WarframeTextData"

load documents into langchain

In [12]:
# Read every file matching *.txt in `savedir` into a list of LangChain documents.
loader = DirectoryLoader(
    savedir,
    glob="*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()
len(documents)  # shown as the cell result

55

split documents into model-input-sized chunks

In [13]:
# Split on newlines, aiming for chunks of about 1000 characters with 100 characters of overlap.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)

document_chunks = text_splitter.split_documents(documents)

print(f"Number documents {len(documents)}")
print(f"Number chunks {len(document_chunks)}")

# Flatten each chunk into a single string carrying both its text and the file it came from.
document_chunks = [
    f"Context: {piece.page_content} Source: {piece.metadata['source']}"
    for piece in document_chunks
]

Created a chunk of size 1548, which is longer than the specified 1000
Created a chunk of size 1544, which is longer than the specified 1000
Created a chunk of size 1559, which is longer than the specified 1000
Created a chunk of size 1046, which is longer than the specified 1000
Created a chunk of size 1854, which is longer than the specified 1000
Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 1151, which is longer than the specified 1000


Number documents 55
Number chunks 3474


## Create embeddings for langchain chunks

In [83]:
# Folder that receives one .txt file per chunk. Resolved against the notebook's working
# directory rather than a machine-specific absolute path.
save_chunk_dir = Path.cwd() / "WarframeDataChunked"

In [15]:
# Create the embedding model with the library defaults (no model name is passed).
embeddings = HuggingFaceEmbeddings()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# Embed every chunk and persist two artefacts:
#   * <save_chunk_dir>/<row>.txt  - the chunk text
#   * embeddings.json             - one JSON object per line: {"id": "<row>.txt", "embedding": [...]}
# The loading cell below identifies a chunk by its line position in embeddings.json, so exactly
# one line is written per row, in row order (no row is ever skipped).
df = pd.DataFrame(document_chunks, columns=['text'])

chunk_dir = Path(save_chunk_dir)
chunk_dir.mkdir(parents=True, exist_ok=True)  # the output folder must exist before writing into it

index_embeddings = []

# tqdm takes its total from len(df["text"]); a hand-written total of len(df)-1 is one short
# and makes the progress bar overrun on the last row.
for index, text in enumerate(tqdm(df["text"], desc="Get embeddings and write documents")):
    embedding = embeddings.embed_query(text)

    doc_id = f"{index}.txt"
    embedding_dict = {
        "id": doc_id,
        # values are stored as strings; the loading cell converts them back with dtype=np.float32
        "embedding": [str(value) for value in embedding],
    }
    index_embeddings.append(json.dumps(embedding_dict) + "\n")

    with open(chunk_dir / doc_id, "w") as document:
        document.write(text)


with open("embeddings.json", "w") as f:
    f.writelines(index_embeddings)

Get embeddings and write documents: 3474it [09:58,  5.80it/s]                          


## index embeddings

load embeddings as numpy array

In [18]:
# Location of the embeddings file. The embedding cell writes "embeddings.json" relative to the
# working directory, so it is resolved the same way here instead of via an absolute user path.
embeddings_json = Path.cwd() / "embeddings.json"

In [67]:
# Open the embeddings file for reading (the embedding cell writes one JSON object per line).
file = open(embeddings_json, 'r')

In [68]:
# Read a single line from the open handle; every readline() call moves the handle one line further.
line = file.readline()

In [56]:
# Load every stored embedding into one float32 matrix, one row per line of the file.
# Row i must correspond to chunk file "<i>.txt", so the file is read from its first line using
# a handle opened (and closed) right here. Reading from a handle shared with earlier cells
# would silently skip whatever lines those cells had already consumed.
full_array = []
with open(embeddings_json, 'r') as embeddings_file:
    for line in embeddings_file:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        full_array.append(json.loads(line)['embedding'])
embeddings_array = np.array(full_array, dtype=np.float32)
print(embeddings_array.shape)

(3474, 768)


create nearest neighbors graph

In [58]:
# Index the chunk embeddings; each query returns the 8 closest rows.
# No metric is passed, so scikit-learn's default distance is used.
nbrs = NearestNeighbors(n_neighbors=8)
nbrs.fit(embeddings_array)

run similarity search

In [59]:
# Sample query used to exercise the retrieval step.
question = "What are Rhino's four abilities?"

In [60]:
# Embed the question with the same `embeddings` object that embedded the chunks.
embedding = embeddings.embed_query(question)

In [65]:
# Query the index with a one-element batch; `indices` holds row numbers of the fitted array.
distances, indices = nbrs.kneighbors([embedding])

In [66]:
# Show the raw distances next to the matching row indices.
print(distances, indices)

[[0.81408345 0.84259021 0.84714009 0.92791968 0.9284121  0.93362898
  0.94002151 0.95236558]] [[1646 1641 1642 1690 1693 1692 1691 1655]]


## generate text from prompt based on search result/context/link

In [None]:
# Load the tokenizer and the text-generation model for the "google/flan-t5-large" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

In [75]:
# Use the best match from the similarity search as context. The chunk is read from the folder
# the embedding cell wrote to (`save_chunk_dir`), not from a hard-coded file in another folder,
# and the handle is closed as soon as the text is in memory.
best_chunk_path = Path(save_chunk_dir) / f"{indices[0][0]}.txt"
with open(best_chunk_path, 'r') as file:
    context = file.read()
question = "what abilities does Rhino have?"

In [76]:
# Build the instruction prompt: the retrieved chunk text is substituted for {context} and the
# user's question for {question}. The text inside the triple quotes is sent to the model verbatim.
prompt=f"""
Follow exactly those 3 steps:
1. Read the context below and aggregrate this data
Context : {context}
2. Answer the question using only this context
3. Show the source for your answers
User Question: {question}


If you don't have any context and are unsure of the answer, reply that you don't know about this topic.
"""

In [77]:
# Tokenize the prompt and keep only the input ids, returned as a PyTorch tensor ("pt").
model_input = tokenizer(prompt, return_tensors="pt").input_ids


In [79]:
# Tokenize the prompt, let the model generate at most 500 new tokens, then decode the first
# (only) sequence back to text. Special tokens are not stripped by this decode call.
model_input = tokenizer(prompt, return_tensors="pt").input_ids
model_output = model.generate(model_input, max_new_tokens=500)
print(tokenizer.decode(model_output[0]))

<pad> 1st Ability Rhino Charge 2nd Ability Iron Skin 3rd Ability Roar 4th Ability Rhino Stomp</s>


# WHOLE LOOP FOR DOCUMENT RETRIEVAL!

## First Time Ever

In [2]:
from get_fandom_data import run_download_warframe_data # check out ./get_fandom_data.py

In [25]:
# Folder for the downloaded raw text files, relative to the notebook's working directory.
warframe_text_folder = Path.cwd() / "WarframeTextData"

In [6]:
# One-off step; the helper comes from ./get_fandom_data.py and is given the target folder.
run_download_warframe_data(warframe_text_folder) # download warframe data from pages on https://warframe.fandom.com/

100%|██████████| 55/55 [02:55<00:00,  3.19s/it]


In [26]:
# Read every file matching *.txt in the download folder into a list of LangChain documents.
loader = DirectoryLoader(
    warframe_text_folder,
    glob="*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()
print(f"number of warframe documents found: {len(documents)}")

number of warframe documents found: 55


In [27]:
# Split on newlines, aiming for chunks of about 1000 characters with 100 characters of overlap.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)

document_chunks = text_splitter.split_documents(documents)

print(f"{len(documents)} documents were split into {len(document_chunks)} chunks")

# Flatten each chunk into a single string carrying both its text and the file it came from.
document_chunks = [
    f"Context: {piece.page_content} Source: {piece.metadata['source']}"
    for piece in document_chunks
]

Created a chunk of size 1548, which is longer than the specified 1000
Created a chunk of size 1544, which is longer than the specified 1000
Created a chunk of size 1559, which is longer than the specified 1000
Created a chunk of size 1046, which is longer than the specified 1000
Created a chunk of size 1854, which is longer than the specified 1000
Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 1151, which is longer than the specified 1000


55 documents were split into 3475 chunks


In [35]:
# Embedding model with library defaults; it turns each text into a numeric vector for comparison.
embeddings = HuggingFaceEmbeddings() # load huggingface text embeddings (transform documents to numbers for later comparison)

In [32]:
# Set up the chunks folder. mkdir(..., exist_ok=True) is already a no-op when the directory
# exists, so no separate existence check is needed. If a regular file occupies the path,
# the call raises here instead of the problem surfacing later when chunks are written.
warframe_text_chunks_folder = Path.cwd() / "WarframeSplitData"
warframe_text_chunks_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# embedding all document text chunks into huggingface-embedded documents (text at ./WarframeSplitData, embeddings at ./embeddings.json)
# embeddings.json holds one JSON object per line: {"id": "<row>.txt", "embedding": [...]}.
# The "Every Time" loader identifies a chunk by its line position in that file, so exactly one
# line is written per row, in row order (no row is ever skipped).
df = pd.DataFrame(document_chunks, columns=['text'])
index_embeddings = []

# tqdm takes its total from len(df["text"]); a hand-written total of len(df)-1 is one short.
# The progress bar is the only progress output: a print per row would flood the cell.
for index, text in enumerate(tqdm(df["text"], desc="Get embeddings and write documents")):
    embedding = embeddings.embed_query(text)

    doc_id = f"{index}.txt"
    embedding_dict = {
        "id": doc_id,
        # values are stored as strings; the loader converts them back with dtype=np.float32
        "embedding": [str(value) for value in embedding],
    }
    index_embeddings.append(json.dumps(embedding_dict) + "\n")

    with open(Path(warframe_text_chunks_folder) / doc_id, "w") as document:
        document.write(text)


with open("embeddings.json", "w") as f:
    f.writelines(index_embeddings)

## Every Time

### Run Once

In [3]:
embeddings = HuggingFaceEmbeddings() # embedding model used to embed each question for the similarity search (the text generator is loaded separately)

In [4]:
embeddings_json = Path.cwd() / "embeddings.json" # getting embedded data from previous steps

# loading embeddings into memory -> numpy array
# Row i corresponds to line i of the file, which is how neighbor indices are later mapped to
# "<i>.txt" chunk files. The `with` block closes the handle once the file has been read.
full_array = []
with open(embeddings_json) as embeddings_file:
    for line in embeddings_file:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        full_array.append(json.loads(line)['embedding'])
embeddings_array = np.array(full_array, dtype=np.float32)
print(embeddings_array.shape) # verifying correct shape (should be [N (number of chunks) X 768 (number of embedding dimensions)])

(3474, 768)


In [32]:
# creating k nearest neighbors object
# `n_neighbors` is kept as a variable because the query cell also reads it when deciding how
# many chunks to use. No metric is passed, so scikit-learn's default distance is used.
n_neighbors = 8
nbrs = NearestNeighbors(n_neighbors=n_neighbors)
nbrs.fit(embeddings_array)

In [6]:
# Both objects are loaded from the same "google/flan-t5-large" checkpoint name.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large") # loading tokenizer for flan
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large") # loading flan text generation model

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
# important paths used later
warframe_text_chunks_folder = Path.cwd() / "WarframeSplitData"
warframe_text_folder = Path.cwd() / "WarframeTextData"
# Parse SOURCES.json into `sources_dict`. The handle gets its own name and is closed right
# after parsing, rather than staying open under the name later reused for the parsed data.
# NOTE(review): presumably maps document names to their web links - confirm against get_fandom_data.py.
with open(warframe_text_folder / "SOURCES.json") as sources_file:
    sources_dict = json.load(sources_file)

### Run Each Query

In [33]:
amount_of_context = 3 # number of relevant documents to use to answer question
question = "What are all of Excalibur's abilities?" # question input to model


embedding = embeddings.embed_query(question) # embed question to latent space using huggingface embeddings
distances, indices = nbrs.kneighbors([embedding]) # get the n_neighbors most similar chunks for the single query

# never use more chunks than were requested or than the search returned
context_count = min(amount_of_context, len(indices[0]), n_neighbors)

context = ""
for i in range(context_count): # iterate over each found document
    chunk_path = warframe_text_chunks_folder / f"{indices[0][i]}.txt"
    context += chunk_path.read_text() # read_text opens, reads and closes the file (no leaked handles)

prompt=f"""
Follow exactly those 3 steps:
1. Read the context below and aggregrate this data
Context : {context}
2. Answer the question using only this context
3. Show the source for your answers
User Question: {question}

If you don't have any context and are unsure of the answer, reply that you don't know about this topic.
"""

model_input = tokenizer(prompt, return_tensors="pt").input_ids # tokenizing prompt for model
model_output = model.generate(model_input, min_length=100, max_length=2000) # generating response
text_output = tokenizer.decode(model_output[0]) # decoding response

print(text_output) # printing results

## attempt at replacing local file path with weblink
# source_start = text_output.find("Source: ")
# source_end = text_output.find(".txt</s>")
# web_source = sources_dict[text_output[source_start+8:source_end-4]]
# print(text_output[:source_start] + web_source)

<pad> Slash Dash Radial Blind Radial Javelin Exalted Blade Passive Excalibur deals 10% increased damage and attacks 10% faster when wielding swords. Abilities 1st Ability Slash Dash 2nd Ability Radial Blind 3rd Ability Radial Javelin 4th Ability Exalted Blade General Information Sex Male Mastery Rank 0 Max Rank 30 Health 270 (370 at Rank 30) Shields 270 (370 at Rank 30) Armor 240 Energy 100 (150 at Rank 30) Starting Energy 50 Sprint Speed 1 Aura Polarity Exilus Polarity None Polarities Introduced Vanilla (2012-10-25) Themes Swordsman Progenitor Element Electricity Subsumed Ability Radial Blind Tactical Ability Radial Blind</s>


TODO
- documentation
- refactor the repeated cells into reusable functions
- force it to print "context" or upgrade language model to better perform?
- reformat input text?
- requirements.txt