In [84]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from sentence_transformers import SentenceTransformer
from qdrant_client import models, QdrantClient
import os

path = "/Users/kge/Desktop/txts/"
dir_list = os.listdir(path)

bad_files = ["file258.txt", "file259.txt", "file460.txt", "file514.txt", 
             "file23.txt", "file840.txt", "file658.txt", "file763.txt", 
             "file617.txt", "file53.txt", "file373.txt", "file831.txt",
             "file628.txt", "file41.txt", "file412.txt", "file411.txt",
             "file43.txt", "file178.txt", "file434.txt", "file99.txt", 
             "file421.txt", "file233.txt", "file227.txt", "file582.txt", 
             "file220.txt", "file618.txt", "file156.txt", "file355.txt", 
             "file433.txt", "file157.txt", "file553.txt", "file155.txt", 
             "file395.txt", "file330.txt", "file333.txt", "file131.txt", 
             "file726.txt", "file484.txt"]

texts = []
for name in dir_list:
    if name[-4:] == ".txt":
        if name not in bad_files:
            loader = UnstructuredFileLoader(path + name)
            docs = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size = 500,
                chunk_overlap  = 20,
                length_function = len,
            )
            texts += text_splitter.split_documents(docs)

In [69]:
from langchain.embeddings import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
print("embeddings imported")

#we're going to use Qdrant for our vector database
url = "<http://localhost:6333>"
qdrant = Qdrant.from_documents(
    texts, embeddings, 
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="my_documents",
)
print("started Qdrant")

embeddings imported
started Qdrant
Query is:  What does TRACON mean?
query vectorized
similarity search done


In [73]:
#setting a test query
query = "What sort of descent techniques do regional and small jet pilots use that cause predicting their descent profiles to be difficult, and why?"
print("Query is: ", query)

#vectorize the query
query_result = embeddings.embed_query(query)
print("query vectorized")

#we do a similarity search on the documents and query, getting
#the scores and documents, then print the top one
found_docs = qdrant.similarity_search_with_score(query)
print("similarity search done")

Query is:  What sort of descent techniques do regional and small jet pilots use that cause predicting their descent profiles to be difficult, and why?
query vectorized
similarity search done


In [74]:
from sentence_transformers import CrossEncoder

print("\n----------------------")
print("Top 3 bi-encoder retrieval hits\n")
re_rank_input = []
for i in range(len(found_docs)):
    document, score = found_docs[i]
    print(document.page_content)
    print(document.metadata.values())
    print(f"\nScore: {score}\n")
    re_rank_input.append([query, document.page_content])

    
# rerank the top choices
print("\n-------------------")
print("Top 3 cross-encoder hits\n")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
re_rank_scores = cross_encoder.predict(re_rank_input)
re_rank = {}
for i in range(len(re_rank_scores)):
    re_rank[re_rank_scores[i]] = re_rank_input[i][1]

keys = list(re_rank.keys())
keys.sort(reverse = True)
sorted_rerank = {i: re_rank[i] for i in keys}
for i in sorted_rerank:
    print(sorted_rerank[i])
    print(f"\nScore: {i}\n")


----------------------
Top 3 bi-encoder retrieval hits

restrictions are issued by controllers for metering and spacing, the nominal descent plan can become inefficient and difficult, if not sometimes impossible, to fly in strong tailwinds.In addition, random observations of regional jet operations and pilot interviews revealed that a large variety of descent-planning techniques are used by pilots, even for the same equipment.These techniques vary in terms of the selection of descent angle, bottom-of-descent planning, and top-of-descent
dict_values(['file795.txt'])

Score: 0.625072615572405

test conditions employed in both tests were designed to provide a reasonable representation of commercial airline jet transport descents as anticipated in a CTAS Descent Advisor operational environment.Cockpit automation and the corresponding pilot procedures were studied to investigate their impact on the descent trajectory.The NASA test pilots were instructed to fly the descents as precisely as 

In [83]:
loader = UnstructuredFileLoader("file258.txt")
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
                chunk_size = 500,
                chunk_overlap  = 20,
                length_function = len,
            )

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4726: character maps to <undefined>