In [1]:
import os
from pathlib import Path

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Folder containing the raw files to ingest. The original absolute Windows
# path is kept as the default; set the NEUROHARSHIT_DATA_DIR environment
# variable to point the notebook at a different location without editing it.
DATA_DIR = os.environ.get(
    "NEUROHARSHIT_DATA_DIR",
    r"E:\Python\LLM\NeuroHarshit\Databases\text_data",
)

# Model name handed to OpenAIEmbeddings when the embedding client is built.
EMBEDDING_MODEL = 'text-embedding-3-large'

## Loading directory

In [3]:
# Load every file found under DATA_DIR into LangChain documents, using
# multithreading and showing a progress bar while the files are read.
loader = DirectoryLoader(DATA_DIR, show_progress=True, use_multithreading=True)
docs = loader.load()

  0%|          | 0/8 [00:00<?, ?it/s]

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
Need to load profiles.
Need to load profiles.
short text: "Certifications:". Defaulting to English.
short text: "Education:". Defaulting to English.
Need to load profiles.
 12%|█▎        | 1/8 [00:05<00:40,  5.80s/it]short text: "Duration: 2022 - 2025". Defaulting to English.
short text: "CGPA: 8.64". Defaulting to English.
short text: "Score: 90%". Defaulting to English.
short text: "Year of completion: 2022". Defaulting to English.
short text: "Score: 91%". Defaulting to English.
libmagic is unavailable but assists in filetype detec

In [4]:
# Fail fast if nothing was loaded: an empty folder yields an empty list
# rather than an error, and every later cell would then run on no data.
assert docs, f"No documents were loaded from {DATA_DIR!r}"
len(docs)

8

## Chunking

In [5]:
# Chunking parameters, named so they can be tuned in one place.
# Sizes are measured with the splitter's length function (characters by default).
CHUNK_SIZE = 500     # upper bound on the size of each chunk
CHUNK_OVERLAP = 50   # amount of text shared by consecutive chunks

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the loaded documents into chunks that will each be embedded separately.
chunks = splitter.split_documents(docs)

In [6]:
# Display how many chunks the splitter produced from the loaded documents.
len(chunks)

36

## Vector DB

In [7]:
# Build the embedding client, then embed one throwaway query so the vector
# dimensionality is measured from a real response instead of being hard-coded.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
sample = embeddings.embed_query('hello world')
length = len(sample)

In [8]:
# Display the measured embedding dimensionality (length of the sample vector).
length

3072

In [9]:
# Create an empty flat L2-distance FAISS index whose dimension matches the
# length of the sample embedding measured above.
index = faiss.IndexFlatL2(length)

In [10]:
# Wrap the (still empty) FAISS index in a LangChain vector store. Documents
# are kept in an in-memory docstore, and the index-position -> docstore-id
# mapping starts out empty.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [11]:
# Guard against accidental re-execution: running this cell a second time
# would embed the same chunks again and append duplicate vectors to the index.
# Re-run the cells that create `index` and `vector_store` first to start clean.
assert vector_store.index.ntotal == 0, (
    "vector_store already contains vectors; recreate it before adding the chunks again"
)

# Embed every chunk and add it to the store; the cell displays the generated ids.
vector_store.add_documents(chunks)

['c4c55a01-e74c-419f-ab8d-4430228472e6',
 '14345b4e-f7ca-4fe4-a976-b08bc93fcc8d',
 '65023599-97dd-4ec2-bb60-12532d9de0b8',
 'e13ad03b-cd02-4f0e-bafa-f624f8856501',
 '2b749e2e-c8b6-4e3f-8b9a-88dbd69ff4fe',
 'cd7ab3ed-fcd8-466b-8d50-713d81aeb78f',
 '209e7c9c-5e8e-42ff-91f0-150c1d54bcd8',
 '5aca2738-b35a-4a14-a9e9-55a048604d8b',
 'e54cfcbf-eda6-4bde-ac4d-b52ab91fa582',
 'a1ebc0bd-aef7-4f4e-8b08-e6fa2e27479e',
 '7705c5ff-6659-443f-9540-08b4f9bfeec8',
 'bf76d1a6-1fc7-4b84-8e86-58cb8fa1a0da',
 'fa7c85f8-82ee-4992-9c1f-5e35fab0735a',
 '990e46e0-a922-4b56-a177-24652102497d',
 'd9c125bc-0472-42a6-a8e4-08efb61857e0',
 '6e6f574d-5ad0-4735-a9f1-7a1c5be8089d',
 'dd9d7321-92c2-4f23-9f5e-34e00463ac2f',
 'cd793284-9695-4e08-b6c9-7fa5f7fed275',
 'cdfc9bd5-45f5-4586-89e3-ad577204362c',
 '186ae497-fb21-4ddd-a363-0dfddbce23c7',
 '7b69d2f1-9af6-44d4-941e-5e33d6e06fe1',
 '5d4d17f8-ac7d-4a27-92b1-ea9cc4091f53',
 '7b342c3f-c54a-4a1e-a117-e84ad88ada65',
 'bedb0e50-2236-4e79-b4f0-b7d4a0b4e3d6',
 'b79523c1-9176-

In [12]:
# Persist the index in a "faiss_index" folder that sits beside the data folder,
# derived from DATA_DIR instead of repeating a second hard-coded absolute path.
# When DATA_DIR holds its original Windows value this is the same location as before.
FAISS_INDEX_DIR = Path(DATA_DIR).parent / "faiss_index"
vector_store.save_local(str(FAISS_INDEX_DIR))