In [10]:
import os
import requests
import mimetypes
import fitz  # PyMuPDF for PDF extraction
import docx
from bs4 import BeautifulSoup
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import tempfile
import shutil

In [5]:
def extract_text_from_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_text_from_txt(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

def extract_text_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()
    return "Failed to fetch content."

def analyze_input(user_input):
    if user_input.startswith("http://") or user_input.startswith("https://"):
        return "url"
    elif os.path.exists(user_input):
        mime_type, _ = mimetypes.guess_type(user_input)
        if mime_type:
            if "pdf" in mime_type:
                return "pdf"
            elif "officedocument.wordprocessingml.document" in mime_type:
                return "docx"
            elif "text/plain" in mime_type:
                return "txt"
    return "unknown"

def process_input(user_input):
    input_type = analyze_input(user_input)
    
    if input_type == "pdf":
        text = extract_text_from_pdf(user_input)
    elif input_type == "docx":
        text = extract_text_from_docx(user_input)
    elif input_type == "txt":
        text = extract_text_from_txt(user_input)
    elif input_type == "url":
        text = extract_text_from_url(user_input)
    else:
        text = "Unsupported file type or invalid input."
    
    print("Extracted Text:\n", text)
    return text

In [6]:
def build_vector_store(text):
    documents = [Document(page_content=text)]
    
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore

if __name__ == "__main__":
    user_input = input("Enter file path or URL: ")
    
    input_text = process_input(user_input)
    
    vector_store = build_vector_store(input_text)
    
    print("FAISS index created successfully!")

Extracted Text:
 




Introduction | 🦜️🔗 LangChain






Skip to main contentJoin us at  Interrupt: The Agent AI Conference by LangChain on May 13 & 14 in San Francisco!IntegrationsAPI ReferenceMoreContributingPeopleError referenceLangSmithLangGraphLangChain HubLangChain JS/TSv0.3v0.3v0.2v0.1💬SearchIntroductionTutorialsBuild a Question Answering application over a Graph DatabaseTutorialsBuild a simple LLM application with chat models and prompt templatesBuild a ChatbotBuild a Retrieval Augmented Generation (RAG) App: Part 2Build an Extraction ChainBuild an AgentTaggingBuild a Retrieval Augmented Generation (RAG) App: Part 1Build a semantic search engineBuild a Question/Answering system over SQL dataSummarize TextHow-to guidesHow-to guidesHow to use tools in a chainHow to use a vectorstore as a retrieverHow to add memory to chatbotsHow to use example selectorsHow to add a semantic layer over graph databaseHow to invoke runnables in parallelHow to stream chat model responsesHow to add de

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



FAISS index created successfully!


In [14]:
with tempfile.TemporaryDirectory() as temp_dir:
        save_path = os.path.join(temp_dir, "faiss_index")
        
        # Save FAISS index in the temporary directory
        vector_store.save_local(save_path)
        print(f"Vector store temporarily saved at: {save_path}")

Vector store temporarily saved at: C:\Users\AHMEDK~1\AppData\Local\Temp\tmpo42h7ykh\faiss_index


In [15]:
print("\nFAISS Index Details:")
# Print the index object
print(vector_store.index)

#loaded_vector_store = FAISS.load_local(save_path, vector_store.embedding_function)
# Print the total number of vectors in the index
print("Number of vectors in the index:", vector_store.index.ntotal)


FAISS Index Details:
<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x00000279259D1E30> >
Number of vectors in the index: 1


In [11]:
shutil.rmtree(save_path, ignore_errors=True)
        
        
print(f"Temporary FAISS index at '{save_path}' deleted successfully.")

Temporary FAISS index at 'C:\Users\AHMEDK~1\AppData\Local\Temp\tmpc984zwle\faiss_index' deleted successfully.
