In [None]:
!pip install pdfplumber
!pip install langchain-text-splitter
!pip install sentence_transformers
!pip install faiss-cpu
!pip install groq


Conversion to plain text

In [14]:
import os
import pdfplumber

# Destination is <output_folder>/output.txt; the folder is created on demand.
output_folder = "textconversion"
output_path = os.path.join(output_folder, "output.txt")
os.makedirs(output_folder, exist_ok=True)

# Walk the PDF page by page and append each page's text, newline-terminated.
# Pages for which extract_text() yields nothing (None / empty string) are skipped.
with pdfplumber.open("Company-Policy-and-Procedure-June-1.18-V6.0.pdf") as pdf:
    with open(output_path, "w", encoding="utf-8") as f:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            f.write(page_text + '\n')


In [4]:
# Load the extracted text back into memory.
# The path is relative to the working directory so it matches where the extraction
# cell wrote the file (textconversion/output.txt) instead of assuming Colab's /content.
with open("textconversion/output.txt", "r", encoding="utf-8") as document:
    text = document.read()

Text Splitting

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the splitter: chunk_size=500 with chunk_overlap=50 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# `texts` is the list of chunks that gets embedded and indexed in the next sections.
texts = text_splitter.split_text(text)


Conversion of Chunks to Embeddings

In [15]:
import os

import numpy as np  # needed here for np.save; previously only imported in a later cell
from sentence_transformers import SentenceTransformer

# Encode every text chunk with the all-MiniLM-L6-v2 sentence-transformer model.
# `vectors` holds one embedding per entry of `texts`, in the same order.
model = SentenceTransformer('all-MiniLM-L6-v2')
vectors = model.encode(texts)

# Ensure the folder exists
vector_folder = "vectors"
os.makedirs(vector_folder, exist_ok=True)

# Save the raw embeddings as vectors/embeddings.npy so they can be reloaded with np.load
output_path = os.path.join(vector_folder, "embeddings.npy")
np.save(output_path, vectors)

Store Embeddings

In [17]:
import faiss
import numpy as np

# Create a FAISS IndexFlatL2 whose dimensionality is the embedding width
# (second axis of `vectors`), then add every chunk embedding to it.
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(vectors))

# Persist the populated index to embeddings/faiss_index.index,
# creating the folder first if it is missing.
embedding_folder = "embeddings"
index_path = os.path.join(embedding_folder, "faiss_index.index")
os.makedirs(embedding_folder, exist_ok=True)
faiss.write_index(index, index_path)



Query Search

In [None]:
# Embed the question with the same model that produced the chunk embeddings.
query = "Age to acess the TL Website?"
query_embedding = model.encode([query])

# Ask the index for the single nearest chunk (k=1).
# D and I are the two arrays returned by index.search; I[0] is used below to index `texts`.
D, I = index.search(np.array(query_embedding), k=1)

# Print the chunk text behind each returned position.
for idx in I[0]:
    matched_chunk = texts[idx]
    print(f"Match: {matched_chunk}")

Match: 18
Our Commitment to Data Security
Access to your data is limited to authorized TL staff or approved vendors. Although total security does not
exist on the Internet, TL shall make commercially reasonable efforts to safeguard the information that you
submit to TL or that TL collects.
Use of the TL Website by Children
The TL Website is not intended for use by children under the age of 13.
Your Privacy Preferences


Importing GROQ

In [None]:
import os
from getpass import getpass

from groq import Groq

# Never hardcode the API key in the notebook: take it from the GROQ_API_KEY
# environment variable, and fall back to an interactive (non-echoing) prompt
# when the variable is unset or empty.
client = Groq(api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: "))


Query Searching and LLM Generation

In [11]:
# Embed the user question and retrieve the 3 nearest chunks from the index.
query = "what is im not 18 to use the TL website?"
query_embedding = model.encode([query])
D, I = index.search(np.array(query_embedding), k=3)

# Use every retrieved chunk as context (previously only the first hit was used and
# the other two were discarded). Positions equal to -1 are skipped: FAISS uses -1
# as a placeholder when it returns fewer than k neighbours, and texts[-1] would
# otherwise silently pick the last chunk.
context = "\n\n".join(texts[idx] for idx in I[0] if idx != -1)

# Prompt template: retrieved context first, then the question, leaving the answer open.
rag_prompt = f"""
Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""


In [12]:
# Conversation sent to the model: a fixed system instruction followed by the
# RAG prompt (context + question) as the user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant using provided context."},
    {"role": "user", "content": rag_prompt},
]

# temperature=0.2 is passed through to the chat-completions request.
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=messages,
    temperature=0.2,
)

# Display the text of the first returned choice.
answer = response.choices[0].message.content
print(answer)

You are not allowed to use the TL website if you are under the age of 13.
