<a href="https://colab.research.google.com/github/KiranVarghese25/Project/blob/main/LLM_project_2348525.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import subprocess
import sys

# Third-party packages this notebook depends on (PyPI distribution names).
packages = [
    'langchain',
    'google-search-results',
    'sentence_transformers',
    'faiss-cpu',
    'langchain-community'
]

# Install through the interpreter that is running this kernel
# ("python -m pip") instead of whatever `pip` executable is first on PATH,
# so the packages are guaranteed to land in the environment the notebook
# imports from. Installing everything in a single call also lets pip resolve
# the shared dependencies together. check_call raises CalledProcessError if
# the installation fails.
subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])


In [None]:
!pip install --upgrade pydantic langchain transformers sentence-transformers faiss-cpu


Collecting langchain
  Using cached langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Using cached langchain_core-0.2.35-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Using cached langsmith-0.1.104-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached langchain-0.2.14-py3-none-any.whl (997 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Using cached sentence_transformers-3.0.1-py3-

In [None]:
from google.colab import files
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Upload the CSV file (opens Colab's file picker; returns {file name: bytes}).
uploaded = files.upload()

# Fail with a clear message when the upload dialog was cancelled, instead of
# an opaque StopIteration from next(iter(...)) further down.
if not uploaded:
    raise ValueError("No file was uploaded; please upload the FAQ CSV file.")

# Check the uploaded files
for file_name in uploaded.keys():
    print(f"Uploaded file: {file_name}")

# Load the first uploaded CSV file into a DataFrame.
# Try Windows-1252 first: 'latin-1' never fails, but it decodes bytes
# 0x80-0x9F (curly quotes, apostrophes, dashes in Windows-authored files) to
# invisible control characters, which would then end up in the embedded text.
# NOTE(review): the file's real encoding is not known here - confirm.
# cp1252 has a few undefined bytes, so fall back to the original 'latin-1'
# behavior if decoding fails.
csv_path = next(iter(uploaded.keys()))
try:
    df = pd.read_csv(csv_path, encoding='cp1252')
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding='latin-1')
print(df.head())

# Initialize SentenceTransformer for embeddings
model_name = "hkunlp/instructor-large"
sentence_model = SentenceTransformer(model_name)

# Path the FAISS index is written to and read back from.
vectordb_file_path = "faiss_index"

def create_vector_db():
    """Embed every entry of ``df['prompt']`` and persist a FAISS index.

    Vectors are added in DataFrame row order, so the ids returned by a later
    search are positional row numbers usable with ``df.iloc``.

    Small corpora get an exact ``IndexFlatL2``. An IVF index whose number of
    clusters is close to the number of documents holds about one vector per
    cluster; with the default ``nprobe`` of 1 a search then sees only about
    one vector and FAISS pads the remaining result slots with ``-1``.
    ``IndexIVFFlat`` is only used when there are enough vectors to train it
    properly.

    Side effects:
        Writes the index to ``vectordb_file_path``.

    Raises:
        ValueError: if the ``prompt`` column contains no rows.
    """
    # Load data from DataFrame
    documents = df['prompt'].tolist()
    if not documents:
        raise ValueError("Cannot build a vector index: the 'prompt' column is empty.")

    # Compute embeddings; FAISS expects a contiguous float32 matrix.
    embeddings = sentence_model.encode(documents, convert_to_numpy=True)
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = embeddings.shape[1]

    # FAISS k-means wants roughly 39 training points per centroid; keep the
    # original cap of 100 clusters.
    min_points_per_cluster = 39
    nlist = min(100, len(documents) // min_points_per_cluster)

    if nlist < 2:
        # Too few vectors for clustering to help: exact brute-force search.
        index = faiss.IndexFlatL2(dimension)
    else:
        quantizer = faiss.IndexFlatL2(dimension)  # Flat quantizer
        index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
        index.train(embeddings)
        # Probe several clusters per query so searches can return k results.
        index.nprobe = min(nlist, 10)

    # Add embeddings to the index
    index.add(embeddings)

    # Save FAISS index locally
    faiss.write_index(index, vectordb_file_path)

def get_qa_chain():
    """Load the persisted FAISS index and return a search function over it.

    Returns:
        A callable ``query_faiss(query, k=5)`` that embeds ``query`` with
        ``sentence_model`` and returns ``(indices, distances)`` exactly as
        produced by ``index.search`` (two arrays with one row per query).
    """
    # Load the FAISS index from the local folder
    index = faiss.read_index(vectordb_file_path)

    # IVF indexes only scan `nprobe` clusters per query (default 1). When the
    # clusters are tiny that yields fewer than k hits and FAISS pads the
    # result with -1. Widen the probe so searches return real neighbours.
    nlist = getattr(index, "nlist", None)
    if nlist is not None and hasattr(index, "nprobe"):
        if index.ntotal < 39 * nlist:
            # Under-populated clusters: scan all of them (exact search).
            index.nprobe = nlist
        else:
            index.nprobe = max(index.nprobe, min(nlist, 10))

    def query_faiss(query, k=5):
        """Return ``(indices, distances)`` of the nearest stored vectors."""
        # Compute query embedding
        query_embedding = sentence_model.encode([query], convert_to_numpy=True)
        # Never request more neighbours than there are stored vectors, which
        # would only produce -1 padding; FAISS requires k >= 1.
        k = max(1, min(k, index.ntotal))
        # Search for nearest neighbors
        distances, indices = index.search(query_embedding, k)
        return indices, distances

    return query_faiss

def get_responses(indices, column='prompt', frame=None):
    """Map FAISS result ids for the first query to text from a DataFrame.

    Args:
        indices: 2-D id array as returned by ``index.search``; only the first
            row (``indices[0]``) is used.
        column: column to read the text from. Defaults to ``'prompt'``, which
            is what this function has always returned.
            NOTE(review): the function name suggests ``'response'`` may have
            been intended - confirm with the author.
        frame: DataFrame to look rows up in; defaults to the notebook-level
            ``df``.

    Returns:
        List of cell values, in result order. Negative ids are skipped:
        FAISS uses ``-1`` to pad results when fewer than k neighbours were
        found, and ``iloc[-1]`` would silently return the last row instead.
    """
    source = df if frame is None else frame
    texts = source[column]
    return [texts.iloc[int(i)] for i in indices[0] if int(i) >= 0]

if __name__ == "__main__":
    # Sample question used to exercise the retrieval pipeline end to end.
    query = "Do you have javascript course?"

    # Build and persist the index, then obtain a search function over it.
    create_vector_db()
    query_function = get_qa_chain()

    # Look up the nearest stored entries and resolve their ids to text.
    indices, distances = query_function(query)
    responses = get_responses(indices)

    # Echo the query, then list one retrieved entry per line.
    print(f"Query: {query}", "Top responses:", sep="\n")
    for response in responses:
        print(response)


Saving codebasics_faqs.csv to codebasics_faqs (6).csv
Uploaded file: codebasics_faqs (6).csv
                                              prompt  \
0  I have never done programming in my life. Can ...   
1                     Why should I trust Codebasics?   
2  Is there any prerequisite for taking this boot...   
3  What datasets are used in this bootcamp? Is it...   
4  Im not sure if this bootcamp is good enough f...   

                                            response  
0  Yes, this is the perfect bootcamp for anyone w...  
1  Till now 9000 + learners have benefitted from ...  
2  Our bootcamp is specifically designed for begi...  
3  The datasets used in this bootcamp are crafted...  
4  We got you covered. Go ahead and watch our you...  


  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


Query: Do you have javascript course?
Top responses:
I have never done programming and belong to a non-technical background. Can I take this course?
How do I enable Power Pivot before using it for the first time ?
How do I enable Power Pivot before using it for the first time ?
How do I enable Power Pivot before using it for the first time ?
How do I enable Power Pivot before using it for the first time ?
