In [2]:
# Install necessary libraries
!pip install langchain==0.3.0 llama-index==0.12.0 openai faiss-cpu pandas


Collecting langchain==0.3.0
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting llama-index==0.12.0
  Downloading llama_index-0.12.0-py3-none-any.whl.metadata (11 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.3.0)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain==0.3.0)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index==0.12.0)
  Downloading llama_index_agent_openai-0.4.1-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index==0.12.0)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.0 (from llama-index==0.12.0)
  Downloading llama_index_core-0.12.9-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index==0.12.0)
  Downloading llama_index_em

In [4]:
!pip install sentence-transformers faiss-cpu pandas textblob

import faiss
import numpy as np
import pandas as pd
from google.colab import files
from sentence_transformers import SentenceTransformer

# Ask for the course CSV through the Colab upload widget and read the first
# file that was uploaded.
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
data = pd.read_csv(file_name, encoding='ISO-8859-1')

# Missing text becomes '' so the concatenation below never produces NaN.
data['description'] = data['description'].fillna('')
data['title'] = data['title'].fillna('')
data['combined_text'] = data['title'] + " " + data['description']

# An empty dataset has nothing to embed or index; fail with a clear message
# instead of an obscure lookup error further down.
if data.empty:
    raise ValueError(f"'{file_name}' contains no rows; nothing to index.")

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode all rows in one batched call rather than one model call per row.
# float32 is the dtype FAISS expects.
embeddings = np.asarray(
    embedding_model.encode(data['combined_text'].tolist()),
    dtype='float32',
)
# Keep the per-row vectors on the frame, one 1-D array per course.
data['embedding'] = list(embeddings)

# The index dimensionality comes from the embedding matrix itself.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"Number of items in the FAISS index: {index.ntotal}")

from textblob import TextBlob

def correct_spelling(query):
    """Return ``query`` after TextBlob spelling correction.

    When the corrected text differs from the input, a "Did you mean" hint
    showing the correction is printed before returning it.
    """
    suggestion = str(TextBlob(query).correct())
    was_changed = suggestion != query
    if was_changed:
        print(f"Did you mean: '{suggestion}'?")
    return suggestion

def search_courses(query, top_k=5):
    """Return up to ``top_k`` courses nearest to ``query`` in the FAISS index.

    The query is embedded with the module-level ``embedding_model`` and looked
    up in the module-level ``index``; each hit is mapped back to a row of the
    module-level ``data`` frame by position.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``'title'``, ``'description'`` and
        ``'distance'``, in the order produced by ``index.search``. The list is
        shorter than ``top_k`` when the index holds fewer vectors.
    """
    query_embedding = embedding_model.encode(query)
    distances, indices = index.search(np.array([query_embedding]), top_k)
    results = []
    # Walk hits and their distances together so each result carries its own
    # distance without a second lookup.
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS fills missing hits with -1. A negative position would make
        # iloc pick rows from the end of the frame, so reject it explicitly.
        if 0 <= idx < len(data):
            course = data.iloc[idx]
            results.append({
                'title': course['title'],
                'description': course['description'],
                'distance': distance
            })
    return results

# Interactive search loop: read a query, spell-correct it, print the top
# matches, and repeat until the user types 'exit'.
while True:
    query = input("\nEnter your search query (or type 'exit' to quit): ").strip()

    if query.lower() == 'exit':
        print("Exiting the search system. Goodbye!")
        break

    # A blank line gives nothing meaningful to embed; ask again instead of
    # searching for the empty string.
    if not query:
        print("Please enter a non-empty search query.")
        continue

    corrected_query = correct_spelling(query)
    results = search_courses(corrected_query, top_k=5)
    print("\nSearch Results:")
    if results:
        for rank, result in enumerate(results, start=1):
            print(f"\nResult {rank}")
            print(f"Title: {result['title']}")
            print(f"Description: {result['description']}")
            print(f"Distance: {result['distance']:.4f}")
    else:
        print("No results found for your query. Try another search.")


Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [14]:
import os
from getpass import getpass

from pinecone import Pinecone, Index, ServerlessSpec

# Take the Pinecone credential from the environment, or prompt for it without
# echoing. API keys must not live in notebook source; any key that was ever
# committed in this cell should be revoked and replaced.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# AWS region, used only if the index has to be created. Aligned with the
# "us-east-1" that the later Pinecone cells of this notebook use.
PINECONE_ENV = "us-east-1"

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "vidhyanalytics"
# Create the serverless index on first use; later runs reuse the existing one.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): 3072 differs from the 384-dimension index this notebook
        # later creates for all-MiniLM-L6-v2 embeddings - confirm which
        # embedding model this index is meant to hold.
        dimension=3072,
        metric="cosine",  # Metric for similarity search
        spec=ServerlessSpec(
            cloud="aws",
            region=PINECONE_ENV
        )
    )

# Get the host of the created index
index_host = pc.describe_index(index_name).host

# Connect to the existing index
index = Index(name=index_name, host=index_host, api_key=PINECONE_API_KEY)  # Provide the API key
print(f"Connected to Pinecone index: {index_name}")


Connected to Pinecone index: vidhyanalytics


Saving analyticsvidhya_courses_full_simulated.csv to analyticsvidhya_courses_full_simulated (2).csv


In [16]:
!pip install langchain==0.3.0 sentence-transformers pinecone-client pandas textblob


Collecting langchain==0.3.0
  Using cached langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Using cached langchain-0.3.0-py3-none-any.whl (1.0 MB)
Installing collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.3.13
    Uninstalling langchain-0.3.13:
      Successfully uninstalled langchain-0.3.13
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-community 0.3.13 requires langchain<0.4.0,>=0.3.13, but you have langchain 0.3.0 which is incompatible.[0m[31m
[0mSuccessfully installed langchain-0.3.0


In [17]:
import pandas as pd
from google.colab import files

# Prompt for the course CSV via the Colab upload widget; the first uploaded
# file name is the one that gets read.
uploaded = files.upload()
file_name = list(uploaded)[0]
data = pd.read_csv(
    file_name,
    encoding="ISO-8859-1",
)

# Show the first rows as a quick check of what was loaded.
print(data.head())


Saving analyticsvidhya_courses_full_simulated.csv to analyticsvidhya_courses_full_simulated (3).csv
                                               title  \
0           Frameworks for Effective Problem Solving   
1           Anyone can Build AI Agents - Free Course   
2  A Comprehensive Learning Path to Become a Data...   
3  Reimagining GenAI: Common Mistakes and Best Pr...   
4  Coding a ChatGPT-style Language Model from Scr...   

                                         description price  \
0  Master the art of structured thinking to tackl...  Free   
1  Unlock the power of AI without any programming...  Free   
2  Where do I begin? Data Analyst is such a huge ...  Free   
3  Generative AI is transforming industries, but ...  Free   
4  Master the art of building a ChatGPT-style lan...  Free   

                                          curriculum  \
0   Introduction, Problem Identification, Frameworks   
1                                       A for Agents   
2  Overview of the Lea

In [18]:
# Missing text becomes '' so the concatenation below never produces NaN.
data['description'] = data['description'].fillna(value='')
data['title'] = data['title'].fillna(value='')
# One searchable string per course: "<title> <description>".
data['combined_text'] = data['title'].add(" ").add(data['description'])


In [23]:
import os
from getpass import getpass

from pinecone import Pinecone, Index, ServerlessSpec

# Take the Pinecone credential from the environment, or prompt for it without
# echoing. API keys must not live in notebook source; any key that was ever
# committed in this cell should be revoked and replaced.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# AWS region, used only if the index has to be created. Aligned with the
# "us-east-1" that the later Pinecone cells of this notebook use.
PINECONE_ENV = "us-east-1"

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "vidhyanalytics"
# Create the serverless index on first use; later runs reuse the existing one.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): 3072 differs from the 384-dimension index this notebook
        # later creates for all-MiniLM-L6-v2 embeddings - confirm which
        # embedding model this index is meant to hold.
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region=PINECONE_ENV
        )
    )
# Resolve the index host, then open a client bound to that index.
index_host = pc.describe_index(index_name).host
index = Index(name=index_name, host=index_host, api_key=PINECONE_API_KEY)
print(f"Connected to Pinecone index: {index_name}")


Connected to Pinecone index: vidhyanalytics


In [33]:
import pandas as pd
from google.colab import files

# Ask for the course catalogue and read the first uploaded file.
print("Please upload the CSV file.")
uploaded = files.upload()
file_name = list(uploaded)[0]
data = pd.read_csv(file_name, encoding='ISO-8859-1')

# Empty strings stand in for missing text so the concatenation stays a string.
data['description'] = data['description'].fillna(value='')
data['title'] = data['title'].fillna(value='')
data['combined_text'] = data['title'].add(" ").add(data['description'])

row_count = len(data)
print(f"Dataset loaded successfully with {row_count} rows.")


Please upload the CSV file.


Saving analyticsvidhya_courses_full_simulated.csv to analyticsvidhya_courses_full_simulated (4).csv
Dataset loaded successfully with 68 rows.


In [34]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode every row in one batched call instead of one model call per row,
# then store one 1-D vector per course on the frame.
data['embedding'] = list(embedding_model.encode(data['combined_text'].tolist()))


In [36]:
import os
from getpass import getpass

from pinecone import Pinecone, Index, ServerlessSpec

# Take the Pinecone credential from the environment, or prompt for it without
# echoing. API keys must not live in notebook source; any key that was ever
# committed in this cell should be revoked and replaced.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# AWS region, used only if the index has to be created.
PINECONE_ENV = "us-east-1"

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "vidhyanalytics"
# Create the serverless index on first use; later runs reuse the existing one.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): 3072 differs from the 384-dimension index this notebook
        # later creates for all-MiniLM-L6-v2 embeddings - confirm which
        # embedding model this index is meant to hold.
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region=PINECONE_ENV
        )
    )

# Resolve the index host, then open a client bound to that index.
index_host = pc.describe_index(index_name).host


index = Index(name=index_name, host=index_host, api_key=PINECONE_API_KEY)
print(f"Connected to Pinecone index: {index_name}")


Connected to Pinecone index: vidhyanalytics


In [76]:
import pandas as pd
from pinecone import Pinecone, Index, ServerlessSpec
from sentence_transformers import SentenceTransformer
import numpy as np
import langchain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as LangChainPinecone
from textblob import TextBlob
from google.colab import files

# Ask for the course CSV through the Colab upload widget and read the first
# uploaded file.
print("Please upload your CSV file:")
uploaded = files.upload()
file_name = list(uploaded)[0]
data = pd.read_csv(file_name, encoding='ISO-8859-1')

# Missing text becomes '' so the concatenation below never produces NaN.
data['description'] = data['description'].fillna(value='')
data['title'] = data['title'].fillna(value='')
# One searchable string per course: "<title> <description>".
data['combined_text'] = data['title'].add(" ").add(data['description'])

import os
from getpass import getpass

# Take the Pinecone credential from the environment, or prompt for it without
# echoing. API keys must not live in notebook source; any key that was ever
# committed in this cell should be revoked and replaced.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# AWS region, used only if the index has to be created.
PINECONE_ENV = "us-east-1"
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "vidhyanalytics1"
# Create the serverless index on first use; later runs reuse the existing one.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Presumably the vector size of all-MiniLM-L6-v2 - TODO confirm.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region=PINECONE_ENV
        )
    )

# Resolve the index host, then open a client bound to that index.
index_host = pc.describe_index(index_name).host
index = Index(name=index_name, host=index_host, api_key=PINECONE_API_KEY)
print(f"Connected to Pinecone index: {index_name}")

# LangChain embedding wrapper used for both stored texts and queries.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def embed_texts(texts):
    """Embed every string in ``texts`` with the module-level ``embedding_model``.

    Returns a list holding one ``embed_query`` result per input, in input order.
    """
    vectors = []
    for item in texts:
        vectors.append(embedding_model.embed_query(item))
    return vectors

# Wrap the Pinecone index in a LangChain vector store. The Embeddings object
# itself is passed instead of its bound embed_query method; the callable form
# is deprecated by the LangChain Pinecone wrapper.
pinecone_store = LangChainPinecone(
    index=index,
    embedding=embedding_model,
    text_key="combined_text"
)

# Upload every course in one batched call. add_texts embeds the texts itself,
# so computing an embedding here first would only repeat that work.
# NOTE(review): no ids are supplied, so re-running this cell presumably inserts
# the same courses again - confirm, and consider stable ids.
course_texts = data['combined_text'].tolist()
course_metadatas = data[['title', 'description']].to_dict(orient='records')
pinecone_store.add_texts(texts=course_texts, metadatas=course_metadatas)

print("Successfully stored embeddings in Pinecone.")

def correct_spelling(query):
    """Spell-correct ``query`` with TextBlob and return the corrected text.

    If the correction is not identical to the input, the corrected text is
    printed as a "Did you mean" hint.
    """
    fixed = str(TextBlob(query).correct())
    if fixed == query:
        return fixed
    print(f"Did you mean: '{fixed}'?")
    return fixed
import os
from getpass import getpass

from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

# The OpenAI LLM below is constructed without an explicit key, so it relies on
# OPENAI_API_KEY being set in the environment. Use the existing value if there
# is one, otherwise prompt for it without echoing. API keys must not live in
# notebook source; any key that was ever committed here should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Expose the vector store as a retriever for the QA chain.
retriever = pinecone_store.as_retriever()

# temperature=0 is passed to the OpenAI LLM wrapper.
llm = OpenAI(temperature=0)
# The "stuff" chain fills the prompt with {context} (the retrieved documents)
# and {question} (the user query), so the template uses those variable names.
prompt_template = PromptTemplate(
    template="Given the query: '{question}', retrieve the most relevant course information from the following documents:\n\n{context}",
    input_variables=["question", "context"]
)
# Hand the custom prompt to the chain; without chain_type_kwargs the template
# above would be built but never used.
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True
)

print("Smart Search System Initialized.")

Please upload your CSV file:


Saving analyticsvidhya_courses_full_simulated.csv to analyticsvidhya_courses_full_simulated (30).csv
Connected to Pinecone index: vidhyanalytics1




Successfully stored embeddings in Pinecone.
Smart Search System Initialized.
