In [1]:
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
import pandas as pd
import os

ModuleNotFoundError: No module named 'langchain_huggingface'

In [2]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import torch
import re
from tqdm import tqdm

In [3]:
import os
# Define paths (local to Colab or your environment)
BROKERS_CSV_PATH = '/content/brokers_list.csv'
OUTPUT_DIR = '/content/broker_vector_db'

# Create the output directory if it doesn’t exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Step 1: Load the CSV file and inspect it
df = pd.read_csv(BROKERS_CSV_PATH)
print("First 5 rows of the dataset:")
print(df.head(5))

First 5 rows of the dataset:
   S.N Broker No                                        Broker Name  \
0    1         1                     Kumari Securities Pvt. Limited   
1    2     1_RWS                     Kumari Securities Pvt. Limited   
2    3         3                       Arun Securities Pvt. Limited   
3    4         4  Stock Broker Opal Securities Investment Pvt. L...   
4    5         5    Market Securities Exchange Company Pvt. Limited   

                                           Address  Phone  \
0                           Dillibazaar, Kathmandu    NaN   
1                                New Road, Pokhara    NaN   
2                              Gaushala, Kathmandu    NaN   
3                              Lazimpat, Kathmandu    NaN   
4  OM Dev Plaza Complex, Kichha Pokhari, Kathmandu    NaN   

                                  Website                                  TMS  
0           https://kumarisecurities.com/       https://tms01.nepsetms.com.np/  
1           htt

In [4]:
# Step 2: Check for missing values and fill them
print("\nMissing values before filling:")
print(df.isnull().sum())
df.fillna("Not Available", inplace=True)
print("\nMissing values after filling:")
print(df.isnull().sum())

# Step 3: Standardize Broker Names (remove extra spaces, standardize "Pvt. Limited")
df["Broker Name"] = df["Broker Name"].apply(
    lambda x: re.sub(r'\s+', ' ', x.strip())  # Remove extra spaces
)
df["Broker Name"] = df["Broker Name"].str.replace(
    r'Pvt\.?\s*Limited|Pvt\.Ltd|Pvt\s*Limited|Pvt\s*\.\s*Limited|Pvt\s*\.\s*Ltd',
    'Pvt. Limited',
    regex=True
)

# Convert DataFrame to list of dictionaries
brokers = df.to_dict('records')
print("\nSample preprocessed broker entry:")
print(brokers[0])

# Step 4: Convert Broker Data to Text Chunks
chunks = []
metadata = []

for broker in tqdm(brokers, desc="Creating Broker Chunks"):
    # Build the text chunk, excluding "Not Available" fields
    chunk_parts = [f"Broker {broker['Broker Name']} (Broker No: {broker['Broker No']})"]

    if broker["Address"] != "Not Available":
        chunk_parts.append(f"located at {broker['Address']}")
    if broker["Phone"] != "Not Available":
        chunk_parts.append(f"contact phone {broker['Phone']}")
    if broker["Website"] != "Not Available":
        chunk_parts.append(f"website {broker['Website']}")
    if broker["TMS"] != "Not Available":
        chunk_parts.append(f"TMS link {broker['TMS']}")

    # Join parts into a single sentence
    chunk = ", ".join(chunk_parts) + "."
    chunks.append(chunk)
    metadata.append({
        "source": BROKERS_CSV_PATH,
        "broker_no": broker["Broker No"],
        "broker_name": broker["Broker Name"],
        "address": broker["Address"],
        "website": broker["Website"],
        "tms": broker["TMS"],
        "row_index": broker["S.N"] - 1  # Adjust for 0-based indexing
    })  # Slimmed-down metadata with key fields

print("\nSample chunk content:")
print(chunks[0])
print("\nCorresponding metadata:")
print(metadata[0])

  df.fillna("Not Available", inplace=True)



Missing values before filling:
S.N              0
Broker No        0
Broker Name      0
Address          0
Phone          153
Website         11
TMS              4
dtype: int64

Missing values after filling:
S.N            0
Broker No      0
Broker Name    0
Address        0
Phone          0
Website        0
TMS            0
dtype: int64

Sample preprocessed broker entry:
{'S.N': 1, 'Broker No': '1', 'Broker Name': 'Kumari Securities Pvt. Limited', 'Address': 'Dillibazaar, Kathmandu', 'Phone': 'Not Available', 'Website': 'https://kumarisecurities.com/', 'TMS': 'https://tms01.nepsetms.com.np/'}


Creating Broker Chunks: 100%|██████████| 153/153 [00:00<00:00, 207477.70it/s]


Sample chunk content:
Broker Kumari Securities Pvt. Limited (Broker No: 1), located at Dillibazaar, Kathmandu, website https://kumarisecurities.com/, TMS link https://tms01.nepsetms.com.np/.

Corresponding metadata:
{'source': '/content/brokers_list.csv', 'broker_no': '1', 'broker_name': 'Kumari Securities Pvt. Limited', 'address': 'Dillibazaar, Kathmandu', 'website': 'https://kumarisecurities.com/', 'tms': 'https://tms01.nepsetms.com.np/', 'row_index': 0}





In [5]:
# Step 5: Set Up Embeddings Model
sentence_transformer = HuggingFaceEmbeddings(
    model_name='sentence-transformers/paraphrase-MiniLM-L3-v2',
    model_kwargs={'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')}
)

# Step 6: Convert Chunks to Document Objects
documents = [
    Document(
        page_content=chunk,
        metadata=meta
    )
    for chunk, meta in zip(chunks, metadata)
]

  sentence_transformer = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Step 7: Create and Save FAISS Vector Store
print("\nCreating FAISS vector store...")
vector_db = FAISS.from_documents(documents, sentence_transformer)
vector_db.save_local(OUTPUT_DIR)
print(f"Broker data has been successfully converted to a FAISS vector store and saved at '{OUTPUT_DIR}'.")


Creating FAISS vector store...
Broker data has been successfully converted to a FAISS vector store and saved at '/content/broker_vector_db'.


In [10]:
import shutil

In [12]:
from google.colab import files
ZIP_FILE = '/content/broker_vector_db.zip'
# Step 7: Create and Save FAISS Vector Store
print("\nCreating FAISS vector store...")
vector_db = FAISS.from_documents(documents, sentence_transformer)
vector_db.save_local(OUTPUT_DIR)
print(f"Fundamental data has been successfully converted to a FAISS vector store and saved at '{OUTPUT_DIR}'.")

# Step 8: Zip the Vector Store Directory
print("\nZipping the broker_vector_db folder...")
shutil.make_archive('/content/broker_vector_db', 'zip', OUTPUT_DIR)
print(f"Created zip file at '{ZIP_FILE}'.")

# Step 9: Download the Zipped File
print("\nDownloading the zipped broker_vector_db...")
files.download(ZIP_FILE)





Creating FAISS vector store...
Fundamental data has been successfully converted to a FAISS vector store and saved at '/content/broker_vector_db'.

Zipping the broker_vector_db folder...
Created zip file at '/content/broker_vector_db.zip'.

Downloading the zipped broker_vector_db...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# Step 8: Load and Test the Vector Store
docsearch = FAISS.load_local(
    OUTPUT_DIR,
    sentence_transformer,
    allow_dangerous_deserialization=True
)

# Example similarity search
query = "Which brokers are in Kathmandu?"
results = docsearch.similarity_search(query, k=5)

# Display results
print("\nSearch results for query: 'Which brokers are in Kathmandu?'")
for result in results:
    print(f"Broker: {result.metadata['broker_name']} (Broker No: {result.metadata['broker_no']})")
    print(f"Details: {result.page_content}")
    print(f"Source: {result.metadata['source']}, Row Index: {result.metadata['row_index']}")
    print("-" * 50)


Search results for query: 'Which brokers are in Kathmandu?'
Broker: Prabhu Stock Market Limited (Broker No: 99)
Details: Broker Prabhu Stock Market Limited (Broker No: 99), located at Kathmandu.
Source: /content/brokers_list.csv, Row Index: 150
--------------------------------------------------
Broker: Sunrise Securities Limited (Broker No: 100)
Details: Broker Sunrise Securities Limited (Broker No: 100), located at Kathmandu.
Source: /content/brokers_list.csv, Row Index: 151
--------------------------------------------------
Broker: R.B.B. Securities Company Ltd (Broker No: 97)
Details: Broker R.B.B. Securities Company Ltd (Broker No: 97), located at Kathmandu, TMS link https://tms98.trademow.com/login.
Source: /content/brokers_list.csv, Row Index: 148
--------------------------------------------------
Broker: Money World Share Exchange PVT Ltd (Broker No: 73)
Details: Broker Money World Share Exchange PVT Ltd (Broker No: 73), located at Kathmandu, TMS link https://tms74.nepsetms.com

In [14]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")

# Check the dimension
embedding_vector = embedding_model.embed_query("test")
print("Embedding dimension:", len(embedding_vector))



Embedding dimension: 384
