In [1]:
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import CSVLoader

# 1. Load CSV and create Documents (one per row)
loader = CSVLoader(file_path="contacts.csv")
documents = loader.load()

print(documents[0].page_content)  # show structure of first row

# 2. Initialize the embedding model (using a sentence-transformer model)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # You can change the model

# 3. Generate embeddings for all documents (rows)
doc_embeddings = embedding_model.encode([doc.page_content for doc in documents], convert_to_tensor=True)

# Print shape of embeddings to verify
print(doc_embeddings.shape)


ModuleNotFoundError: No module named 'sentence_transformers'

In [3]:
from sentence_transformers import SentenceTransformer

# Load a small and efficient embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Sample Sentences
sentences = ["I love AI!", "Machine Learning is amazing!", "Python is great for NLP."]

# Generate Sentence Embeddings
embeddings = model.encode(sentences)

# Print First Sentence's Embedding (Vector Representation)
print(embeddings[0])  # Output: A high-dimensional vector


1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[-3.05859670e-02 -6.20776303e-02  3.73073407e-02 -3.40212919e-02
  4.56047952e-02 -1.10021606e-02  4.79980297e-02  3.85053828e-03
  2.72037536e-02  3.65901329e-02 -6.88407049e-02 -1.44740632e-02
  3.44443172e-02  5.23359235e-03 -3.55791785e-02  3.25523466e-02
 -5.26585579e-02  3.39833496e-04 -8.66008699e-02 -5.13887592e-02
 -7.31118545e-02 -3.60318925e-03  2.12839711e-02 -1.62747279e-02
 -2.52623763e-02  1.12270378e-01  9.08540934e-03 -7.46146962e-02
  3.94907594e-03 -1.09529473e-01  1.35777146e-02  4.06444855e-02
  5.10792807e-02  2.52673030e-02 -9.23020095e-02  3.68888862e-02
 -3.97319496e-02 -4.79117893e-02  4.90372851e-02 -1.05650127e-02
 -2.14588661e-02  1.32791912e-02  3.24045941e-02 -5.72485588e-02
  3.69078554e-02  8.12482238e-02 -4.68188263e-02 -3.16513963e-02
  1.14586383e-01  1.03629142e-01 -7.94033110e-02 -2.91160233e-02
  3.49520636e-03 -2.08327137e-02 -1.51492432e-02  2.65067443e-02
  4.02097292e-02 -3.03349346e-02 -4.07608971e-02 -4.21046838e-02
  3.35972980e-02  2.89844

In [4]:
print(len(embeddings))

3


In [5]:
print(len(embeddings[0]))

384


In [7]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(embeddings)

# Print Similarity Scores
print(similarity_matrix)

[[1.0000001  0.53586495 0.25024596]
 [0.53586495 1.0000004  0.34046715]
 [0.25024596 0.34046715 1.0000001 ]]


In [8]:
def find_most_similar(query, corpus):
    query_embedding = model.encode([query])
    corpus_embeddings = model.encode(corpus)

    # Compute similarity scores
    scores = cosine_similarity(query_embedding, corpus_embeddings)[0]

    # Find the most similar sentence
    most_similar_idx = scores.argmax()
    return corpus[most_similar_idx], scores[most_similar_idx]

# Example
corpus = ["I love AI.", "I enjoy playing football.", "NLP is fascinating.", "Python is my favorite language."]
query = "I like artificial intelligence."

most_similar_sentence, similarity_score = find_most_similar(query, corpus)
print(f"Most similar: {most_similar_sentence} (Score: {similarity_score:.4f})")


Most similar: I love AI. (Score: 0.8122)


In [10]:
from langchain.document_loaders import CSVLoader
from sentence_transformers import SentenceTransformer

# 1️⃣ Load CSV and create Documents (one per row)
loader = CSVLoader(file_path="customers-100.csv")
documents = loader.load()

print(documents[0].page_content)  # Show structure of first row

# 2️⃣ Initialize SentenceTransformer model (MiniLM for efficiency)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # You can change the model

# 3️⃣ Generate embeddings for all documents (rows)
doc_embeddings = embedding_model.encode(
    [doc.page_content for doc in documents], 
    convert_to_numpy=True  # Convert to NumPy array for easier FAISS usage
)

# Print shape of embeddings to verify
print(f"Embeddings shape: {doc_embeddings.shape}")  # Output: (num_documents, 384)


Index: 1
Customer Id: DD37Cf93aecA6Dc
First Name: Sheryl
Last Name: Baxter
Company: Rasmussen Group
City: East Leonard
Country: Chile
Phone 1: 229.077.5154
Phone 2: 397.884.0519x718
Email: zunigavanessa@smith.info
Subscription Date: 2020-08-24
Website: http://www.stephenson.com/
Embeddings shape: (100, 384)


In [12]:
import pandas as pd  
toy_data = pd.read_csv("customers-100.csv")

In [13]:
toy_data.head()

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
3,4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
4,5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/


In [14]:
toy_data.columns

Index(['Index', 'Customer Id', 'First Name', 'Last Name', 'Company', 'City',
       'Country', 'Phone 1', 'Phone 2', 'Email', 'Subscription Date',
       'Website'],
      dtype='object')

In [33]:
from langchain.document_loaders import CSVLoader
import pandas as pd
from langchain.schema import Document

def custom_csv_loader(file_path):
    """
    Custom function to load a CSV file, format each row into structured text, 
    and return a list of LangChain Document objects with relevant metadata.

    :param file_path: Path to the CSV file
    :return: List of LangChain Document objects
    """
    # Load CSV into Pandas DataFrame
    df = pd.read_csv(file_path)
    
    documents = []
    
    for index, row in df.iterrows():
        # Convert row into structured text format
        text_representation = f"""
        Customer ID: {row['Customer Id']}
        Name: {row['First Name']} {row['Last Name']}
        Company: {row['Company']}
        City: {row['City']}
        Country: {row['Country']}
        Phone 1: {row['Phone 1']}
        Phone 2: {row['Phone 2']}
        Email: {row['Email']}
        Subscription Date: {row['Subscription Date']}
        Website: {row['Website']}
        """
        
        # Metadata excluding file path but adding row number and Customer ID
        metadata = {
            "row_number": index + 1,
            "customer_id": row["Customer Id"],
            "city": row["City"],
            "country": row["Country"]
        }

        # Create a Document object
        document = Document(page_content=text_representation.strip(), metadata=metadata)
        documents.append(document)
    
    return documents

# Example usage
file_path = "./customers-100.csv"
documents = custom_csv_loader(file_path)

# Display first document for verification
print(documents[0])


page_content='Customer ID: DD37Cf93aecA6Dc
        Name: Sheryl Baxter
        Company: Rasmussen Group
        City: East Leonard
        Country: Chile
        Phone 1: 229.077.5154
        Phone 2: 397.884.0519x718
        Email: zunigavanessa@smith.info
        Subscription Date: 2020-08-24
        Website: http://www.stephenson.com/' metadata={'row_number': 1, 'customer_id': 'DD37Cf93aecA6Dc', 'city': 'East Leonard', 'country': 'Chile'}


In [34]:
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # You can change the model

# 3️⃣ Generate embeddings for all documents (rows)
# doc_embeddings = embedding_model.encode(
#     [doc.page_content for doc in documents], 
#     convert_to_numpy=True  # Convert to NumPy array for easier FAISS usage
# )

# # Print shape of embeddings to verify
# print(f"Embeddings shape: {doc_embeddings.shape}")  # Output: (num_documents, 384)

In [35]:
# from langchain.vectorstores import FAISS

In [36]:
# Store embeddings in FAISS
# vector_store = FAISS.from_documents(documents, embedding_model)

# # Save the FAISS index for later use
# vector_store.save_local("./faiss_index")

In [37]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize HuggingFace embeddings (uses a model similar to Sentence Transformers)
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

# Store embeddings in FAISS for efficient retrieval
vector_store = FAISS.from_documents(documents, embedding_model)

# Save FAISS index for later use
vector_store.save_local("./faiss_index")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [42]:
# Load FAISS index
vector_store = FAISS.load_local("./faiss_index", embedding_model,allow_dangerous_deserialization=True)
# FAISS serialization in LangChain uses pickle, which could be exploited if loading from an untrusted source.

# Query example
query = "Where is Sheryl from?"
retrieved_docs = vector_store.similarity_search(query, k=5)  # Retrieve top-3 matches

# Display retrieved results
for doc in retrieved_docs:
    print("Retrieved Document:")
    print(doc.page_content)
    print("Metadata:", doc.metadata)
    print("-" * 50)


Retrieved Document:
Customer ID: DD37Cf93aecA6Dc
        Name: Sheryl Baxter
        Company: Rasmussen Group
        City: East Leonard
        Country: Chile
        Phone 1: 229.077.5154
        Phone 2: 397.884.0519x718
        Email: zunigavanessa@smith.info
        Subscription Date: 2020-08-24
        Website: http://www.stephenson.com/
Metadata: {'row_number': 1, 'customer_id': 'DD37Cf93aecA6Dc', 'city': 'East Leonard', 'country': 'Chile'}
--------------------------------------------------
Retrieved Document:
Customer ID: C2dE4dEEc489ae0
        Name: Sheryl Meyers
        Company: Browning-Simon
        City: Robersonstad
        Country: Cyprus
        Phone 1: 854-138-4911x5772
        Phone 2: +1-448-910-2276x729
        Email: mariokhan@ryan-pope.org
        Subscription Date: 2020-01-13
        Website: https://www.bullock.net/
Metadata: {'row_number': 9, 'customer_id': 'C2dE4dEEc489ae0', 'city': 'Robersonstad', 'country': 'Cyprus'}
---------------------------------------