In [None]:
# Download the large English spaCy pipeline that is loaded later with spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg

In [None]:
!pip install faiss-cpu

In [None]:
import numpy as np
import pandas as pd
import re
import torch
import spacy
import transformers
from sentence_transformers import SentenceTransformer
import faiss

In [None]:
# Load the large English spaCy pipeline.
nlp = spacy.load("en_core_web_lg")

# Sample text used by both NER demos (spaCy here, transformers further down).
sentence = "Hello dear Jack I want to know did you buy an Iphone, which is an Apple product, or you have you old phone"
doc = nlp(sentence)

# `doc.ents` holds the recognised entity spans; show each span with its predicted label.
for ent in doc.ents:
    print(ent.text, ent.label_)

Jack PERSON
Iphone ORG
Apple ORG


In [None]:
# Token-classification (NER) pipeline; no model is named, so the library's default NER checkpoint is used.
# `aggregation_strategy="simple"` replaces the deprecated `grouped_entities=True` flag:
# sub-word tokens are merged into whole entities exposing `word` and `entity_group`.
NER = transformers.pipeline('ner', aggregation_strategy = "simple")

In [None]:
# Run the transformers NER pipeline on the same sentence and list every grouped entity.
for entity in NER(sentence):
    word, group = entity['word'], entity['entity_group']
    print(word, ": ", group)

Jack :  PER
I :  MISC
Apple :  ORG


## SentenceTransformers

Approach | Description | Goal
---------|-------------|-----
AutoTokenizer + AutoModel <br> → last_hidden_state.mean(dim=1) | You directly use a pretrained Transformer (e.g., BERT, RoBERTa) <br>to produce embeddings by averaging token representations. | Creates contextual embeddings without <br> extra training for sentence-level tasks.
SentenceTransformer | A model trained specifically to generate semantically meaningful <br> sentence embeddings using contrastive or triplet objectives. | Creates sentence-level semantic embeddings <br> optimized for similarity, clustering, retrieval.


> So if you want to capture the semantics of a sentence, `sentence_transformers.SentenceTransformer` is far better than using `AutoTokenizer --> AutoModel --> last_hidden_state.mean(dim = 1)`

<br>

> `SentenceTransformer` models: `all-MiniLM-L6-v2`, `all-distilroberta-v1`, etc.

<br>

Sentence embeddings convert entire sentences or documents into vectors that capture semantic meaning:
- Semantic search
- Clustering documents
- Duplicate detection
- Retrieval-augmented generation (RAG)

<br>

multilingual models:
- `paraphrase-multilingual-MiniLM-L12-v2` (lighter)
- `paraphrase-multilingual-mpnet-base-v2`


## Search models:

```py
from huggingface_hub import list_models

# Find models for specific task
models = list_models(
    filter="text-classification",  # or "token-classification", "question-answering"
    library="transformers",
    limit=10
)


# Filter by SentenceTransformers library
sentence_models = list_models(
    filter = "sentence-similarity",  # Common task for sentence transformers
    # filter="sentence-transformers", # Search by specific architecture 
    library = "sentence-transformers",
    limit=10
)


for model in models:
    print(f"{model.model_id} | Downloads: {model.downloads}")

# Search by architecture
bert_models = list_models(filter="bert", library="transformers", limit=5)
for model in bert_models:
    print(model.model_id)

# Search by model name
models = list_models(
    search="bert",  # Search in model names
    library="transformers"
)

# Filter by tags
models = list_models(
    filter=("text-generation", "gpt2"),
    library="transformers"
)
```

- library="transformers"
- library="sentence-transformers"
- library="tokenizers"
- library="datasets"
- library="diffusers" 
- library="safetensors"
- library="all"

<br>

# Most Common Tasks:
- filter="text-classification"
- filter="token-classification"           # NER
- filter="text-generation"
- filter="question-answering" 
- filter="sentence-similarity"
- filter="fill-mask"
- filter="summarization"
- filter="translation"
- filter="text2text-generation"
- filter="image-classification"
- filter="automatic-speech-recognition"
- filter="audio-classification"

# Less Common Tasks:
- filter="table-question-answering"
- filter="document-question-answering"
- filter="visual-question-answering"
- filter="image-segmentation"
- filter="object-detection"
- filter="text-to-speech"
- filter="text-to-audio"
- filter="audio-to-audio"
- filter="zero-shot-classification"
- filter="zero-shot-image-classification"


In [None]:
# Sentence-embedding model used for the similarity demos below.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Toy corpus: two sentences about books, three about vehicles.
sentences = [
    "I like read book.",
    "books are amazing, since they improve your knowledge.",
    "He is a good driver, and know how to repaire his car.",
    "my brother bought a motorcycle last year, and crash into a bookstore first day",
    "most of the men love to spend time with their car.",
]

# One embedding row per sentence.
embeddings = model.encode(sentences)
print(embeddings.shape)

(5, 384)


In [None]:
# Pairwise similarity of every sentence with every other sentence (the diagonal is self-similarity).
model.similarity(embeddings, embeddings)

tensor([[1.0000, 0.6411, 0.1553, 0.2553, 0.1783],
        [0.6411, 1.0000, 0.1675, 0.2908, 0.0510],
        [0.1553, 0.1675, 1.0000, 0.3187, 0.3545],
        [0.2553, 0.2908, 0.3187, 1.0000, 0.2843],
        [0.1783, 0.0510, 0.3545, 0.2843, 1.0000]])

In [None]:
# Now compare a new sentence against the previously embedded sentences.
sentence = ["Today is a good day to drive fast"]
sentence_embedding = model.encode(sentence)

# The similarity result has a single row (one query); flatten it to a 1-D score vector.
similarity_row = model.similarity(sentence_embedding, embeddings)
score = similarity_row.reshape(-1,).numpy()

# Indices ordered from the highest score to the lowest.
score_arg = np.argsort(score)[::-1]

# Show the two best matches together with their scores.
for best in score_arg[:2]:
    print(sentences[best], ": ", score[best])

He is a good driver, and know how to repaire his car. :  0.34426796
most of the men love to spend time with their car. :  0.31730774


## FAISS

Facebook AI Similarity Search allows fast nearest neighbor search on millions of embeddings.
<br>

Use cases:
- Recommendation systems
- Semantic search
- Vector databases (e.g., Pinecone, Chroma, Weaviate)

<br>

> [Reference](https://www.pinecone.io/learn/series/faiss/faiss-tutorial/)

### IndexFlatL2
It measures the L2 (Euclidean) distance between the query vector and every vector loaded into the index.
> It’s simple, very accurate, but not too fast.

<br>

<img width = 400 src = "https://www.pinecone.io/_next/image/?url=https%3A%2F%2Fcdn.sanity.io%2Fimages%2Fvr8gru94%2Fproduction%2Fea951a4be3acf9d379cc6f922be1468b37b7f9e5-1280x720.png&w=3840&q=75">

In [None]:
import faiss


# Candidate checkpoints; only the first one is used in this cell.
checkpoint1 = 'all-MiniLM-L6-v2'
checkpoint2 = 'bert-base-nli-mean-tokens'

model = SentenceTransformer(checkpoint1)

sentences = [
    "I like read book.",
    "books are amazing, since they improve your knowledge.",
    "He is a good driver, and know how to repaire his car.",
    "my brother bought a motorcycle last year, and crash into a bookstore first day",
    "most of the men love to spend time with their car.",
]

# Embed the corpus.
sentence_embeddings = model.encode(sentences)
print("Embedding shape: ", sentence_embeddings.shape)

# The index needs the embedding dimensionality (second axis of the embedding matrix).
d = sentence_embeddings.shape[1]
index = faiss.IndexFlatL2(d)

# Report whether the index needs training before vectors can be added.
print(f"Index is trained: {index.is_trained}")

# Store the embeddings in the index and report how many vectors it now holds.
index.add(sentence_embeddings)
print("total number of sentences",index.ntotal)

Embedding shape:  (5, 384)
Index is trained: True
total number of sentences 5


In [None]:
# Number of nearest neighbours to retrieve.
k = 4
# Embed the query text with the same model that produced the indexed vectors.
querry = model.encode(["Someone deriving fast and crash his car into a wall"])

# D holds the distances, I the ids of the matching indexed vectors (one row per query).
D, I = index.search(querry, k)
print(I)

[[2 3 4 0]]


In [None]:
# `index.search` already returns neighbour ids ordered from the smallest distance to the largest,
# so the first entries of I[0] are the best matches. Sorting the ids themselves (as the
# previous version did) ranks by position in `sentences`, not by similarity.
nearest_ids = I[0]

for idx in nearest_ids[:2]:
    print(sentences[idx])

He is a good driver, and know how to repaire his car.
books are amazing, since they improve your knowledge.


In [None]:
# Recover the stored vectors of the retrieved neighbours from the FAISS index.
#   d: embedding size
#   k: number of neighbours
vecs = np.zeros((k, d))

# Row `row` of `vecs` receives the vector stored under the row-th id returned in I[0].
for row, vec_id in enumerate(I[0].tolist()):
    vecs[row, :] = index.reconstruct(vec_id)


### Partitioning The Index

Using this method, we take a query vector `xq`, identify the cell it belongs to, and then run the `IndexFlatL2` search only against the vectors in that cell (and, optionally, in a few neighbouring cells).

> now we have to train our index on our data — which we must do before adding any data to the index.

<br>

<img width=400 src="https://www.pinecone.io/_next/image/?url=https%3A%2F%2Fcdn.sanity.io%2Fimages%2Fvr8gru94%2Fproduction%2Fca1ed9b80fd0788cee513ef75c1b8bd8daad8571-1400x748.png&w=3840&q=75">

In [None]:
nlist = 2  # Number of partitions (Voronoi cells)
quantizer = faiss.IndexFlatL2(d)  # flat L2 index used to assign vectors to cells
index = faiss.IndexIVFFlat(quantizer, d, nlist)

# An IVF index must be trained (cells learned from the data) before vectors are added.
print(f"Index is trained: {index.is_trained}")
index.train(sentence_embeddings)
print(f"Index is trained: {index.is_trained}")

index.add(sentence_embeddings)
print("total number of sentences",index.ntotal)  # number of embeddings indexed
print()


# Number of nearby cells to search; more cells means higher accuracy.
# There are only `nlist` cells, so never ask for more than that.
index.nprobe = min(10, nlist)
D, I = index.search(querry, k)  # search


# I[0] is already ordered from the nearest neighbour to the farthest; use it directly.
# Sorting the ids themselves would rank by position in `sentences`, not by distance.
nearest_ids = I[0]

for idx in nearest_ids[:2]:
    print(sentences[idx])

# An IVF index keeps no direct mapping between a vector id and its storage position,
# so build that mapping before calling `reconstruct`.
index.make_direct_map()

vecs = np.zeros((k, d))
for i, val in enumerate(I[0].tolist()):
    vecs[i, :] = index.reconstruct(val)

Index is trained: False
Index is trained: True
total number of sentences 5

He is a good driver, and know how to repaire his car.
books are amazing, since they improve your knowledge.


### Quantization

```python
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)

index.train(sentence_embeddings)
```

## Similarity search project

Download the reviews from the [Kaggle dataset](https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews):

```sh
kaggle datasets download arhamrumi/amazon-product-reviews
```

In [None]:
!mkdir ~/.kaggle
!cp ./kaggle.json  ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download arhamrumi/amazon-product-reviews
!7z x /content/amazon-product-reviews.zip

In [None]:
# Show every column when displaying frames.
pd.options.display.max_columns = None

# Load the extracted reviews file and preview the first rows.
df = pd.read_csv("/content/Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Review bodies as a NumPy array (one entry per row of the frame).
reviews = df["Text"].to_numpy()
reviews.shape

(568454,)

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Prefer the GPU when one is visible to torch.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Available devie: ", device)

checkpoint = "all-MiniLM-L6-v2"
model = SentenceTransformer(checkpoint).to(device)

# Inference only: switch to eval mode and disable gradient tracking.
model.eval()
with torch.no_grad():
    # Only the first 10,000 reviews are embedded.
    embedding = model.encode(
        reviews[:10000],
        batch_size = 32,
        show_progress_bar = True,
    )
print(f"Embedding shape: {embedding.shape}")

Available devie:  cuda


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Embedding shape: (10000, 384)


In [None]:
# Embed the query text and score it against every embedded review.
querry = model.encode(
    ["I love this device, it has a greate battery, nice design, and powerful cpu."]
)
similarities = model.similarity(querry, embedding)

In [None]:
# `similarities` has one row per query, i.e. shape (1, n_reviews). Flatten it first:
# argsort on the 2-D array followed by `[::-1]` only reverses the single-row axis, which
# made the loop run once and print a whole array of reviews.
scores = similarities.numpy().ravel()

# Review indices ordered from the most similar to the least similar.
idx = np.argsort(scores)[::-1]

for i in idx[:2]:
    print(reviews[i])

['The individual Pocky sticks are not separately packaged and melt and stick together while being shipped.'
 'I bought these and yes they do come in plastic bags with a label on each but also included was a five sided box that you can fold together that has a picture of each flavor, one on each side. It makes a cute little colorful box, nice to give as a gift if you put it together.'
 'These Piquillos are packaged in a thin plastic package, though I thought from the picture that they were packed in a roll-top tin (like sardines are packed in).The peppers that survived the shipping were very good, lightly roasted and with good tangy flavor. However, one of the packages was damaged in shipment and leaked red oil all over the rest of the packs, making a huge mess.'
 ...
 "Great audio, until a few months in it just decides to stop working. Common problem: google it and you'll see. Cheapest construction of any product every designed in the entire world. Feels like a fast food toy.<br /><br 

In [None]:
# Exact L2 search over the review embeddings.
d = embedding.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embedding)

D, I = index.search(querry, 5)

# I[0] is already ordered from the nearest review to the farthest. Sorting the ids
# themselves (previous version) picked reviews by their position in the dataset instead.
idx = I[0]
for i in idx[:2]:
    print(reviews[i])
    print()

Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".

Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.



In [None]:
# Partitioned (IVF) search over the review embeddings.
n = 50  # number of partitions (Voronoi cells)
d = embedding.shape[1]
quantize = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantize, d, n)

# The IVF index has to be trained before vectors can be added.
index.train(embedding)
index.add(embedding)

# Search the 10 cells closest to the query.
index.nprobe = 10
D, I = index.search(querry, 5)

# I[0] is already ordered from the nearest review to the farthest. Sorting the ids
# themselves (previous version) picked reviews by their position in the dataset instead.
idx = I[0]
for i in idx[:2]:
    print(reviews[i])
    print()

Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".

Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.



### ChromaDB

In [None]:
import chromadb

chroma_client = chromadb.Client()

# get_or_create_collection keeps the cell re-runnable: create_collection fails when a
# collection named "reviews" already exists in the client.
collection = chroma_client.get_or_create_collection(
    name = "reviews",
    metadata = {"hnsw:space": "cosine"}  # <--- sets the search method "cosine" "l2" "ip"
)

# upsert inserts new ids and overwrites existing ones, so re-running the cell does not
# trip over the ids written by a previous run.
collection.upsert(
    documents = sentences,
    embeddings = sentence_embeddings.tolist(),
    ids=[f"id{i}" for i in range(len(sentences))]
)

# Embed the query with the same model and fetch the two closest documents.
query = "I love this device, it has a greate battery, nice design, and powerful cpu."
query_emb = model.encode([query]).tolist()

results = collection.query(query_embeddings = query_emb, n_results = 2)
print(results["documents"])

## Classification project

In [None]:
# convert to only two lables
# Collapse the star rating into two classes: 1 when Score is above 3, otherwise 0.
is_positive = df["Score"].values > 3
label = is_positive.astype(int)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the first 10,000 embedded reviews for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    embedding[:10000],
    label[:10000],
    test_size = 0.2,
    random_state = 42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Seed the forest (same seed as the train/test split) so the report is reproducible
# from one run to the next.
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out reviews.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.20      0.33       492
           1       0.79      0.99      0.88      1508

    accuracy                           0.80      2000
   macro avg       0.84      0.60      0.60      2000
weighted avg       0.82      0.80      0.74      2000



## FAISS in LangChain

```python
# Similarity search with score
docs_scores = vector_store.similarity_search_with_score("query", k=5)

# Max marginal relevance (diversity)
docs = vector_store.max_marginal_relevance_search("query", k=5)

# Similarity search by vector
docs = vector_store.similarity_search_by_vector(query_embedding, k=5)
```

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_community.docstore.in_memory import InMemoryDocstore

# Embedding function handed to the vector store.
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

# Manual alternative (kept for reference): build the FAISS index yourself and wire it
# into the vector store together with a docstore.
# index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
# vector_store = FAISS(
#     embedding_function=embeddings,
#     index=index,
#     docstore=InMemoryDocstore(),
#     index_to_docstore_id={},
# )

# `from_texts` creates the FAISS index under the hood and handles the
# text-to-embedding mapping automatically.
vector_store = FAISS.from_texts(
    texts = reviews,
    embedding = embeddings,
    # metadatas=[{"source": f"doc_{i}"} for i in range(len(reviews))]  # Optional metadata
)

# The search returns documents carrying their content (and metadata, if any was added).
docs = vector_store.similarity_search("your query", k = 5)
for hit in docs:
    print(hit.page_content)
    # print(hit.metadata)  # If you added metadata