In [None]:
import pandas as pd
import Stemmer
from llama_index.core import VectorStoreIndex, Document, StorageContext
from llama_index.core.settings import Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.storage.docstore import SimpleDocumentStore


In [19]:
# Read the merged e-mail dataset and print a quick overview of it:
# the first rows, the (rows, columns) shape, and the column names.
df = pd.read_csv("../datasets/merged_emails.csv")
for overview in (df.head(), df.shape, df.columns):
    print(overview)

                                              sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             subject  \
0                          Never agree to be a loser   
1  

In [16]:
# Disabled: uncomment the two lines below to only use the first 1000 rows.
# df = df.iloc[:1000]
# print(df.shape)

In [25]:
# Initialize the HuggingFace embedding model and register it on the global
# llama_index Settings object.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed_model


def _label_or_default(value, default=-1):
    """Return ``value`` as an int, or ``default`` when it is missing (None/NaN).

    ``int(float("nan"))`` raises ValueError, so a single e-mail without a label
    would otherwise abort the whole conversion loop below.
    """
    return default if pd.isna(value) else int(value)


# Convert e-mails to documents. Iterating over the columns directly avoids
# building one Series per row, which is what DataFrame.iterrows() does.
# A frame without a "label" column falls back to -1 for every e-mail.
label_values = df["label"] if "label" in df.columns else [-1] * len(df)

documents = []
for email_id, body, label in zip(df.index, df["body"], label_values):
    # Skip rows whose body is missing (NaN is a float, not a str) or empty.
    if not isinstance(body, str) or not body:
        continue
    documents.append(
        Document(
            text=body,
            # email_id is the DataFrame index label of the source row.
            metadata={"email_id": email_id, "label": _label_or_default(label)},
        )
    )

# Create a docstore and a storage context that wraps it.
# NOTE(review): nothing in this cell adds `nodes` to the docstore, so it is
# still empty here -- confirm whether a later cell is expected to fill it.
docstore = SimpleDocumentStore()
storage_context = StorageContext.from_defaults(docstore=docstore)

# Split the documents into nodes. chunk_size is spelled out explicitly; the
# earlier note here said 1024 is also the library default, so this does not
# actually enlarge the chunks -- raise the value if metadata needs more room.
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)

# We can pass in the index, docstore, or list of nodes to create the retriever
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=2,  # return the two best-scoring nodes per query
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

# Persist the retriever so it can be reloaded without re-indexing.
# NOTE(review): the reload cell reads from settings.RETRIEVER_PATH -- confirm
# that it points at this same directory.
bm25_retriever.persist("../bm25_retriever")


Finding newlines for mmindex:   0%|          | 0.00/419M [00:00<?, ?B/s]

In [18]:
# Smoke-test the freshly built retriever with a typical spam query and
# print text, metadata and a separator line for every hit.
retrieved_nodes = bm25_retriever.retrieve("Diet pills")
print("Retrieved nodes:")
for node in retrieved_nodes:
    print(node.get_text(), node.metadata, "-----", sep="\n")

Retrieved nodes:
A Diet Pill that Really Works!It’s called Hoodia Zombieii. It is a simple appetite suppressant that has been used for hundreds of years and we’ve got it here just for you.Lose weight FAST - click here!
{'email_id': 17132, 'label': 1}
-----
A Diet Pill that Really Works!ItвЂ™s called Hoodia Zombieii. It is a simple appetite suppressant that has been used for hundreds of years and weвЂ™ve got it here just for you.Lose weight FAST - click here!
{'email_id': 17644, 'label': 1}
-----


In [None]:

from typing import List


from llama_index.core.schema import NodeWithScore

from settings import RETRIEVER_PATH


# Rebuild the BM25 retriever from the directory it was persisted to.
loaded_bm25_retriever = BM25Retriever.from_persist_dir(RETRIEVER_PATH)

# Query the reloaded retriever; for every hit print its score, a one-line
# preview (first 100 characters, newlines replaced by spaces), its metadata
# and a separator line.
retrieved_nodes: List[NodeWithScore] = loaded_bm25_retriever.retrieve("Hoodia Zombieii")
print("Retrieved nodes:")
for node in retrieved_nodes:
    preview = node.get_text()[:100].replace("\n", " ")
    print(node.get_score(), preview, node.metadata, "-----", sep="\n")


Retrieved nodes:
52.11211013793945
A Diet Pill that Really Works!It’s called Hoodia Zombieii. It is a simple appetite suppressant that 
{'email_id': 111454, 'label': 1}
-----
51.18099594116211
A Diet Pill that Really Works!ItÐ²Ð‚â„¢s called Hoodia Zombieii. It is a simple appetite suppressant
{'email_id': 115793, 'label': 1}
-----
