In [5]:
# BELOW IS A SMALL SCRIPT THAT I WROTE TO RANK EMAILS IN AN INBOX GIVEN A USER QUERY [USING BM25 + SEMANTIC SIMILARITY].
# HERE THE INBOX CONSISTS OF 100 MOCK EMAILS GENERATED IN PYTHON.
# RUNNING THIS CODE SAVES ALL THE EMAILS OF THE FAKE INBOX TO A SINGLE .csv FILE (mock_emails.csv).
# THE EMAILS ARE THEN READ BACK FROM THE CSV, AND A MIX OF BM25 AND SEMANTIC SIMILARITY IS USED TO GENERATE SCORES AND RANK EMAILS.

import random
import string
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np

## CONSTANTS

# How many mock emails the generator loop produces.
NO_OF_EMAILS = 100

# Every category an email can be drawn from; each one is a key in the
# SENDERS, TEMPLATES and BODIES lookups below.
CATEGORIES = [
    "work",
    "promo",
    "newsletter",
    "social",
    "updates",
]

# Full set of user actions; simulation() samples from it for categories
# that have no dedicated rule.
ACTIONS = [
    "reply",
    "archive",
    "flag",
    "snooze",
]

# Candidate sender addresses per category.
SENDERS = {
    "work": [
        "alice@company.com",
        "bob@company.com",
        "manager@company.com",
    ],
    "promo": [
        "sale@shopnow.com",
        "deals@onlinesite.com",
    ],
    "newsletter": [
        "newsletter@substack.com",
        "daily@news.com",
    ],
    "social": [
        "noreply@linkedin.com",
        "alerts@facebook.com",
    ],
    "updates": [
        "system@github.com",
        "jira@company.com",
    ],
}

# Candidate subject lines per category.
TEMPLATES = {
    "work": [
        "Project update",
        "Weekly sync",
        "Urgent deadline",
        "Client feedback",
    ],
    "promo": [
        "50% off!",
        "Limited time deal",
        "Your coupon is waiting",
    ],
    "newsletter": [
        "Top stories today",
        "This week's digest",
        "Editor's pick",
    ],
    "social": [
        "You have new followers",
        "New message received",
    ],
    "updates": [
        "PR #42 merged",
        "Ticket assigned",
        "Build failed",
    ],
}

# One fixed body text per category.
BODIES = {
    "work": "Please find the update on the project...",
    "promo": "Don't miss out on this offer...",
    "newsletter": "Here are your news highlights...",
    "social": "Someone liked your post...",
    "updates": "System generated update. No action needed.",
}

## function that picks a random "sent at" timestamp for an email
def random_date(max_days_ago=90):
    """Return a random timestamp from the last ``max_days_ago`` days.

    The timestamp is the current local time shifted back by a whole number
    of days drawn uniformly from ``0..max_days_ago`` (both ends included),
    so the time-of-day part always equals the moment of the call.

    Args:
        max_days_ago: Largest number of days to look back. Defaults to 90.

    Returns:
        The timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Raises:
        ValueError: If ``max_days_ago`` is negative.
    """
    if max_days_ago < 0:
        raise ValueError("max_days_ago must be non-negative")

    days_ago = random.randint(0, max_days_ago)
    sent_at = datetime.now() - timedelta(days=days_ago)
    return sent_at.strftime("%Y-%m-%d %H:%M:%S")

## function that simulates how the user reacts to an email
def simulation(category):
    """Draw a random (label, action) pair for an email of ``category``.

    Each known category has its own distribution:

    * ``"work"``: weighted label and weighted action.
    * ``"promo"``: mostly ignored, always archived.
    * ``"newsletter"``: weighted label, archived or snoozed with equal odds.
    * ``"social"``: opened or ignored with equal weights, always archived.
    * anything else: label and action are both drawn uniformly.

    Args:
        category: Category name of the email.

    Returns:
        A two-item list ``[label, action]``.
    """
    def weighted(options, weights):
        # random.choices returns a list of samples; only the single draw is needed.
        return random.choices(options, weights)[0]

    # In every branch the label is drawn before the action.
    if category == "work":
        return [
            weighted(["opened", "starred", "ignored"], [0.6, 0.3, 0.1]),
            weighted(["reply", "flag", "archive"], [0.6, 0.3, 0.1]),
        ]
    if category == "promo":
        return [weighted(["ignored", "opened"], [0.8, 0.2]), "archive"]
    if category == "newsletter":
        return [
            weighted(["ignored", "opened"], [0.6, 0.4]),
            random.choice(["archive", "snooze"]),
        ]
    if category == "social":
        return [weighted(["opened", "ignored"], [0.5, 0.5]), "archive"]

    # Fallback for every category without a dedicated rule above.
    return [random.choice(["opened", "ignored"]), random.choice(ACTIONS)]

## build the mock inbox: one dictionary per simulated email, collected in `emails`
emails = []
for i in range(NO_OF_EMAILS):
    # Random draws happen in a fixed order: category, subject, sender,
    # timestamp, then the simulated user behaviour.
    category = random.choice(CATEGORIES)
    subject = random.choice(TEMPLATES[category])
    sender = random.choice(SENDERS[category])
    timestamp = random_date()
    body = BODIES[category]

    # simulation() hands back [label, action].
    results = simulation(category)
    label, action = results

    email = dict(
        id=i,
        category=category,
        subject=subject,
        sender=sender,
        timestamp=timestamp,
        body=body,
        label=label,
        action=action,
    )
    emails.append(email)

## store the generated inbox as a CSV file, then load it back for ranking
df = pd.DataFrame(emails)
df.to_csv("mock_emails.csv", index=False)
print(f"Generated {NO_OF_EMAILS} mock emails.")

# From here on the data comes from the saved file, not the in-memory list.
df = pd.read_csv("mock_emails.csv")

# One searchable text per email: subject, a single space, then the body.
documents = df["subject"].add(" ").add(df["body"]).tolist()

## tokenization: lowercase every document, then split it on whitespace
tokenized_docs = [lowered.split() for lowered in map(str.lower, documents)]

## build the BM25 index used for keyword retrieval
bm25 = BM25Okapi(tokenized_docs)

## function that returns the BM25 scores for a query
def get_bm25_scores(query):
    """Score the indexed documents against ``query`` with BM25.

    The query is lowercased and split on whitespace before it is handed to
    the module-level ``bm25`` index.

    Args:
        query: Free-text search query.

    Returns:
        Whatever ``bm25.get_scores`` returns for the tokenized query.
    """
    return bm25.get_scores(query.lower().split())

## sentence-embedding model, plus the embeddings of all documents, for semantic similarity
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Encode the documents once up front so each query only needs its own embedding.
doc_embeddings = model.encode(documents, convert_to_tensor=True)

def get_embedding_scores(query):
    """Return the cosine similarity between ``query`` and every document.

    The query is embedded with the module-level ``model`` and compared
    against the module-level ``doc_embeddings``.

    Args:
        query: Free-text search query.

    Returns:
        The first row of the ``cosine_similarity`` result, i.e. the
        similarities of the single query against all document embeddings.
    """
    # Encode straight to NumPy: scikit-learn operates on host arrays, and a
    # tensor living on a GPU cannot be converted to NumPy implicitly.
    query_embedding = model.encode([query], convert_to_numpy=True)

    # The document embeddings may have been created as a torch tensor
    # (possibly on a GPU); bring them to host memory before comparing.
    corpus_embeddings = doc_embeddings
    if hasattr(corpus_embeddings, "detach"):
        corpus_embeddings = corpus_embeddings.detach().cpu().numpy()

    return cosine_similarity(query_embedding, corpus_embeddings)[0]

def hybrid_rank(query, alpha=0.5, top_n=10):
    """Rank emails by a weighted blend of BM25 and embedding similarity.

    Both score vectors are min-max scaled independently, then combined as
    ``alpha * bm25 + (1 - alpha) * embedding``. The best ``top_n`` emails
    are printed and their positions returned.

    Args:
        query: Free-text search query.
        alpha: Weight of the scaled BM25 score; the embedding score gets
            ``1 - alpha``.
        top_n: Number of highest-scoring emails to print and return.

    Returns:
        NumPy array of indices into the score vectors, best first.
    """
    def rescale(raw_scores):
        # MinMaxScaler expects a 2-D column, so reshape in and flatten out.
        column = np.array(raw_scores).reshape(-1, 1)
        return MinMaxScaler().fit_transform(column).flatten()

    keyword_scores = get_bm25_scores(query)
    semantic_scores = get_embedding_scores(query)

    # Scale each signal on its own so neither dominates because of its raw range.
    bm25_scaled = rescale(keyword_scores)
    emb_scaled = rescale(semantic_scores)

    final_scores = alpha * bm25_scaled + (1 - alpha) * emb_scaled

    # argsort is ascending; reverse it and keep the first top_n entries.
    ranked_indices = np.argsort(final_scores)[::-1][:top_n]

    print(f"\n Top {top_n} Emails for Query: '{query}'\n")
    for idx in ranked_indices:
        score = final_scores[idx]
        subject = df.loc[idx, 'subject']
        sender = df.loc[idx, 'sender']
        label = df.loc[idx, 'label']
        print(f"[Score: {score:.3f}] : {subject} | From: {sender} | Label: {label}")

    return ranked_indices

# example usage: rank the mock inbox for a sample query with the default
# alpha and top_n. hybrid_rank prints the ranked emails itself; the call is
# left as a bare final expression so a notebook also echoes the returned
# array of indices.
query = "project update"
hybrid_rank(query)


Generated 100 mock emails.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


 Top 10 Emails for Query: 'project update'

[Score: 1.000] : Project update | From: manager@company.com | Label: opened
[Score: 1.000] : Project update | From: bob@company.com | Label: opened
[Score: 1.000] : Project update | From: manager@company.com | Label: starred
[Score: 1.000] : Project update | From: alice@company.com | Label: opened
[Score: 1.000] : Project update | From: alice@company.com | Label: starred
[Score: 0.491] : Urgent deadline | From: alice@company.com | Label: opened
[Score: 0.491] : Urgent deadline | From: alice@company.com | Label: starred
[Score: 0.491] : Urgent deadline | From: bob@company.com | Label: starred
[Score: 0.491] : Urgent deadline | From: manager@company.com | Label: opened
[Score: 0.491] : Urgent deadline | From: bob@company.com | Label: opened


array([87, 82,  2, 44, 12, 74,  0, 37, 54, 40])