### **Save Outlook Email Data all in one JSON File**

In [2]:
import win32com.client
import json
import re
import unicodedata
from bs4 import BeautifulSoup

# Function to clean email body
def clean_body(body):
    # Check if the body contains HTML
    if "<html" in body.lower():
        soup = BeautifulSoup(body, "html.parser")
        text = soup.get_text(separator=" ")  # Extract visible text
    else:
        text = body  # Plain text email

    # Remove excessive whitespace, newlines, and tabs
    text = re.sub(r"[\r\n\t]+", " ", text).strip()

    # Remove links (URLs)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # Normalize Unicode to closest English equivalent
    text = unicodedata.normalize("NFKD", text)

    # Remove non-ASCII characters (keep only English characters)
    text = text.encode("ascii", "ignore").decode("ascii")

    return text

# Connect to Outlook
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

# Select the Inbox folder
inbox = outlook.GetDefaultFolder(6)  # 6 = Inbox

# Fetch emails and sort them by received time (most recent first)
messages = inbox.Items
messages.Sort("[ReceivedTime]", True)  # Sort by newest first

# List to store email data
email_data = []

# Iterate through the first 10 emails
for _, message in enumerate(messages):
    try:
        email_info = {
            "Subject": clean_body(message.Subject),  # Normalize subject
            "Sender": clean_body(message.SenderName),  # Normalize sender name
            "Received": str(message.ReceivedTime),
            "Body": clean_body(message.Body)  # Clean, normalize, and remove links
        }
        email_data.append(email_info)
    except Exception as e:
        print("Error processing email:", e)

# Save to JSON file
json_filename = "outlook_emails.json"
with open(json_filename, "w", encoding="utf-8") as json_file:
    json.dump(email_data, json_file, indent=4)

print(f"Saved the first {len(email_data)} emails to {json_filename}")


Saved the first 8519 emails to outlook_emails.json


### **Save Outlook Email Data across various JSON Files**

In [1]:
import win32com.client
import json
import re
import unicodedata
import os
from bs4 import BeautifulSoup

# Create the directory if it doesn't exist
output_dir = "jsonData"
os.makedirs(output_dir, exist_ok=True)

# Function to clean email body
def clean_body(body):
    if not body:
        return ""  # Handle empty bodies safely

    # Check if the body contains HTML
    if "<html" in body.lower():
        soup = BeautifulSoup(body, "html.parser")
        text = soup.get_text(separator=" ")  # Extract visible text
    else:
        text = body  # Plain text email

    # Remove excessive whitespace, newlines, and tabs
    text = re.sub(r"[\r\n\t]+", " ", text).strip()
    
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)

    # Remove links (URLs)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # Normalize Unicode to closest English equivalent
    text = unicodedata.normalize("NFKD", text)

    # Remove non-ASCII characters (keep only English characters)
    text = text.encode("ascii", "ignore").decode("ascii")

    # Remove `<` symbol
    text = text.replace("<", "")

    return text

# Connect to Outlook
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

# Select the Inbox folder
inbox = outlook.GetDefaultFolder(6)  # 6 = Inbox

# Get ALL emails (without restrictions)
messages = inbox.Items
messages.IncludeRecurrences = True  

# Use GetFirst() and GetNext() to manually iterate through ALL emails
email_list = []
message = messages.GetFirst()
while message:
    email_list.append(message)
    message = messages.GetNext()  # Move to the next email

# Sort manually by received time (newest first)
email_list.sort(key=lambda m: m.ReceivedTime, reverse=True)

# Process emails
for i, message in enumerate(email_list):
    try:
        email_info = {
            "Subject": clean_body(message.Subject),
            "Sender": clean_body(message.SenderName),
            "Received": str(message.ReceivedTime),
            "Body": clean_body(message.Body)
        }

        # Save each email as a separate JSON file
        json_filename = os.path.join(output_dir, f"email_{i+1}.json")
        with open(json_filename, "w", encoding="utf-8") as json_file:
            json.dump(email_info, json_file, indent=4)

    except Exception as e:
        print(f"Error processing email {i+1}: {e}")

print(f"✅ Successfully saved {len(email_list)} emails to the '{output_dir}' folder.")


✅ Successfully saved 8519 emails to the 'jsonData' folder.


### **Embed the Data into 768-dim vectors (Gmail)**

In [1]:
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import os

# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the sentence transformer model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)

# Load the CSV file
csv_file = "C:\\Users\\kingd\\GitHub\\School\\Data\\gmail_emails.csv"  # Update the path
df = pd.read_csv(csv_file)

# Combine relevant columns into a single text input for embeddings
df["combined_text"] = df["Subject"].astype(str) + " " + df["Sender"].astype(str) + " " + df["Date"].astype(str) + " " + df["Body"].astype(str)

# Create directory to save files
output_dir = "singleEmbedding/"
os.makedirs(output_dir, exist_ok=True)

# Generate embeddings for all emails at once
all_embeddings = model.encode(df["combined_text"].tolist(), batch_size=96, show_progress_bar=True)

# Save all embeddings as one NumPy file
npy_file_path = os.path.join(output_dir, "Gmail_emails_embeddings.npy")
np.save(npy_file_path, all_embeddings)

# Add embeddings to DataFrame and save to one CSV file
df["embedding"] = all_embeddings.tolist()
csv_file_path = os.path.join(output_dir, "Gmail_emails_embeddings.csv")
df.to_csv(csv_file_path, index=False)

print("All embeddings generated and saved successfully in one file!")


  from tqdm.autonotebook import tqdm, trange





Batches:   0%|          | 0/366 [00:00<?, ?it/s]

All embeddings generated and saved successfully in one file!


In [1]:
import pandas as pd
df = pd.read_csv(r"C:\Users\kingd\GitHub\School\SingleEmbedding\Gmail_emails_embeddings.csv")

df.head()

Unnamed: 0,Subject,Sender,Date,Body,combined_text,embedding
0,Whole Foods gets a union,LinkedIn News,2025-01-28 22:15:36+00:00,---------------------------------------- This ...,Whole Foods gets a union LinkedIn News 2025-01...,"[-0.021556325256824493, 0.11548731476068497, -..."
1,Find Textbook Answers by REAL Experts,"""Numerade""",2025-01-28 22:24:46+00:00,Numerade's expert educators answer your most d...,"Find Textbook Answers by REAL Experts ""Numerad...","[-0.025581959635019302, -0.06178336590528488, ..."
2,Hear that sizzle Citrus Lime Shrimp is BACK,QDOBA Mexican Eats,2025-01-29 15:16:04+00:00,Hear that sizzle? Citrus Lime Shrimp is BACK. ...,Hear that sizzle Citrus Lime Shrimp is BACK QD...,"[0.010889058001339436, 0.015130116604268551, -..."
3,Hi Nick from LinkedIn Premium,Curtis Coatman,2025-01-29 07:23:30-08:00,LinkedIn Your personal LinkedIn Customer Succe...,Hi Nick from LinkedIn Premium Curtis Coatman 2...,"[0.008420374244451523, 0.03241467848420143, -0..."
4,Youd be a great fit for this Data Scientist ro...,Glassdoor Jobs,2025-01-27 06:22:14+00:00,Apply Now! ...,Youd be a great fit for this Data Scientist ro...,"[-0.0072630057111382484, 0.08883096277713776, ..."


### **Zip Embedding Files (Optional)**

In [12]:
import shutil

# Define the folder to zip and the output zip file name
folder_to_zip = "singleEmbedding"
output_zip = "SingleEmbeddingNick.zip"

# Create a zip archive of the folder
shutil.make_archive(folder_to_zip, 'zip', folder_to_zip)

print(f"Zipped {folder_to_zip} into {output_zip} successfully!")


Zipped singleEmbedding into SingleEmbeddingNick.zip successfully!


In [1]:
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the sentence transformer model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)

# Function to compute cosine similarity between message embeddings
def test_similarity(model, messages):
    embeddings = model.encode(messages)
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix

# Example messages from the emails to see what the cosine similarity is between similar messages
messages = [
    "Reminder: WOWZA, due to demand we are bringing in another photographer Penn State University 2025-02-14",
    "Heads Up: AMAZING NEWS! Due to high demand, we're bringing in an additional photographer at Penn State University! 2025-02-14",
    "Reminder: WOWZA! Due to demand, we are bringing in another photographer—Penn State University, 2025-02-14.",
    "Notice: Incredible! Due to high interest, we’re adding a second photographer at Penn State University on 2025-02-14."
]

# Compute similarity
similarity_matrix = test_similarity(model, messages)

# Print results
for i in range(len(messages)):
    for j in range(i + 1, len(messages)):
        print(f"Similarity between '{messages[i]}' and '{messages[j]}': {similarity_matrix[i, j]:.4f}")

print("Cosine similarity computation complete!")


  from tqdm.autonotebook import tqdm, trange



Similarity between 'Reminder: WOWZA, due to demand we are bringing in another photographer Penn State University 2025-02-14' and 'Heads Up: AMAZING NEWS! Due to high demand, we're bringing in an additional photographer at Penn State University! 2025-02-14': 0.9118
Similarity between 'Reminder: WOWZA, due to demand we are bringing in another photographer Penn State University 2025-02-14' and 'Reminder: WOWZA! Due to demand, we are bringing in another photographer—Penn State University, 2025-02-14.': 0.9691
Similarity between 'Reminder: WOWZA, due to demand we are bringing in another photographer Penn State University 2025-02-14' and 'Notice: Incredible! Due to high interest, we’re adding a second photographer at Penn State University on 2025-02-14.': 0.8657
Similarity between 'Heads Up: AMAZING NEWS! Due to high demand, we're bringing in an additional photographer at Penn State University! 2025-02-14' and 'Reminder: WOWZA! Due to demand, we are bringing in another photographer—Penn Sta

### **Plug into a FAISS Database**

In [None]:
import faiss
import pandas as pd
import numpy as np
import ast
import pickle

# Load CSV File
csv_path = r"C:\Users\kingd\GitHub\School\SingleEmbedding\Gmail_emails_embeddings.csv"
df = pd.read_csv(csv_path)

df["embedding"] = df["embedding"].apply(lambda x: np.array(ast.literal_eval(x), dtype=np.float32))


embeddings = np.vstack(df["embedding"].values)

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

metadata = {
    i: {
        "Subject": row["Subject"],
        "Sender": row["Sender"],
        "Date": row["Date"],
        "Body": row["Body"],
        "Combined_Text": row["combined_text"]
    }
    for i, row in df.iterrows()
}

# Save FAISS index
faiss_index_path = r"C:\Users\kingd\GitHub\School\SingleEmbedding\faiss_index.idx"
faiss.write_index(index, faiss_index_path)

# Save metadata
metadata_path = r"C:\Users\kingd\GitHub\School\SingleEmbedding\faiss_metadata.pkl"
with open(metadata_path, "wb") as f:
    pickle.dump(metadata, f)

print("FAISS index and metadata saved successfully!")


#### **Print Metadata for Each FAISS Index ID**

In [None]:
metadata_path = r"C:\Users\kingd\GitHub\School\SingleEmbedding\faiss_metadata.pkl"

with open(metadata_path, "rb") as f:
    metadata = pickle.load(f)

# Print metadata for each FAISS index ID
for i, data in metadata.items():
    print(f"Index {i}: {data}")

TypeError: 'dict_items' object is not subscriptable

### **Load Saved FAISS Database**

In [None]:
# Save faiss index
faiss.write_index(index, r"C:\Users\kingd\GitHub\School\SingleEmbedding\faiss_index.idx")
print("Index saved successfully!")

Index saved successfully!


In [1]:
import faiss
import pandas as pd
import numpy as np
import ast
import pickle

index = faiss.read_index(r"C:\Users\kingd\GitHub\School\SingleEmbedding\faiss_index.idx")
print("Index loaded successfully!")


Index loaded successfully!


In [5]:
query_vector = np.random.random((1, index.d)).astype('float32')

D, I = index.search(query_vector, 5)

print("Distances:", D)
print("Indices of nearest neighbors:", I)

Distances: [[244.34048 244.40834 244.44711 244.48041 244.48607]]
Indices of nearest neighbors: [[15657 11366 20955  5162 12264]]


In [7]:
print(index.is_trained)

True


### **Use LangChain for Retrieval**

In [None]:
import faiss
import pickle
from langchain.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.schema import Document

# Paths to FAISS index and metadata
faiss_index_path = r"C:\Users\kingd\GitHub\School\SingleEmbedding\faiss_index.idx"
metadata_path = r"C:\Users\kingd\GitHub\School\SingleEmbedding\faiss_metadata.pkl"

# Load FAISS index
index = faiss.read_index(faiss_index_path)
print("✅ FAISS index loaded successfully!")

# Load metadata
with open(metadata_path, "rb") as f:
    metadata = pickle.load(f)

# Convert metadata into LangChain Documents
docstore = InMemoryDocstore({
    i: Document(page_content=meta["Combined_Text"], metadata=meta)
    for i, meta in metadata.items()
})

# Create index-to-docstore ID mapping
index_to_docstore_id = {i: i for i in range(len(metadata))}

# Wrap FAISS index in LangChain
vector_store = FAISS(
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=None  
)

print("✅ LangChain FAISS vector store is ready!")


✅ FAISS index loaded successfully!


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


✅ LangChain FAISS vector store is ready!


In [None]:
import numpy as np

# Generate a random query vector (same dimension as FAISS index)
query_vector = np.random.random((index.d,)).astype('float32')

# Retrieve top 5 results
docs = vector_store.similarity_search_by_vector(query_vector, k=20)

# Print results
for i, doc in enumerate(docs):
    print(f"🔹 Result {i+1}:")
    print(f"  Subject: {doc.metadata['Subject']}")
    print(f"  Sender: {doc.metadata['Sender']}")
    print(f"  Date: {doc.metadata['Date']}")
    print(f"  Body: {doc.metadata['Body']}...")


In [8]:
if docs:
    closest_doc = docs[0]  # Closest vector
    print("\n🔹 Closest Match Metadata:")
    print(f"  Subject: {closest_doc.metadata['Subject']}")
    print(f"  Sender: {closest_doc.metadata['Sender']}")
    print(f"  Date: {closest_doc.metadata['Date']}")
    print(f"  Body: {closest_doc.metadata['Body'][:100]}...")


🔹 Closest Match Metadata:
  Subject: 100 Years of Community and Tradition
  Sender: "SMU Undergraduate Admission"
  Date: 2019-03-27 16:05:52-04:00
  Body: Get involved, stay active and make memories of our beautiful Hilltop. Join the Mustang Nation Confir...
