In [41]:
!pip install nltk faiss-cpu llama-index-core==0.11.17 llama-index-vector-stores-faiss langchain-huggingface pinecone tran



In [42]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus read by remove_stopwords; the recorded run
# reports it as already up to date, so repeat runs do not re-download it.
nltk.download('stopwords')

# Load the raw message export into the notebook-wide DataFrame `df`.
# NOTE(review): absolute Colab path — adjust when running outside /content.
df = pd.read_csv('/content/message_data.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Cleaning and Preprocessing**

In [43]:
# Function to clean text
def clean_text(text):
    """Normalise a single text value before stop-word removal and embedding.

    URLs are stripped, every character that is neither a word character nor
    whitespace is removed, and the result is lower-cased.

    Args:
        text: One cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing Body) is treated as missing.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing values are handled per value here rather than by rewriting the
    # global DataFrame on every call; the function therefore has no side
    # effects and can be applied to any Series.
    if not isinstance(text, str):
        return ''

    # Remove links (URLs)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Convert to lowercase
    return text.lower()

# Function to remove stop words
def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Args:
        text: Text to filter. Words are compared exactly (case-sensitive)
            against ``stop_words``. A non-string value is treated as empty.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is loaded for this call. Pass a
            prebuilt set (e.g. ``series.apply(remove_stopwords,
            stop_words=my_set)``) to avoid reloading the list per value or to
            use another language.

    Returns:
        str: The remaining words joined by single spaces.
    """
    # Guard against NaN/None so a missing cell yields '' instead of raising
    # AttributeError on .split().
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    return ' '.join(word for word in text.split() if word not in stop_words)

In [44]:
# Inspect column dtypes and non-null counts (the recorded run shows Body with 76 of 87 values present).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  87 non-null     object
 1   Date     87 non-null     object
 2   Body     76 non-null     object
dtypes: object(3)
memory usage: 2.2+ KB


In [45]:
# Run each raw column through the two-step pipeline: normalise, then drop stop words.
# Subject is processed before Body, as in the original cell.
df['cleaned_subject'] = df['Subject'].apply(lambda raw: remove_stopwords(clean_text(raw)))
df['cleaned_body'] = df['Body'].apply(lambda raw: remove_stopwords(clean_text(raw)))

# Pair the cleaned columns row by row to build one document per message for embedding
combined_texts = [
    f"Subject: {subject} Body: {body}"
    for subject, body in zip(df['cleaned_subject'], df['cleaned_body'])
]
print(combined_texts[0])

Subject: become nextjs pro Body: hey nextjs become wellknown ability build fast reliable full stack apps good reason developer experience top notch wide range features help easily build performancefirst apps able focus unique challenges app nextjs 15 react 19 coming soon important ever make sure full understanding nextjs works newer react features help build amazing experiences new course launched help build full stack invoice app using nextjs 15 learn intricacies nextjs 15 react 19 well also learn design responsive components using tailwind shadcnui add authentication social login organization support mfa clerk create manage databases relationships accross tables xata query wrangle data postgres server xata drizzle orm process payments invoices using stripe build custom email templates react react email send transaction emails resend deploy app vercel looooots important concepts inbetween best part course available free thanks course partners xata clerk making possible ready get start

In [46]:
from langchain_huggingface import HuggingFaceEmbeddings

# Identifier of the embedding model handed to HuggingFaceEmbeddings
model_name = "BAAI/bge-base-en-v1.5"

# Generate embeddings for combined texts (one vector per entry of combined_texts)
hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)
embeddings = hf_embeddings.embed_documents(combined_texts)

# Length of the first embedding vector; the recorded run printed 768.
# NOTE(review): `dimention` is a misspelling of "dimension"; kept because it is a
# notebook-level name that other cells may read.
dimention = len(embeddings[0])
print(dimention)  # Dimensionality of embedding space

768


In [53]:
!pip install langchain-pinecone langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain<0.4.0,>=0.3.3 (from langchain_community)
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain<0.4.0,>=0.3.3->langchain_community)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading langchain_community-0.3.2-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain-0.3.3-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [59]:
import os
from getpass import getpass

# Never write the real key into the notebook: reuse a value that is already
# exported in the environment, otherwise prompt for it without echoing.
if not os.environ.get('PINECONE_API_KEY'):
    os.environ['PINECONE_API_KEY'] = getpass('Pinecone API key: ')

# Keep any value the user has already exported instead of clobbering it with
# the placeholder.
# NOTE(review): no cell shown in this notebook reads PINECONE_API_ENV — confirm it is still needed.
os.environ.setdefault('PINECONE_API_ENV', '<YOUR_PINECONE_ENVIRONMENT>')

In [93]:
import os

from pinecone import Pinecone as PineconeClient, ServerlessSpec

# Read the key from the environment instead of embedding it in the notebook.
# NOTE: an API key was previously hard-coded on this line; treat it as exposed
# and revoke it in the Pinecone console.
pc = PineconeClient(api_key=os.environ["PINECONE_API_KEY"])

index_name = "example-index"

# Create the index only when it does not exist yet, so this cell is safe to
# re-run and also works on a fresh Pinecone project (previously a manual
# "run only once" step that was left commented out).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=len(embeddings[0]),  # must equal the embedding vector length
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Handle to the index, obtained only after it is known to exist
pinecone_index = pc.Index(index_name)

In [98]:
import os
from langchain_pinecone import PineconeVectorStore

# Three sample sentences used to check the embedding -> Pinecone round trip.
# NOTE(review): they are written to the same index as the email documents and
# can therefore show up in later similarity searches; the second string also
# looks truncated ("ne of ..."). Both are left as-is because they are runtime data.
texts = [
    "Tonight, I call on the Senate to: Pass the Freedom to Vote Act.",
    "ne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.",
    "One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.",
]

vectorstore_from_texts = PineconeVectorStore.from_texts(
    texts,
    index_name=index_name,
    embedding=hf_embeddings
)

# Later cells refer to the store as `vectorstore`; bind that name here so they
# do not depend on a variable left over from an earlier kernel session.
vectorstore = vectorstore_from_texts

print("Pinecone VectorStoreIndex successfully created using Hugging Face embeddings!")

Pinecone VectorStoreIndex successfully created using Hugging Face embeddings!


In [101]:
# Add every combined message document to the vector store; the recorded output
# is a list of id strings, one per stored document.
# Relies on `vectorstore` having been bound by an earlier cell.
vectorstore.add_texts(combined_texts)

['04a23c8b-edc5-4f04-984b-96e24189f2f4',
 'fae1e255-f01d-40c9-8c92-dd3cf0db66bb',
 '4492a873-22fd-4dd1-9a2c-f7fdff3f879c',
 'debe3bae-29e6-4768-82d1-e7575c533a82',
 '621d0828-6ec6-450b-afe4-fdd7cb425536',
 'feac9537-b04a-46a9-a3ad-700e343db69b',
 '3c2d3c81-17a4-40a1-9a44-8f0ca47df53a',
 '0e559a67-ecf8-4136-8fb5-1ba2c92de2ec',
 '880fec04-b0c0-4da9-8489-1a2d4eb2b119',
 '99bf92c6-0b20-45d1-9f3a-88a6f1bd1bbe',
 '45fc6c61-cbe9-4383-9881-0706e1a4f62b',
 '5e4f50b0-2799-480d-9d23-4ee14439db08',
 'a6b71dac-3215-4eb2-b5e8-cf29d8309e91',
 '3477ad7f-4891-4322-aa34-979ed9802b44',
 'baa0873b-f6c5-4714-8552-4fdad04ea644',
 'f9545cb6-6821-4925-9376-46a975c4a0c4',
 'a13b491c-a27b-4a05-bd72-6ac0a4f99fa0',
 '62bbb70f-ff79-43e7-843d-6e04e316ca02',
 '7714d6de-0720-4b1a-b3b6-ca1575c0dd20',
 '55242ecc-0bd5-4f80-b20a-add29a124528',
 '08e9891f-b55a-405c-9e8c-5c5962c19939',
 'ec1d2144-29e8-4d7a-89bc-7c2f071973b7',
 '19511adc-6c2d-4e42-b44f-61d761f85304',
 '9006bb7e-e30d-4f42-9378-506b21706774',
 'acd559e1-bde9-

In [106]:
# Retrieval check: ask the store for the two documents most similar to the query.
# NOTE(review): "basic basic" looks like an accidental repeat; left unchanged
# because the query string is runtime text that affects the search result.
query = "What is dropbox basic basic account size?"
vectorstore.similarity_search(query, k=2)


[Document(id='2f38abae-5efa-4ca2-82a8-eb91477958e0', metadata={}, page_content='Subject: sharings easy dropbox Body: everyone gets 500 mb free space sharings easy dropbox invite friends dropbox get 16 gb bonus storage referring friends get bonus storage well complete signup process need dropbox need space everybody wins send invite ____________________________________________________ dropbox inc po box 77767 san francisco ca 94107 view privacy policy2 unsubscribe3 1 2 3'),
 Document(id='f6efdb4a-127a-4113-8401-5f6fc116e5ab', metadata={}, page_content='Subject: want Body: learn make life easier dropbox else dropbox probably signed dropbox store share couple files get started youll find helpful cloud storage learn plus family help get dropbox learn dropbox plus level file storage sharing limits builtin tools help organize digital life save share photos videos sensitive documents 2000 gb secure cloud storage access everything need 247 synced across devices dropbox passwords easily sign we

In [None]:
# Load the tokenizer and the causal language model from the same checkpoint
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart
_LLM_CHECKPOINT = "microsoft/Phi-3.5-mini-instruct"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally — confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(
    _LLM_CHECKPOINT,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    _LLM_CHECKPOINT,
    trust_remote_code=True,
)

In [None]:
def generate_email_reply(query, k=2, max_new_tokens=150):
    """Draft a reply to an email using retrieved messages as context.

    Relies on three notebook-level objects: ``vectorstore`` (similarity
    search), ``tokenizer`` and ``model`` (text generation).

    Args:
        query: The email text / question to answer.
        k: Number of documents to retrieve from the vector store.
        max_new_tokens: Upper bound on the number of generated tokens. The
            prompt is not counted against this limit.

    Returns:
        str: Only the newly generated text (the prompt is not echoed back).
    """
    # Local import: no other cell in this notebook imports torch, so the
    # function would otherwise fail with NameError at torch.no_grad().
    import torch

    # Perform similarity search in the vector store
    results = vectorstore.similarity_search(query, k=k)

    # Extract relevant content from the search results
    relevant_texts = [result.page_content for result in results]

    # Construct a prompt for the LLM
    prompt = (
        f"Based on the following information, reply to this email:\n\n{query}\n\n"
        "Relevant information:\n" + "\n".join(relevant_texts)
    )

    # Tokenize and place the tensors on the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # max_new_tokens (not max_length) is used because max_length includes the
    # prompt; with retrieved emails in the prompt a 150-token total leaves
    # little or no room for an answer.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # A causal LM returns prompt + continuation; keep only the continuation
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Example usage
# Runs the full path for one question: generate_email_reply retrieves context
# from the vector store, generates text with the model, and the result is printed.
email_query = "What is Dropbox's basic account size?"
reply = generate_email_reply(email_query)
print(reply)