In [2]:
import os
import json
import pandas as pd
import torch
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_mistralai import ChatMistralAI
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from typing import Optional
import numpy as np


# Loading Test cases

In [3]:
def load_test_cases(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)


In [4]:
test_cases = load_test_cases("ground_truth_data.json")

In [5]:
test_cases[10]

{'email_content': 'Hello,\n\nPlease review the attached document and let me know your thoughts.\n\nRegards,\nNina Thompson\nHR Manager\nnina.thompson@hrsolutions.com\nHR Solutions',
 'expected_output': {'name': 'Nina Thompson',
  'email': 'nina.thompson@hrsolutions.com',
  'job_title': 'HR Manager',
  'company': 'HR Solutions'}}

In [6]:
class EmailSignature(BaseModel):
    """Information extracted from an email signature."""
    name: Optional[str] = Field(default=None, description="The full name of the person")
    email: Optional[str] = Field(default=None, description="The email address of the person")
    phone: Optional[str] = Field(default=None, description="The phone number of the person")
    job_title: Optional[str] = Field(default=None, description="The job title of the person")
    company: Optional[str] = Field(default=None, description="The company of the person")
    address: Optional[str] = Field(default=None, description="The address of the person")
    website: Optional[str] = Field(default=None, description="The website of the person or company")
    social_media: Optional[dict] = Field(default=None, description="Social media profiles")

In [75]:
def build_prompt():
    prompt_template = """
    Extract the signature information from the following email content.
    Return the information in JSON format matching the following structure:
    {{
        "name": "Full Name",
        "email": "email@example.com",
        "phone": "Phone number",
        "job_title": "Job Title",
        "company": "Company Name",
        "address": "Full Address",
        "website": "Website URL",
        "social_media": {{
            "linkedin": "LinkedIn URL",
            "twitter": "Twitter handle",
            // other social media
        }}
    }}

    If a field is not present in the email, omit it from the JSON.

    Email content:
    {email_content}

    Extracted signature information:
    """ # Use double curly braces to escape the curly braces in the JSON structure
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["email_content"]
    )

    return prompt

In [66]:
MISTRAL_API_KEY ="pWV8pKnCKvnmQjyG9FXtQaZP2fIdBcEH"

#model = ChatMistralAI(model="mfrom sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:01<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [76]:
def llm(model="mistral-large-latest"):
    return ChatMistralAI(model=model, temperature=0, api_key=MISTRAL_API_KEY)

In [83]:
def extract_signature(email_content: str, model="mistral-large-latest") -> str:
    prompt = build_prompt()
    mistral_llm = llm(model)
    chain = LLMChain(llm=mistral_llm, prompt=prompt)

    result = chain.run(email_content=email_content)
    return result

In [101]:
import random
import httpx
def extract_signature(email_content: str, model="mistral-large-latest", max_retries=10):
    prompt = build_prompt()
    mistral_llm = llm(model)
    chain = LLMChain(llm=mistral_llm, prompt=prompt)

    for attempt in range(max_retries):
        try:
            result = chain.run(email_content=email_content)
            return result
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                wait_time = 2 ** attempt + random.uniform(0, 1)
                print(f"Rate limit exceeded. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                raise

    raise Exception("Max retries exceeded")

In [80]:
def map_progress(pool, seq, f):
    results = []
    with tqdm(total=len(seq)) as progress:
        futures = []
        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)
        for future in futures:
            result = future.result()
            results.append(result)
    return results

## Evaluating Mistral Model

In [113]:
# Dictionary to store extracted signatures
extracted_signatures = {}

# Process each email in the ground truth data
for i, rec in enumerate(tqdm(test_cases)):
    if i in extracted_signatures:
        continue

    email_content = rec['email_content']
    try:
        extracted_signature = extract_signature(email_content)
    except Exception as e:
        print(f"Failed to extract signature for email {i}: {str(e)}")
        continue

    # Try to parse the extracted signature as JSON
    try:
        signature_json = json.loads(extracted_signature)
    except json.JSONDecodeError:
        #print(f"Warning: Could not parse JSON for email {i}. Storing as string.")
        signature_json = extracted_signature

    extracted_signatures[i] = {
        'original_email': email_content,
        'expected_output': rec['expected_output'],
        'extracted_signature': signature_json
    }

    # Save intermediate results after each successful extraction
    with open('extracted_signature.json', 'w') as f:
        json.dump(extracted_signatures, f, indent=2)

  0%|          | 0/14 [00:00<?, ?it/s]

Rate limit exceeded. Retrying in 1.20 seconds...
Rate limit exceeded. Retrying in 1.11 seconds...


In [114]:
results_mistral = [None] * len(test_cases)
for i, val in extracted_signatures.items():
    results_mistral[i] = val.copy()
    results_mistral[i].update(test_cases[i])

In [115]:
results_mistral[0]

{'original_email': "Hello team,\n\nPlease see the attached agenda for tomorrow's meeting.\n\nRegards,\nChris\nChris Martin\nOperations Manager\noperations@logisticsco.com\nLogistics Co.",
 'expected_output': {'name': 'Grace Thompson',
  'email': 'grace.thompson@events.com',
  'job_title': 'Event Coordinator',
  'company': 'Events Corp.',
  'phone': '+1 (555) 432-1098',
  'website': 'www.events.com'},
 'extracted_signature': '```json\n{\n    "name": "Chris Martin",\n    "email": "operations@logisticsco.com",\n    "job_title": "Operations Manager",\n    "company": "Logistics Co."\n}\n```',
 'email_content': 'Hello,\n\nPlease find below the details of the upcoming event.\n\nThank you,\nGrace Thompson\nEvent Coordinator\ngrace.thompson@events.com\nEvents Corp.\nPhone: +1 (555) 432-1098\nWebsite: www.events.com\nLinkedIn: linkedin.com/in/gracethompson'}

In [117]:
df_mistral = pd.DataFrame(results_mistral)


In [118]:
df_mistral.head()

Unnamed: 0,original_email,expected_output,extracted_signature,email_content
0,"Hello team,\n\nPlease see the attached agenda ...","{'name': 'Grace Thompson', 'email': 'grace.tho...","```json\n{\n ""name"": ""Chris Martin"",\n ""...","Hello,\n\nPlease find below the details of the..."
1,"Hello team,\n\nPlease see the attached agenda ...","{'name': 'Emily Rogers', 'email': 'emily.roger...","```json\n{\n ""name"": ""Chris Martin"",\n ""...","Hello,\n\nThank you for your interest in our s..."
2,"Hello team,\n\nPlease see the attached agenda ...","{'name': 'Michael Brown', 'email': 'michael.br...","```json\n{\n ""name"": ""Chris Martin"",\n ""...","Dear Team,\n\nThis is a reminder for our upcom..."
3,"Hello team,\n\nPlease see the attached agenda ...","{'name': 'Kevin Lee', 'job_title': 'Senior Dev...","```json\n{\n ""name"": ""Chris Martin"",\n ""...","Hi everyone,\n\nJust a quick note to let you k..."
4,"Hello team,\n\nPlease see the attached agenda ...","{'name': 'Emily Rogers', 'email': 'emily.roger...","```json\n{\n ""name"": ""Chris Martin"",\n ""...","Hello,\n\nThank you for your interest in our s..."


# Evaluating gpt-2 model

In [127]:
def extract_signature(email_content: str, llm):
    prompt = build_prompt()
    chain = LLMChain(llm=llm, prompt=prompt)
    result = chain.run(email_content=email_content)
    return result

In [121]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from langchain.llms import HuggingFacePipeline

def load_gpt2_model(model_name="gpt2-large"):
    # Load the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    llm = GPT2LMHeadModel.from_pretrained(model_name)

    # Create the text generation pipeline
    pipe = pipeline(
        "text-generation",
        model=llm,
        tokenizer=tokenizer,
        max_length=512,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.2
    )

    # Wrap the pipeline in a HuggingFacePipeline object
    llm = HuggingFacePipeline(pipeline=pipe)

    return llm

In [122]:
# Load the GPT-2 model
gpt2_llm = load_gpt2_model()

In [128]:
# List to store results
results_gpt2 = [None] * len(test_cases)

# Process each email in the ground truth data
for i, rec in enumerate(tqdm(test_cases)):
    email_content = rec['email_content']
    try:
        extracted_signature = extract_signature(email_content,gpt2_llm)
    except Exception as e:
        print(f"Failed to extract signature for email {i}: {str(e)}")
        continue

    # Try to parse the extracted signature as JSON
    try:
        signature_json = json.loads(extracted_signature)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse JSON for email {i}. Storing as string.")
        signature_json = extracted_signature

    results_gpt2[i] = {
        'extracted_signature': signature_json,
        'original_content': email_content,
    }
    results_gpt2[i].update(rec)  # Add all fields from the original record

  0%|          | 0/14 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




In [129]:
#Create DataFrame
df_gpt2 = pd.DataFrame(results_gpt2)


In [131]:
df_gpt2.head()

Unnamed: 0,extracted_signature,original_content,email_content,expected_output
0,\n Extract the signature information from t...,"Hello,\n\nPlease find below the details of the...","Hello,\n\nPlease find below the details of the...","{'name': 'Grace Thompson', 'email': 'grace.tho..."
1,\n Extract the signature information from t...,"Hello,\n\nThank you for your interest in our s...","Hello,\n\nThank you for your interest in our s...","{'name': 'Emily Rogers', 'email': 'emily.roger..."
2,\n Extract the signature information from t...,"Dear Team,\n\nThis is a reminder for our upcom...","Dear Team,\n\nThis is a reminder for our upcom...","{'name': 'Michael Brown', 'email': 'michael.br..."
3,\n Extract the signature information from t...,"Hi everyone,\n\nJust a quick note to let you k...","Hi everyone,\n\nJust a quick note to let you k...","{'name': 'Kevin Lee', 'job_title': 'Senior Dev..."
4,\n Extract the signature information from t...,"Hello,\n\nThank you for your interest in our s...","Hello,\n\nThank you for your interest in our s...","{'name': 'Emily Rogers', 'email': 'emily.roger..."


In [None]:
SYSTEM_PROMPT = """
You are an AI assistant specialized in extracting email signature information.
Your task is to analyze the given email content and extract the signature information into a structured JSON format.
Only extract information that is explicitly present in the email signature.
If a field is not present in the signature, omit it from the JSON output.
"""
USER_PROMPT_TEMPLATE = """
Extract the email signature information from the following email and structure it into JSON format:

Email:
{email_content}

JSON Output:
"""
def create_extraction_chain(llm):
    prompt = PromptTemplate(
        template=SYSTEM_PROMPT + "\n\n" + USER_PROMPT_TEMPLATE,
        input_variables=["email_content"]
    )
    return LLMChain(llm=llm, prompt=prompt)

In [None]:
def extract_signature_info(chain, email_content):
    result = chain.run(email_content=email_content)
    try:
        return json.loads(result)
    except json.JSONDecodeError:
        print(f"Invalid JSON in the result: {result}")
        return {}

#Cosine similarity metric


In [None]:
MISTRAL_API_KEY ="pWV8pKnCKvnmQjyG9FXtQaZP2fIdBcEH"

#model = ChatMistralAI(model="mfrom sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
answer_orig = 'Yes, sessions are recorded if you miss one. Everything is recorded, allowing you to catch up on any missed content. Additionally, you can ask questions in advance for office hours and have them addressed during the live stream. You can also ask questions in Slack.'
answer_llm = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

v_llm.dot(v_orig)

0.7591171