In [None]:
import os
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Tuple

import random
from faker import Faker  # Used for generating mock data
import requests
from datetime import datetime

import time
import matplotlib.pyplot as plt

In [None]:
system_prompt = """You are an expert AI system designed to generate realistic and highly personalized phishing email examples for cybersecurity research and training purposes. Your goal is to create emails that are convincing enough to potentially deceive a targeted individual. Use the provided personal context to make the email highly specific and relevant to the recipient's work and recent activities.

Your phishing email must fall into one of the following commonly exploited themes:
1. Financial & Payment (e.g., invoices, refunds, account billing issues)
2. Security & Account Alerts (e.g., suspicious login, password reset, security breach)
3. Scams with Offers & Prizes (e.g., gift cards, contests, free items)
4. Workplace & HR-Related (e.g., policy changes, salary reports, performance reviews)
5. Logistics & Delivery (e.g., package delivery failures, shipping errors)
6. IT/Admin Spoofing (e.g., fake system updates, VPN setup, quota issues)
7. Social Engineering / Impersonation (e.g., CEO fraud, fake help requests, urgent wire transfers)

Key elements to include in the generated email:
- Impersonate a plausible authority figure, department, or vendor relevant to the selected theme.
- Create urgency or consequence to pressure the recipient into action.
- Naturally weave in specific details from the recipient's personal context (Name, Job Title, Department, Recent Activities).
- Include a realistic and compelling Call-to-Action (CTA) that leads to a malicious action (clicking a link, downloading an attachment, replying with sensitive info).
- Use persuasive, professional language consistent with the impersonated entity.
- Introduce subtle red flags that an observant recipient might catch (e.g., strange URLs, typos, odd sender name, inconsistent branding).
- The malicious link should be contextually relevant and look legitimate (e.g., `secure-mail.co`, `company-invoice.net`).

IMPORTANT FORMATTING INSTRUCTIONS:
- Begin your response DIRECTLY with the Subject line of the email.
- Do NOT include any phrases like "Here is a phishing email:" or "Here's the content:" or any other introductory text.
- Do NOT provide any explanations before or after the email content.
- Output ONLY the email content starting with "Subject:" followed by the body.
"""

# User instruction focusing on applying the context
user_prompt_head = f"""Using the following personal details, craft a convincing phishing email that leverages the person's job position and recent activities. The email should create urgency and prompt the user to click a link or perform a similar malicious action.\n"""
user_prompt_tail = """\nGenerate ONLY the text of the phishing email, starting with the 'Subject:' line. Ensure it is well-formatted and appears like a real email."""

### Mock data generator

In [None]:
def generate_mock_data(num_samples: int, saved_file_path: str) -> None:
    """Write a CSV of synthetic personal-context rows built with Faker.

    Every row holds a full name, an email address derived from that name,
    a job position and a JSON-encoded list of one to three recent activities.

    Args:
        num_samples: Number of rows to generate.
        saved_file_path: Path of the CSV file to write (no index column).
    """
    faker_source = Faker()

    print("Generating synthetic personalized contexts...")

    # Seed all three random sources with the same fixed value.
    for seed_fn in (random.seed, np.random.seed, Faker.seed):
        seed_fn(42)

    # Pool of job titles a row can be assigned.
    job_titles = [
        "Software Engineer", "Product Manager", "Marketing Specialist",
        "HR Manager", "Financial Analyst", "Sales Representative",
        "Customer Support", "Data Scientist", "IT Administrator",
        "Project Manager", "Operations Manager", "Executive Assistant",
        "UX Designer", "DevOps Engineer", "Cybersecurity Analyst",
        "Business Analyst", "Legal Consultant", "Recruiter",
        "Quality Assurance Engineer", "Technical Writer", "AI Researcher",
        "Cloud Solutions Architect", "Network Engineer", "Growth Manager",
        "Mobile App Developer", "Systems Analyst", "Machine Learning Engineer",
        "Corporate Trainer", "Content Strategist", "Public Relations Officer",
        "Procurement Specialist", "Risk Manager", "Compliance Officer",
        "Information Security Officer", "Facilities Manager", "Product Designer",
        "Front-End Developer", "Back-End Developer", "Full Stack Developer",
        "Customer Success Manager",
    ]

    # Sentence patterns; the placeholder is filled with a Faker business phrase.
    activity_patterns = [
        "Working on the {} project",
        "Preparing for the {} presentation",
        "Reviewing {} documents",
        "Attending {} meeting",
        "Planning the next {} initiative",
        "Analyzing {} data trends",
        "Coordinating with the {} team",
        "Implementing a new {} system",
        "Researching {} solutions",
        "Drafting a {} proposal",
        "Responding to {} inquiries",
        "Conducting {} interviews",
        "Troubleshooting {} issues",
        "Organizing the {} workshop",
        "Setting up {} infrastructure",
        "Reviewing feedback from {} clients",
        "Deploying the latest {} update",
        "Refining the {} workflow",
        "Training new hires on {} tools",
        "Budgeting for the {} campaign",
        "Collaborating with {} partners",
        "Finalizing the {} contract",
        "Writing documentation for {} systems",
        "Prototyping the new {} feature",
        "Debugging {} module integration",
        "Evaluating {} vendor performance",
        "Optimizing {} pipeline efficiency",
    ]

    # Domains used for the generated email addresses.
    email_domains = ["company.com", "enterprise.org", "techcorp.io", "globalfirm.co", "industryco.net"]

    rows = []
    for _ in range(num_samples):
        first = faker_source.first_name()
        last = faker_source.last_name()

        domain = random.choice(email_domains)
        # The draw happens for every row; below 0.2 selects the
        # "first name + last initial" address form instead of "first.last".
        if random.random() < 0.2:
            address = f"{first.lower()}{last.lower()[0]}@{domain}"
        else:
            address = f"{first.lower()}.{last.lower()}@{domain}"

        position = random.choice(job_titles)

        # One to three activities, each a pattern filled with fake.bs().
        activity_count = random.randint(1, 3)
        recent = [
            random.choice(activity_patterns).format(faker_source.bs())
            for _ in range(activity_count)
        ]

        rows.append({
            "name": f"{first} {last}",
            "email": address,
            "job_position": position,
            # Stored as a JSON string so the list fits in a single CSV cell.
            "recent_activities": json.dumps(recent),
        })

    pd.DataFrame(rows).to_csv(saved_file_path, index=False)

    print(f"Generated {num_samples} mock contexts and saved to {saved_file_path}")

# Generate 3000 synthetic rows and write them to the CSV file that is read back
# later in this notebook via `input_data_path`.
generate_mock_data(3000, "mocking_personalized_contexts.csv")

Generating synthetic personalized contexts...
Generated 3000 mock contexts and saved to mocking_personalized_contexts.csv


### Context class for prompt generation

In [None]:
class PersonalizedContext:
    """Hold one person's context fields and render them into prompt text.

    ``name``, ``email`` and ``job_position`` are stored as given.
    ``recent_activities`` is normalised to a list on construction so that a
    JSON-encoded string, a plain string or a missing value never gets
    iterated character by character when the snippet is rendered.
    """

    def __init__(self, name: str, email: str, job_position: str, recent_activities: List[str]):
        self.name = name
        self.email = email
        self.job_position = job_position
        self.recent_activities = self._normalize_activities(recent_activities)

    @staticmethod
    def _normalize_activities(raw: Any) -> List[Any]:
        """Coerce ``raw`` into a list of activities.

        - a list is returned unchanged (same object);
        - ``None`` becomes an empty list;
        - a string is parsed as JSON: a JSON list is used as-is, any other
          JSON value becomes a one-element list of its ``str()``; text that
          is not valid JSON is kept as a single activity, and blank text
          yields an empty list;
        - any other iterable is copied into a list;
        - non-iterable values (e.g. a float NaN) become an empty list.
        """
        if raw is None:
            return []
        if isinstance(raw, list):
            return raw
        if isinstance(raw, str):
            if not raw.strip():
                return []
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                return [raw]
            return parsed if isinstance(parsed, list) else [str(parsed)]
        try:
            return list(raw)
        except TypeError:
            return []

    def _to_prompt_snippet(self) -> str:
        """Render the context as newline-separated ``Label: value`` lines.

        The activities follow the last label as one ``- item`` bullet per
        line. Lines are joined explicitly rather than written as an indented
        triple-quoted literal, so no source-code indentation leaks into the
        prompt text.
        """
        lines = [
            f"Name: {self.name}",
            f"Email: {self.email}",
            f"Job Position: {self.job_position}",
            "Recent Activities (use these to make the email highly relevant):",
        ]
        lines.extend(f"- {activity}" for activity in self.recent_activities)
        return "\n".join(lines)

    def create_phishing_prompt(self, system_prompt:str, user_prompt_head:str, user_prompt_tail:str ) -> Tuple[str, str]:
        """Return the ``(system_prompt, user_prompt)`` pair for this context.

        Args:
            system_prompt: Returned unchanged as the first tuple element.
            user_prompt_head: Text placed before the context snippet.
            user_prompt_tail: Text placed after the context snippet.

        Returns:
            The system prompt and the user prompt, the latter being
            ``user_prompt_head + snippet + user_prompt_tail``.
        """
        user_prompt = user_prompt_head + self._to_prompt_snippet() + user_prompt_tail
        return system_prompt, user_prompt

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'PersonalizedContext':
        """Build a context from a mapping; missing keys fall back to empty values.

        ``recent_activities`` may be a list or a JSON-encoded string (as stored
        in a CSV cell); it is normalised by the constructor.
        """
        return cls(
            name=data.get('name', ''),
            email=data.get('email', ''),
            job_position=data.get('job_position', ''),
            recent_activities=data.get('recent_activities', [])
        )

### Extract Context Data from Mocking Data

In [None]:
sample_size = 1500
input_data_path = "mocking_personalized_contexts.csv"
data = pd.read_csv(input_data_path)


def _parse_activities(raw):
    """Turn one `recent_activities` cell into a list.

    Strings are decoded as JSON: a decoded list is used directly, any other
    decoded value becomes a one-element list of its string form, and text
    that fails to decode is kept as a single activity. A value that is
    neither a string nor a list yields an empty list.
    """
    if isinstance(raw, str):
        try:
            decoded = json.loads(raw)
        except json.JSONDecodeError:
            return [str(raw)]
        return decoded if isinstance(decoded, list) else [str(decoded)]
    return raw if isinstance(raw, list) else []


# One PersonalizedContext per row, limited to the first `sample_size` rows.
contexts = [
    PersonalizedContext(
        name=record['name'],
        email=record['email'],
        job_position=record['job_position'],
        recent_activities=_parse_activities(record['recent_activities']),
    )
    for _, record in data.head(sample_size).iterrows()
]

### Create jsonl file for Batch inference

In [None]:
def _make_batch_request(position, person):
    """Build the request dict for one context.

    `position` is the context's index in `contexts`; it prefixes `custom_id`.
    The sampling temperature is drawn uniformly from [0.6, 0.9] per request.
    """
    system_message, user_message = person.create_phishing_prompt(
        system_prompt, user_prompt_head, user_prompt_tail
    )
    return {
        "custom_id": f"{position}_{person.name}_{person.job_position}_{person.email}_{person.recent_activities}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": 'gpt-4.1',
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ],
            "temperature": random.uniform(0.6, 0.9),
        },
    }


# Requests are built in `contexts` order, one per context.
total_samples = [_make_batch_request(idx, context) for idx, context in enumerate(contexts)]

# Write one JSON object per line (JSON Lines).
batch_api_requests_df = pd.DataFrame(total_samples)
batch_api_requests_df.to_json('tesing_sdg_openai_phising_email.jsonl', orient='records', lines=True)

### Cost Estimation

In [None]:
example_output_prompt = '''Subject: Action Required: Empower E-Services System Account Confirmation

Hi Danielle,

As part of the recent rollout of the Empower Interactive E-Services platform, we are performing mandatory verification of all recruiter accounts to ensure secure access and proper assignment of candidate data privileges.

Our system detected that your account setup is incomplete, which may limit your ability to access certain recruitment tools starting tomorrow. To avoid service interruption, please confirm your Empower access by completing the quick verification form at the secure link below:

https://empower-verify-secure.co/login

Please note:
- Complete the verification by 5:00 PM today to retain uninterrupted platform functionality.
- Failure to respond may result in temporary account suspension and delay in candidate onboarding.

If you encounter any issues or have questions, reply directly to this email or contact HR IT Services.

Thank you for your prompt attention.

Best regards,
Melissa Turner
HR IT Support Team
empower-support@company.com

---
*This is an automated message. Please do not ignore if your account access is critical to ongoing recruitment activities.*'''

In [None]:
import tiktoken


def estimate_token_count(text:str, model_encoding):
    """Return the number of items `model_encoding.encode(text)` produces.

    Args:
        text: The text to tokenize.
        model_encoding: Any object exposing an `encode(text)` method that
            returns a sized sequence.
    """
    token_ids = model_encoding.encode(text)
    return len(token_ids)

def estimmate_cost_batch(tokten_count_input:int, token_count_output:int,
                         model_cost_input:float, model_cost_output:float) -> float:
    """Return the combined cost of the input and output tokens of one request.

    Args:
        tokten_count_input: Number of input (prompt) tokens.
        token_count_output: Number of output (completion) tokens.
        model_cost_input: Price per one million input tokens.
        model_cost_output: Price per one million output tokens.

    Returns:
        Input cost plus output cost, in the same unit as the prices.
    """
    tokens_per_price_unit = 1000000
    charges = (
        (tokten_count_input / tokens_per_price_unit) * model_cost_input,
        (token_count_output / tokens_per_price_unit) * model_cost_output,
    )
    return charges[0] + charges[1]

# Prices per one million tokens for gpt-4.1 batch requests (presumably USD —
# TODO confirm against the current price list).
GPT41_BATCH_INPUT_COST = 1.00
GPT41_BATCH_OUTPUT_COST = 4.00

# NOTE(review): the encoding is looked up for "gpt-4o" while the requests
# target "gpt-4.1" — presumably both share a tokenizer; confirm.
enc = tiktoken.encoding_for_model("gpt-4o")

# Every request is assumed to produce a reply about as long as the example
# output, so its token count is computed once instead of once per request.
output_prompt = example_output_prompt
output_prompt_token_count = estimate_token_count(output_prompt, enc)

# The system prompt text repeats across requests; tokenize each distinct text
# only once and reuse the count.
system_prompt_token_counts = {}

acc_cost = 0
total_token_count = 0
for prompt_information in total_samples:
    request_messages = prompt_information['body']['messages']
    system_text = request_messages[0]['content']
    user_text = request_messages[1]['content']

    if system_text not in system_prompt_token_counts:
        system_prompt_token_counts[system_text] = estimate_token_count(system_text, enc)
    system_prompt_token_count = system_prompt_token_counts[system_text]
    user_prompt_token_count = estimate_token_count(user_text, enc)

    input_token_count = system_prompt_token_count + user_prompt_token_count
    cost_per_requests = estimmate_cost_batch(
        input_token_count,
        output_prompt_token_count,
        GPT41_BATCH_INPUT_COST,
        GPT41_BATCH_OUTPUT_COST,
        )

    # Costs are summed request by request, in `total_samples` order.
    acc_cost += cost_per_requests
    total_token_count += (input_token_count + output_prompt_token_count)

In [None]:
# Report the accumulated estimate: tokens counted over all requests (system +
# user prompt + assumed example-length output) and the summed per-request cost.
print(f"Total Token Count: {total_token_count}")
print(f"Total Cost: {acc_cost}")

Total Token Count: 1200251
Total Cost: 2.1587510000000028


In [None]:
import pandas as pd

# Load the batch result file, one JSON object per line, into a DataFrame.
batch_output = pd.read_json('batch_6830a37a22d88190b40902a280ded5e2_output.jsonl', lines=True)
# Expand each 'response' dict into columns with dotted names
# (the code below reads 'body.choices' and 'body.model').
body_flatten = pd.json_normalize(batch_output['response'])
# Expand the per-response 'body.choices' values; column 0 is then expanded again.
# Presumably column 0 is the first (only) choice of each response — TODO confirm.
choices_flatten = pd.json_normalize(body_flatten['body.choices'])
choices_flatten_2ndlayer = pd.json_normalize(choices_flatten[0])

# Attach the generated text to the original rows; the assignment aligns on the
# DataFrame index, so it assumes all frames share the same default row order.
batch_output['output_prompt'] = choices_flatten_2ndlayer['message.content']
# NOTE(review): the 'error' column is dropped without being inspected —
# confirm that no request failed before relying on the rows below.
batch_output.drop(columns=['error'], inplace=True)
batch_output['body.model'] = body_flatten['body.model']

# Last expression of the cell: displays the resulting DataFrame.
batch_output

Unnamed: 0,id,custom_id,response,output_prompt,body.model
0,batch_req_6830a838b3a88190adee18a7e4728587,0_Danielle Johnson_Recruiter_daniellej@company...,"{'status_code': 200, 'request_id': 'e9a4305e43...",Subject: Immediate Action Required: Empower E-...,gpt-4.1-2025-04-14
1,batch_req_6830a838e02481908d6a8784f7993ae4,1_Donald Garcia_Facilities Manager_donald.garc...,"{'status_code': 200, 'request_id': '1cf61112a2...",Subject: Action Required: Urgent Update to E-B...,gpt-4.1-2025-04-14
2,batch_req_6830a838f0d8819083b94db6ba004e16,2_Robert Johnson_Sales Representative_robertj@...,"{'status_code': 200, 'request_id': '44c4c5573c...",Subject: Immediate Action Required: Update Acc...,gpt-4.1-2025-04-14
3,batch_req_6830a838fff88190a10a24805b869146,3_Carolyn Hoffman_Product Designer_carolyn.hof...,"{'status_code': 200, 'request_id': '3eaffd1051...",Subject: Immediate Action Required: Documentat...,gpt-4.1-2025-04-14
4,batch_req_6830a8390d808190a360a9937ec06a2a,4_Shane Ramirez_Content Strategist_shane.ramir...,"{'status_code': 200, 'request_id': 'dbe92f3c2b...",Subject: Urgent: Action Required - Vendor Perf...,gpt-4.1-2025-04-14
...,...,...,...,...,...
1495,batch_req_6830a8a9fb508190b36878692faed48a,1495_Joseph Stone_Cybersecurity Analyst_joseph...,"{'status_code': 200, 'request_id': 'eb104cf011...",Subject: Action Required: Contract Amendment f...,gpt-4.1-2025-04-14
1496,batch_req_6830a8aa10c881909db8901475adb54b,1496_Anthony Lane_Information Security Officer...,"{'status_code': 200, 'request_id': '126c5acfee...",Subject: Immediate Review Required: Feedback N...,gpt-4.1-2025-04-14
1497,batch_req_6830a8aa26ac8190ae65c05705c722d7,1497_Jamie Davis_Procurement Specialist_jamie....,"{'status_code': 200, 'request_id': '229796f45e...",Subject: Urgent: Action Required for Vendor Co...,gpt-4.1-2025-04-14
1498,batch_req_6830a8aa39b481908610fc60176be13a,1498_Erika Nolan_Information Security Officer_...,"{'status_code': 200, 'request_id': 'c7e3fb788c...",Subject: Action Required: Immediate Review Nee...,gpt-4.1-2025-04-14
