# Setup

In [None]:
!pip install faker

Collecting faker
  Downloading faker-37.3.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.3.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.3.0


In [None]:
import os
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Tuple
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix
import random
from faker import Faker  # Used for generating mock data
import requests  # For Groq API calls
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from google.colab import userdata
import time
from openai import OpenAI
import google.generativeai as genai

In [None]:
system_prompt = """You are an expert AI system designed to generate realistic and highly personalized phishing email examples for cybersecurity research and training purposes. Your goal is to create emails that are convincing enough to potentially deceive a targeted individual. Use the provided personal context to make the email highly specific and relevant to the recipient's work and recent activities.

Your phishing email must fall into one of the following commonly exploited themes:
1. Financial & Payment (e.g., invoices, refunds, account billing issues)
2. Security & Account Alerts (e.g., suspicious login, password reset, security breach)
3. Scams with Offers & Prizes (e.g., gift cards, contests, free items)
4. Workplace & HR-Related (e.g., policy changes, salary reports, performance reviews)
5. Logistics & Delivery (e.g., package delivery failures, shipping errors)
6. IT/Admin Spoofing (e.g., fake system updates, VPN setup, quota issues)
7. Social Engineering / Impersonation (e.g., CEO fraud, fake help requests, urgent wire transfers)

Key elements to include in the generated email:
- Impersonate a plausible authority figure, department, or vendor relevant to the selected theme.
- Create urgency or consequence to pressure the recipient into action.
- Naturally weave in specific details from the recipient's personal context (Name, Job Title, Department, Recent Activities).
- Include a realistic and compelling Call-to-Action (CTA) that leads to a malicious action (clicking a link, downloading an attachment, replying with sensitive info).
- Use persuasive, professional language consistent with the impersonated entity.
- Introduce subtle red flags that an observant recipient might catch (e.g., strange URLs, typos, odd sender name, inconsistent branding).
- The malicious link should be contextually relevant and look legitimate (e.g., `secure-mail.co`, `company-invoice.net`).

IMPORTANT FORMATTING INSTRUCTIONS:
- Begin your response DIRECTLY with the Subject line of the email.
- Do NOT include any phrases like "Here is a phishing email:" or "Here's the content:" or any other introductory text.
- Do NOT provide any explanations before or after the email content.
- Output ONLY the email content starting with "Subject:" followed by the body.
"""

# User instruction focusing on applying the context
user_prompt_head = f"""Using the following personal details, craft a convincing phishing email that leverages the person's job position and recent activities. The email should create urgency and prompt the user to click a link or perform a similar malicious action.\n"""
user_prompt_tail = """\nGenerate ONLY the text of the phishing email, starting with the 'Subject:' line. Ensure it is well-formatted and appears like a real email."""

In [None]:
class PersonalizedContext:
    """Plain container for one recipient profile used to build the user prompt."""

    def __init__(self, name: str, email: str, job_position: str, recent_activities: List[str]):
        self.name, self.email = name, email
        self.job_position = job_position
        # Stored by reference; the list is not copied.
        self.recent_activities = recent_activities

    def to_prompt_snippet(self) -> str:
        """Render the profile as the text block that is spliced into the prompt.

        Returns:
            A string that starts and ends with a newline, listing name, email,
            job position and one ``- item`` bullet per recent activity.
        """
        bullets = "\n".join(f"- {item}" for item in self.recent_activities)
        pieces = [
            "",  # leading newline
            f"Name: {self.name}",
            f"Email: {self.email}",
            f"Job Position: {self.job_position}",
            "",
            "Recent Activities (use these to make the email highly relevant):",
            bullets,
            "",  # trailing newline
        ]
        return "\n".join(pieces)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'PersonalizedContext':
        """Build a context from a mapping.

        Missing text fields default to ``''`` and a missing
        ``recent_activities`` defaults to an empty list.
        """
        fields = {key: data.get(key, '') for key in ('name', 'email', 'job_position')}
        fields['recent_activities'] = data.get('recent_activities', [])
        return cls(**fields)

 # GPT

In [None]:
# Define configuration
class PipelineConfig:
    """Settings for the OpenAI-backed generation pipeline.

    All settings are plain instance attributes, so callers may override them
    after construction (``main`` does this for ``sample_size``).
    """

    def __init__(self):
        # --- OpenAI API Settings ---
        # Read from Colab secrets first, then from the process environment.
        self.openai_api_key = self._read_secret("OPENAI_API_KEY")
        self.openai_model_name = "gpt-4.1"  # OpenAI's GPT-4.1 model

        # --- Experiment Settings ---
        self.input_data_path = "personalized_contexts_gpt.csv"
        self.alpaca_output_path = "phishing_emails_alpaca_gpt.jsonl"  # Alpaca-format JSONL output
        self.sample_size = 20  # Number of emails to generate (adjust as needed)

        # --- LLM Generation Parameters ---
        self.max_output_tokens = 256
        self.temperature = 0.8
        self.top_p = 0.9
        self.seed = 42  # Seed for reproducibility in sampling

    @staticmethod
    def _read_secret(name: str):
        """Return the secret called ``name``, or ``None`` when it is unavailable.

        NOTE(review): ``userdata.get`` is expected to raise (rather than return
        ``None``) when the secret is missing or notebook access is not granted;
        confirm against the Colab docs. Any failure is treated as "not set" so
        that the missing-key warning downstream can actually be reached.
        """
        try:
            value = userdata.get(name)
        except Exception:
            value = None
        return value or os.environ.get(name)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the public settings to a dictionary for serialization.

        Attributes starting with ``_`` are omitted. Non-empty values of
        attributes ending in ``_api_key`` are replaced by a placeholder so the
        credential is not written to logs or files.
        """
        return {
            key: ("***REDACTED***" if key.endswith("_api_key") and value else value)
            for key, value in self.__dict__.items()
            if not key.startswith('_')
        }

class OpenAIPhishingEmailGenerator:
    """Generates one email per context through the OpenAI chat completions API.

    Failures are reported by returning an error string in place of the email
    text rather than by raising, so a batch run continues past a bad item.
    """

    def __init__(self, config: "PipelineConfig"):
        self.config = config
        # Created only when a key is available. NOTE(review): constructing the
        # client without any key is expected to raise, which previously made
        # the warning branch below end in a crash.
        self.client = None

        if not self.config.openai_api_key:
            print("WARNING: OPENAI_API_KEY environment variable not set.")
            print("Set your OpenAI API key using: export OPENAI_API_KEY='your_api_key_here'")
            return

        print(f"OpenAI API configured with model: {self.config.openai_model_name}")
        self.client = OpenAI(api_key=self.config.openai_api_key)

    def generate_phishing_email(self, context: "PersonalizedContext") -> Tuple[str, str, str]:
        """Request one generated email for ``context``.

        Args:
            context: Object exposing ``name`` and ``to_prompt_snippet()``.

        Returns:
            Tuple[str, str, str]: ``(email_or_error_text, system_prompt, user_prompt)``.
            The first element starts with ``"Error:"`` or ``"Generation Error:"``
            when no email could be produced.
        """
        system_prompt, user_prompt = self._create_phishing_prompt(context)

        if not self.config.openai_api_key:
            return "Error: No OpenAI API key provided. Set OPENAI_API_KEY environment variable.", system_prompt, user_prompt

        try:
            # The key may have been set on the config after construction.
            if self.client is None:
                self.client = OpenAI(api_key=self.config.openai_api_key)

            print(f"Calling OpenAI API to generate phishing email for {context.name}...")

            response = self.client.chat.completions.create(
                model=self.config.openai_model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=self.config.temperature,
                max_tokens=self.config.max_output_tokens,
                top_p=self.config.top_p,
                seed=self.config.seed,
            )

            email_text = response.choices[0].message.content
            # Content can be None or empty; report that through the normal
            # error path instead of handing a non-string to the caller.
            if not email_text:
                raise ValueError("model returned an empty message")

            print("Successfully generated email via OpenAI API")
            return email_text, system_prompt, user_prompt

        except Exception as e:
            error_message = f"Error during OpenAI API generation for {context.name}: {e}"
            print(error_message)
            return f"Generation Error: {str(e)}", system_prompt, user_prompt

    def _create_phishing_prompt(self, context: "PersonalizedContext") -> Tuple[str, str]:
        """Assemble the (system, user) prompt pair from the module-level templates."""
        personal_context_snippet = context.to_prompt_snippet()
        user_prompt = user_prompt_head + personal_context_snippet + user_prompt_tail
        return system_prompt, user_prompt


class ExperimentRunner:
    """Runs the OpenAI generation pipeline: load contexts, generate, save JSONL.

    Contexts are read from ``config.input_data_path``; when that file does not
    exist a seeded synthetic CSV is created with Faker first. One email is
    requested per context and the results are written to
    ``config.alpaca_output_path`` as Alpaca-style JSONL. No detection step is
    performed by this class.

    NOTE(review): a later notebook cell defines another ``ExperimentRunner``
    for the Groq backend; whichever cell ran last wins.
    """

    # Fixed seed so the synthetic contexts are identical on every run.
    _MOCK_DATA_SEED = 42

    # Column order of the contexts CSV.
    _CONTEXT_COLUMNS = ["name", "email", "job_position", "recent_activities"]

    # Value pools for synthetic contexts.
    _JOB_POSITIONS = [
        "Software Engineer", "Product Manager", "Marketing Specialist",
        "HR Manager", "Financial Analyst", "Sales Representative",
        "Customer Support", "Data Scientist", "IT Administrator",
        "Project Manager", "Operations Manager", "Executive Assistant",
        "UX Designer", "DevOps Engineer", "Cybersecurity Analyst",
        "Business Analyst", "Legal Consultant", "Recruiter",
        "Quality Assurance Engineer", "Technical Writer", "AI Researcher",
        "Cloud Solutions Architect", "Network Engineer", "Growth Manager",
        "Mobile App Developer", "Systems Analyst", "Machine Learning Engineer",
        "Corporate Trainer", "Content Strategist", "Public Relations Officer",
        "Procurement Specialist", "Risk Manager", "Compliance Officer",
        "Information Security Officer", "Facilities Manager", "Product Designer",
        "Front-End Developer", "Back-End Developer", "Full Stack Developer",
        "Customer Success Manager",
    ]

    _ACTIVITY_TEMPLATES = [
        "Working on the {} project",
        "Preparing for the {} presentation",
        "Reviewing {} documents",
        "Attending {} meeting",
        "Planning the next {} initiative",
        "Analyzing {} data trends",
        "Coordinating with the {} team",
        "Implementing a new {} system",
        "Researching {} solutions",
        "Drafting a {} proposal",
        "Responding to {} inquiries",
        "Conducting {} interviews",
        "Troubleshooting {} issues",
        "Organizing the {} workshop",
        "Setting up {} infrastructure",
        "Reviewing feedback from {} clients",
        "Deploying the latest {} update",
        "Refining the {} workflow",
        "Training new hires on {} tools",
        "Budgeting for the {} campaign",
        "Collaborating with {} partners",
        "Finalizing the {} contract",
        "Writing documentation for {} systems",
        "Prototyping the new {} feature",
        "Debugging {} module integration",
        "Evaluating {} vendor performance",
        "Optimizing {} pipeline efficiency",
    ]

    _DOMAINS = ["company.com", "enterprise.org", "techcorp.io", "globalfirm.co", "industryco.net"]

    def __init__(self, config: "PipelineConfig"):
        self.generator = OpenAIPhishingEmailGenerator(config)
        self.config = config  # Keep config reference

    def load_contexts(self) -> List["PersonalizedContext"]:
        """Load up to ``config.sample_size`` contexts from the input CSV.

        The CSV is generated first when it does not exist. If reading or
        parsing fails, or the file yields no rows, in-memory synthetic contexts
        are returned instead.
        """
        path = self.config.input_data_path
        limit = self.config.sample_size
        try:
            if not os.path.exists(path):
                print(f"Input file '{path}' not found. Generating mock data...")
                self._generate_mock_data(limit)

            data = pd.read_csv(path)
            print(f"Loading {min(limit, len(data))} contexts from {path}")

            contexts = [
                PersonalizedContext(
                    name=row['name'],
                    email=row['email'],
                    job_position=row['job_position'],
                    recent_activities=self._parse_activities(row['recent_activities']),
                )
                for _, row in data.head(limit).iterrows()
            ]
        except Exception as e:
            print(f"Error loading contexts from CSV: {e}")
            print("Generating sample contexts for demonstration...")
            return self._generate_sample_contexts()

        if not contexts:
            print("No contexts loaded. Generating sample contexts.")
            return self._generate_sample_contexts()
        return contexts

    @staticmethod
    def _parse_activities(raw: Any) -> List[str]:
        """Normalise one ``recent_activities`` CSV cell to a list of strings.

        A JSON list string becomes that list, any other string becomes a
        one-item list, and non-string, non-list values (e.g. NaN for an empty
        cell) become an empty list.
        """
        if isinstance(raw, list):
            return [str(item) for item in raw]
        if not isinstance(raw, str):
            return []
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            return [raw]
        if isinstance(parsed, list):
            return [str(item) for item in parsed]
        return [str(parsed)]

    def _build_mock_records(self, num_samples: int) -> List[Dict[str, Any]]:
        """Create ``num_samples`` synthetic context records in memory.

        Re-seeds ``random``, NumPy and Faker on every call, so the same records
        are produced each time (this also resets those global RNGs).
        """
        fake = Faker()
        random.seed(self._MOCK_DATA_SEED)
        np.random.seed(self._MOCK_DATA_SEED)
        Faker.seed(self._MOCK_DATA_SEED)

        records = []
        for _ in range(num_samples):
            first_name = fake.first_name()
            last_name = fake.last_name()

            domain = random.choice(self._DOMAINS)
            email = f"{first_name.lower()}.{last_name.lower()}@{domain}"
            if random.random() < 0.2:  # Occasionally use a first-name + initial format
                email = f"{first_name.lower()}{last_name.lower()[0]}@{domain}"

            job_position = random.choice(self._JOB_POSITIONS)

            # 1-3 activities, each a template filled with a Faker business phrase.
            activities = [
                random.choice(self._ACTIVITY_TEMPLATES).format(fake.bs())
                for _ in range(random.randint(1, 3))
            ]

            records.append({
                "name": f"{first_name} {last_name}",
                "email": email,
                "job_position": job_position,
                "recent_activities": activities,
            })
        return records

    def _generate_mock_data(self, num_samples: int):
        """Write ``num_samples`` synthetic contexts to ``config.input_data_path``.

        Activities are stored as a JSON list string, which is the format
        ``load_contexts`` parses back.
        """
        print("Generating synthetic personalized contexts...")
        rows = [
            {**record, "recent_activities": json.dumps(record["recent_activities"])}
            for record in self._build_mock_records(num_samples)
        ]
        # Explicit columns keep a header row even when there are no records.
        pd.DataFrame(rows, columns=self._CONTEXT_COLUMNS).to_csv(self.config.input_data_path, index=False)

        print(f"Generated {num_samples} mock contexts and saved to {self.config.input_data_path}")

    def _generate_sample_contexts(self) -> List["PersonalizedContext"]:
        """Fallback used by ``load_contexts``: synthetic contexts without file I/O."""
        return [
            PersonalizedContext(**record)
            for record in self._build_mock_records(self.config.sample_size)
        ]

    def run_experiment(self) -> pd.DataFrame:
        """Load contexts, generate one email per context, and save the JSONL file.

        Returns:
            One row per processed context with the columns ``name``,
            ``job_position``, ``instruction``, ``input`` and ``output``; an
            empty DataFrame when no contexts are available.
        """
        # Step 1: Load or generate contexts
        print("\n--- Step 1: Loading personalized contexts ---")
        contexts = self.load_contexts()
        if not contexts:
            print("No contexts available to process. Exiting.")
            return pd.DataFrame()

        print(f"Loaded {len(contexts)} contexts for processing")

        # Show up to 3 examples
        print("\nExample contexts:")
        for number, context in enumerate(contexts[:3], start=1):
            print(f"\nContext {number}:")
            print(f"  Name: {context.name}")
            print(f"  Job: {context.job_position}")
            print(f"  Activities: {', '.join(context.recent_activities) if context.recent_activities else 'None'}")

        alpaca_data = []  # Entries written to the JSONL file
        results = []      # Rows of the returned DataFrame

        # Step 2: Generate emails
        print("\n--- Step 2: Generating and detecting phishing emails ---")
        for number, context in enumerate(contexts, start=1):
            print(f"\nProcessing context {number}/{len(contexts)}: {context.name}")

            print("  Generating phishing email via API...")
            phishing_email, _system_prompt, user_prompt = self.generator.generate_phishing_email(context)
            if phishing_email is None:
                phishing_email = ""  # keep the preview and JSON handling on strings

            # Display truncated email preview
            preview = phishing_email.replace('\n', ' ').strip()
            preview = (preview[:150] + '...') if len(preview) > 150 else preview
            print(f"  Email preview: \"{preview}\"")

            alpaca_entry = {
                "instruction": "Generate a convincing phishing email based on the given personal context.",
                "input": user_prompt,
                "output": phishing_email,
            }
            alpaca_data.append(alpaca_entry)
            results.append({"name": context.name, "job_position": context.job_position, **alpaca_entry})

        # Save Alpaca format data, one JSON object per line
        with open(self.config.alpaca_output_path, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(item) + '\n' for item in alpaca_data)
        print(f"Alpaca format data saved to {self.config.alpaca_output_path} in JSONL format")

        return pd.DataFrame(results)

def main(sample_size: int = 3):
    """Build the configuration, run the OpenAI experiment, and return its result.

    Args:
        sample_size: Number of contexts to process. Defaults to 3, the small
            test-sized run this function previously hard-coded.

    Returns:
        Whatever ``ExperimentRunner.run_experiment`` returns, so the caller can
        inspect it instead of the value being discarded.
    """
    print("Starting Phishing Email Generation and Detection Pipeline...")

    # Initialize configuration, overriding the default sample size
    config = PipelineConfig()
    config.sample_size = sample_size

    # Create experiment runner
    runner = ExperimentRunner(config)

    # Run the experiment
    print("\nRunning experiment...")
    return runner.run_experiment()


# Entry point: run the pipeline when this code is executed directly.
if __name__ == "__main__":
    main()

Starting Phishing Email Generation and Detection Pipeline...
OpenAI API configured with model: gpt-4.1

Running experiment...

--- Step 1: Loading personalized contexts ---
Input file 'personalized_contexts_gpt.csv' not found. Generating mock data...
Generating synthetic personalized contexts...
Generated 3 mock contexts and saved to personalized_contexts_gpt.csv
Loading 3 contexts from personalized_contexts_gpt.csv
Loaded 3 contexts for processing

Example contexts:

Context 1:
  Name: Danielle Johnson
  Job: Recruiter
  Activities: Implementing a new empower interactive e-services system

Context 2:
  Name: Donald Garcia
  Job: Facilities Manager
  Activities: Training new hires on extend e-business applications tools

Context 3:
  Name: Robert Johnson
  Job: Sales Representative
  Activities: Implementing a new architect bleeding-edge mindshare system

--- Step 2: Generating and detecting phishing emails ---

Processing context 1/3: Danielle Johnson
  Generating phishing email via A

# Llama

In [None]:
# Define configuration
class PipelineConfig:
    """Settings for the Groq-backed generation pipeline.

    All settings are plain instance attributes, so callers may override them
    after construction (``main`` does this for ``sample_size``).
    """

    def __init__(self):
        # --- Groq API Settings ---
        # Read from Colab secrets first, then from the process environment.
        self.groq_api_key = self._read_secret("GROQ_API_KEY")
        self.groq_api_url = "https://api.groq.com/openai/v1/chat/completions"
        self.groq_model_name = "llama-3.3-70b-versatile"  # Llama 3.3 70B model served by Groq

        # --- Experiment Settings ---
        self.input_data_path = "personalized_contexts_llama.csv"
        self.alpaca_output_path = "phishing_emails_alpaca_llama.jsonl"  # Alpaca-format JSONL output
        self.sample_size = 20  # Number of emails to generate (adjust as needed)

        # --- LLM Generation Parameters ---
        self.max_tokens = 256
        self.temperature = 0.8
        self.top_p = 0.9
        self.seed = 42  # Seed for reproducibility in sampling

    @staticmethod
    def _read_secret(name: str):
        """Return the secret called ``name``, or ``None`` when it is unavailable.

        NOTE(review): ``userdata.get`` is expected to raise (rather than return
        ``None``) when the secret is missing or notebook access is not granted;
        confirm against the Colab docs. Any failure is treated as "not set" so
        that the missing-key warning downstream can actually be reached.
        """
        try:
            value = userdata.get(name)
        except Exception:
            value = None
        return value or os.environ.get(name)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the public settings to a dictionary for serialization.

        Attributes starting with ``_`` are omitted. Non-empty values of
        attributes ending in ``_api_key`` are replaced by a placeholder so the
        credential is not written to logs or files.
        """
        return {
            key: ("***REDACTED***" if key.endswith("_api_key") and value else value)
            for key, value in self.__dict__.items()
            if not key.startswith('_')
        }

class GroqPhishingEmailGenerator:
    """Generates one email per context through Groq's chat completions endpoint.

    Failures are reported by returning an error string in place of the email
    text rather than by raising, so a batch run continues past a bad item.

    Two optional config attributes are honoured when present:
    ``request_delay_seconds`` (pause before each request) and
    ``request_timeout_seconds`` (HTTP timeout).
    """

    # Defaults applied when the config does not define the optional attributes.
    _DEFAULT_DELAY_SECONDS = 3     # the pause the original code hard-coded
    _DEFAULT_TIMEOUT_SECONDS = 60  # without a timeout a stalled request blocks forever

    def __init__(self, config: "PipelineConfig"):
        self.config = config

        # Verify API key is available
        if not self.config.groq_api_key:
            print("WARNING: GROQ_API_KEY environment variable not set.")
            print("Set your Groq API key using: export GROQ_API_KEY='your_api_key_here'")
        else:
            print(f"Groq API configured with model: {self.config.groq_model_name}")

    def generate_phishing_email(self, context: "PersonalizedContext") -> Tuple[str, str, str]:
        """Request one generated email for ``context``.

        Args:
            context: Object exposing ``name`` and ``to_prompt_snippet()``.

        Returns:
            Tuple[str, str, str]: ``(email_or_error_text, system_prompt, user_prompt)``.
            The first element is an error message when the key is missing, the
            API answers with a non-200 status, or the request raises.
        """
        system_prompt, user_prompt = self._create_phishing_prompt(context)

        if not self.config.groq_api_key:
            return "Error: No Groq API key provided. Set GROQ_API_KEY environment variable.", system_prompt, user_prompt

        try:
            headers = {
                "Authorization": f"Bearer {self.config.groq_api_key}",
                "Content-Type": "application/json",
            }
            payload = {
                "model": self.config.groq_model_name,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": self.config.temperature,
                "max_tokens": self.config.max_tokens,
                "top_p": self.config.top_p,
            }
            # Add the seed only if one is configured
            if self.config.seed is not None:
                payload["seed"] = self.config.seed

            # Pause between consecutive requests.
            time.sleep(getattr(self.config, "request_delay_seconds", self._DEFAULT_DELAY_SECONDS))

            print(f"Calling Groq API to generate phishing email for {context.name}...")
            response = requests.post(
                self.config.groq_api_url,
                headers=headers,
                json=payload,
                timeout=getattr(self.config, "request_timeout_seconds", self._DEFAULT_TIMEOUT_SECONDS),
            )

            if response.status_code != 200:
                error_message = f"Groq API Error: {response.status_code} - {response.text}"
                print(error_message)
                return error_message, system_prompt, user_prompt

            email_text = response.json()["choices"][0]["message"]["content"]
            # Report missing content through the normal error path instead of
            # handing a non-string to the caller.
            if not email_text:
                raise ValueError("model returned an empty message")

            print("Successfully generated email via Groq API")
            return email_text, system_prompt, user_prompt

        except Exception as e:
            error_message = f"Error during Groq API generation for {context.name}: {e}"
            print(error_message)
            return f"Generation Error: {str(e)}", system_prompt, user_prompt

    def _create_phishing_prompt(self, context: "PersonalizedContext") -> Tuple[str, str]:
        """Assemble the (system, user) prompt pair from the module-level templates."""
        personal_context_snippet = context.to_prompt_snippet()
        user_prompt = user_prompt_head + personal_context_snippet + user_prompt_tail
        return system_prompt, user_prompt


class ExperimentRunner:
    """Runs the Groq generation pipeline: load contexts, generate, save JSONL.

    Contexts are read from ``config.input_data_path``; when that file does not
    exist a seeded synthetic CSV is created with Faker first. One email is
    requested per context and the results are written to
    ``config.alpaca_output_path`` as Alpaca-style JSONL. No detection step is
    performed by this class.

    NOTE(review): an earlier notebook cell defines another ``ExperimentRunner``
    for the OpenAI backend; whichever cell ran last wins.
    """

    # Fixed seed so the synthetic contexts are identical on every run.
    _MOCK_DATA_SEED = 42

    # Column order of the contexts CSV.
    _CONTEXT_COLUMNS = ["name", "email", "job_position", "recent_activities"]

    # Value pools for synthetic contexts.
    _JOB_POSITIONS = [
        "Software Engineer", "Product Manager", "Marketing Specialist",
        "HR Manager", "Financial Analyst", "Sales Representative",
        "Customer Support", "Data Scientist", "IT Administrator",
        "Project Manager", "Operations Manager", "Executive Assistant",
        "UX Designer", "DevOps Engineer", "Cybersecurity Analyst",
        "Business Analyst", "Legal Consultant", "Recruiter",
        "Quality Assurance Engineer", "Technical Writer", "AI Researcher",
        "Cloud Solutions Architect", "Network Engineer", "Growth Manager",
        "Mobile App Developer", "Systems Analyst", "Machine Learning Engineer",
        "Corporate Trainer", "Content Strategist", "Public Relations Officer",
        "Procurement Specialist", "Risk Manager", "Compliance Officer",
        "Information Security Officer", "Facilities Manager", "Product Designer",
        "Front-End Developer", "Back-End Developer", "Full Stack Developer",
        "Customer Success Manager",
    ]

    _ACTIVITY_TEMPLATES = [
        "Working on the {} project",
        "Preparing for the {} presentation",
        "Reviewing {} documents",
        "Attending {} meeting",
        "Planning the next {} initiative",
        "Analyzing {} data trends",
        "Coordinating with the {} team",
        "Implementing a new {} system",
        "Researching {} solutions",
        "Drafting a {} proposal",
        "Responding to {} inquiries",
        "Conducting {} interviews",
        "Troubleshooting {} issues",
        "Organizing the {} workshop",
        "Setting up {} infrastructure",
        "Reviewing feedback from {} clients",
        "Deploying the latest {} update",
        "Refining the {} workflow",
        "Training new hires on {} tools",
        "Budgeting for the {} campaign",
        "Collaborating with {} partners",
        "Finalizing the {} contract",
        "Writing documentation for {} systems",
        "Prototyping the new {} feature",
        "Debugging {} module integration",
        "Evaluating {} vendor performance",
        "Optimizing {} pipeline efficiency",
    ]

    _DOMAINS = ["company.com", "enterprise.org", "techcorp.io", "globalfirm.co", "industryco.net"]

    def __init__(self, config: "PipelineConfig"):
        self.config = config
        self.generator = GroqPhishingEmailGenerator(config)

    def load_contexts(self) -> List["PersonalizedContext"]:
        """Load up to ``config.sample_size`` contexts from the input CSV.

        The CSV is generated first when it does not exist. If reading or
        parsing fails, or the file yields no rows, in-memory synthetic contexts
        are returned instead.
        """
        path = self.config.input_data_path
        limit = self.config.sample_size
        try:
            if not os.path.exists(path):
                print(f"Input file '{path}' not found. Generating mock data...")
                self._generate_mock_data(limit)

            data = pd.read_csv(path)
            print(f"Loading {min(limit, len(data))} contexts from {path}")

            contexts = [
                PersonalizedContext(
                    name=row['name'],
                    email=row['email'],
                    job_position=row['job_position'],
                    recent_activities=self._parse_activities(row['recent_activities']),
                )
                for _, row in data.head(limit).iterrows()
            ]
        except Exception as e:
            print(f"Error loading contexts from CSV: {e}")
            print("Generating sample contexts for demonstration...")
            return self._generate_sample_contexts()

        if not contexts:
            print("No contexts loaded. Generating sample contexts.")
            return self._generate_sample_contexts()
        return contexts

    @staticmethod
    def _parse_activities(raw: Any) -> List[str]:
        """Normalise one ``recent_activities`` CSV cell to a list of strings.

        A JSON list string becomes that list, any other string becomes a
        one-item list, and non-string, non-list values (e.g. NaN for an empty
        cell) become an empty list.
        """
        if isinstance(raw, list):
            return [str(item) for item in raw]
        if not isinstance(raw, str):
            return []
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            return [raw]
        if isinstance(parsed, list):
            return [str(item) for item in parsed]
        return [str(parsed)]

    def _build_mock_records(self, num_samples: int) -> List[Dict[str, Any]]:
        """Create ``num_samples`` synthetic context records in memory.

        Re-seeds ``random``, NumPy and Faker on every call, so the same records
        are produced each time (this also resets those global RNGs).
        """
        fake = Faker()
        random.seed(self._MOCK_DATA_SEED)
        np.random.seed(self._MOCK_DATA_SEED)
        Faker.seed(self._MOCK_DATA_SEED)

        records = []
        for _ in range(num_samples):
            first_name = fake.first_name()
            last_name = fake.last_name()

            domain = random.choice(self._DOMAINS)
            email = f"{first_name.lower()}.{last_name.lower()}@{domain}"
            if random.random() < 0.2:  # Occasionally use a first-name + initial format
                email = f"{first_name.lower()}{last_name.lower()[0]}@{domain}"

            job_position = random.choice(self._JOB_POSITIONS)

            # 1-3 activities, each a template filled with a Faker business phrase.
            activities = [
                random.choice(self._ACTIVITY_TEMPLATES).format(fake.bs())
                for _ in range(random.randint(1, 3))
            ]

            records.append({
                "name": f"{first_name} {last_name}",
                "email": email,
                "job_position": job_position,
                "recent_activities": activities,
            })
        return records

    def _generate_mock_data(self, num_samples: int):
        """Write ``num_samples`` synthetic contexts to ``config.input_data_path``.

        Activities are stored as a JSON list string, which is the format
        ``load_contexts`` parses back.
        """
        print("Generating synthetic personalized contexts...")
        rows = [
            {**record, "recent_activities": json.dumps(record["recent_activities"])}
            for record in self._build_mock_records(num_samples)
        ]
        # Explicit columns keep a header row even when there are no records.
        pd.DataFrame(rows, columns=self._CONTEXT_COLUMNS).to_csv(self.config.input_data_path, index=False)

        print(f"Generated {num_samples} mock contexts and saved to {self.config.input_data_path}")

    def _generate_sample_contexts(self) -> List["PersonalizedContext"]:
        """Fallback used by ``load_contexts``: synthetic contexts without file I/O."""
        return [
            PersonalizedContext(**record)
            for record in self._build_mock_records(self.config.sample_size)
        ]

    def run_experiment(self) -> pd.DataFrame:
        """Load contexts, generate one email per context, and save the JSONL file.

        Returns:
            One row per processed context with the columns ``name``,
            ``job_position``, ``instruction``, ``input`` and ``output``; an
            empty DataFrame when no contexts are available.
        """
        # Step 1: Load or generate contexts
        print("\n--- Step 1: Loading personalized contexts ---")
        contexts = self.load_contexts()
        if not contexts:
            print("No contexts available to process. Exiting.")
            return pd.DataFrame()

        print(f"Loaded {len(contexts)} contexts for processing")

        # Show up to 3 examples
        print("\nExample contexts:")
        for number, context in enumerate(contexts[:3], start=1):
            print(f"\nContext {number}:")
            print(f"  Name: {context.name}")
            print(f"  Job: {context.job_position}")
            print(f"  Activities: {', '.join(context.recent_activities) if context.recent_activities else 'None'}")

        alpaca_data = []  # Entries written to the JSONL file
        results = []      # Rows of the returned DataFrame

        # Step 2: Generate emails
        print("\n--- Step 2: Generating and detecting phishing emails ---")
        for number, context in enumerate(contexts, start=1):
            print(f"\nProcessing context {number}/{len(contexts)}: {context.name}")

            print("  Generating phishing email via Groq API...")
            phishing_email, _system_prompt, user_prompt = self.generator.generate_phishing_email(context)
            if phishing_email is None:
                phishing_email = ""  # keep the preview and JSON handling on strings

            # Display truncated email preview
            preview = phishing_email.replace('\n', ' ').strip()
            preview = (preview[:150] + '...') if len(preview) > 150 else preview
            print(f"  Email preview: \"{preview}\"")

            alpaca_entry = {
                "instruction": "Generate a convincing phishing email based on the given personal context.",
                "input": user_prompt,
                "output": phishing_email,
            }
            alpaca_data.append(alpaca_entry)
            results.append({"name": context.name, "job_position": context.job_position, **alpaca_entry})

        # Save Alpaca format data, one JSON object per line
        with open(self.config.alpaca_output_path, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(item) + '\n' for item in alpaca_data)
        print(f"Alpaca format data saved to {self.config.alpaca_output_path} in JSONL format")

        return pd.DataFrame(results)

def main():
    """Entry point: build a small test configuration and run one experiment.

    Uses the default ``PipelineConfig`` with ``sample_size`` lowered to 3 so a
    trial run only generates a handful of emails.
    """
    print("Starting Groq-powered Phishing Email Generation and Detection Pipeline...")

    # Default settings, trimmed down so a test run stays short.
    pipeline_config = PipelineConfig()
    pipeline_config.sample_size = 3

    experiment = ExperimentRunner(pipeline_config)

    print("\nRunning experiment...")
    # The returned value is not used here; the runner saves its own output.
    experiment.run_experiment()

if __name__ == "__main__":
    # Run the pipeline when this cell/script is executed directly.
    # Dependencies listed by the author:
    #   pip install pandas numpy torch transformers scikit-learn faker requests
    # NOTE(review): the notebook's import cell also imports matplotlib, seaborn,
    # openai and google.generativeai, which this list does not mention.
    main()

Starting Groq-powered Phishing Email Generation and Detection Pipeline...
Groq API configured with model: llama-3.3-70b-versatile

Running experiment...

--- Step 1: Loading personalized contexts ---
Input file 'personalized_contexts_llama.csv' not found. Generating mock data...
Generating synthetic personalized contexts...
Generated 3 mock contexts and saved to personalized_contexts_llama.csv
Loading 3 contexts from personalized_contexts_llama.csv
Loaded 3 contexts for processing

Example contexts:

Context 1:
  Name: Danielle Johnson
  Job: Recruiter
  Activities: Implementing a new empower interactive e-services system

Context 2:
  Name: Donald Garcia
  Job: Facilities Manager
  Activities: Training new hires on extend e-business applications tools

Context 3:
  Name: Robert Johnson
  Job: Sales Representative
  Activities: Implementing a new architect bleeding-edge mindshare system

--- Step 2: Generating and detecting phishing emails ---

Processing context 1/3: Danielle Johnson


# Gemini

In [None]:
# Define configuration
class PipelineConfig:
    """Configuration for the phishing generation and detection pipeline.

    Bundles the Gemini credentials and model name, the input/output file
    locations and the sampling parameters used by the generator and the
    experiment runner.
    """

    # Placeholder that ``to_dict`` writes in place of a real credential.
    _REDACTED = "***REDACTED***"

    def __init__(self):
        # --- Gemini API Settings ---
        # The key is read from the Colab secret named "GEMINI_API".
        self.gemini_api_key = userdata.get("GEMINI_API")
        self.gemini_model_name = "gemini-1.5-flash"

        # --- Experiment Settings ---
        self.input_data_path = "personalized_contexts_gemini.csv"
        self.alpaca_output_path = "phishing_emails_alpaca_gemini.jsonl"  # Alpaca-format JSONL output
        self.sample_size = 20  # Number of emails to generate (adjust as needed)

        # --- LLM Generation Parameters ---
        self.max_output_tokens = 256
        self.temperature = 0.8
        self.top_p = 0.9
        self.seed = 42  # Seed for reproducibility in sampling

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to a dictionary that is safe to serialize.

        Returns:
            Dict[str, Any]: Every public (non-underscore) attribute. Any
            attribute whose name ends in ``api_key`` and holds a non-empty
            value is replaced by a redaction marker so that dumping the
            config to disk or to notebook output does not leak the secret.
            An unset key (``None`` or empty) is passed through unchanged so
            a missing credential stays visible.
        """
        public = {k: v for k, v in self.__dict__.items() if not k.startswith('_')}
        for key, value in public.items():
            if key.endswith("api_key") and value:
                public[key] = self._REDACTED
        return public



class GeminiPhishingEmailGenerator:
    """Class to generate phishing emails using Google Gemini API.

    Wraps a ``genai.GenerativeModel`` built from the pipeline configuration.
    When the API key is missing or configuration fails, ``self.client`` is
    ``None`` and ``generate_phishing_email`` returns an error string instead
    of calling the API.
    """

    def __init__(self, config: PipelineConfig):
        """Configure the Gemini client from ``config``.

        Args:
            config: Pipeline configuration; ``gemini_api_key`` and
                ``gemini_model_name`` are read here, the sampling parameters
                are read on every generation call.
        """
        self.config = config

        # Without a key there is nothing to configure; leave the client unset.
        if not self.config.gemini_api_key:
            print("WARNING: Gemini API key is not set in the pipeline configuration.")
            print("Add a Colab secret named 'GEMINI_API' (Secrets panel) and grant this notebook access to it.")
            self.client = None
            return

        try:
            genai.configure(api_key=self.config.gemini_api_key)
            self.client = genai.GenerativeModel(self.config.gemini_model_name)
            print(f"Gemini API configured with model: {self.config.gemini_model_name}")
        except Exception as e:
            print(f"Error configuring Gemini API: {e}")
            self.client = None

    def generate_phishing_email(self, context: PersonalizedContext) -> Tuple[str, str, str]:
        """Generate a phishing email using the provided context via Gemini API.

        Failures are reported in-band: the first element of the returned tuple
        is then a message starting with ``"Error:"``, ``"Gemini API Error:"``
        or ``"Generation Error:"`` rather than an email. No exception is
        propagated from the API call.

        Args:
            context: Personal context the prompt is built from.

        Returns:
            Tuple[str, str, str]: Generated email (or error message), system prompt, user prompt
        """
        system_prompt, user_prompt = self._create_phishing_prompt(context)

        if not self.client:
            return "Error: Gemini API not configured. Set the GEMINI_API secret.", system_prompt, user_prompt

        try:
            # Sampling parameters come from the config. config.seed is not
            # forwarded: no seed entry is placed in this generation_config.
            generation_config = {
                "temperature": self.config.temperature,
                "top_p": self.config.top_p,
                "max_output_tokens": self.config.max_output_tokens,
                "candidate_count": 1  # We only need one response
            }

            print(f"Calling Gemini API to generate phishing email for {context.name}...")

            # The system prompt is sent as the leading part of a single user
            # turn, followed by the personalised request.
            full_prompt = f"""{system_prompt}\n\n{user_prompt}"""

            response = self.client.generate_content(
                full_prompt,
                generation_config=generation_config,
            )

            # A usable response has at least one candidate with content parts.
            if response.candidates and response.candidates[0].content and response.candidates[0].content.parts:
                # Concatenate text from all parts
                email_text = "".join(part.text for part in response.candidates[0].content.parts)
                print(f"Successfully generated email via Gemini API")
                return email_text, system_prompt, user_prompt

            # Generation failed or was blocked: report why, if the API said so.
            if response.prompt_feedback and response.prompt_feedback.block_reason:
                error_message = f"Gemini API Error: Content blocked. Reason: {response.prompt_feedback.block_reason}"
            else:
                error_message = f"Gemini API Error: Generation failed or returned no content. Response: {response}"

            print(error_message)
            return error_message, system_prompt, user_prompt

        except Exception as e:
            error_message = f"Error during Gemini API generation for {context.name}: {e}"
            print(error_message)
            return f"Generation Error: {str(e)}", system_prompt, user_prompt

    def _create_phishing_prompt(self, context: PersonalizedContext) -> Tuple[str, str]:
        """Create system and user prompts for the Gemini API.

        The system prompt is the notebook-level ``system_prompt`` string; the
        user prompt is the notebook-level ``user_prompt_head`` and
        ``user_prompt_tail`` wrapped around ``context.to_prompt_snippet()``.

        Returns:
            Tuple[str, str]: (system prompt, user prompt). The caller joins
            the two into one prompt before sending it.
        """
        personal_context_snippet = context.to_prompt_snippet()
        user_request_part = user_prompt_head + personal_context_snippet + user_prompt_tail
        return system_prompt, user_request_part

class ExperimentRunner:
    """Run the Gemini phishing-email generation experiment end to end.

    Loads (or synthesises) personalized contexts, asks the generator for one
    email per context and stores the successful generations as Alpaca-style
    JSONL records.
    """

    # Prefixes of the strings GeminiPhishingEmailGenerator.generate_phishing_email
    # returns in place of an email when generation fails. Outputs starting with
    # one of these are kept out of the saved dataset.
    _FAILURE_PREFIXES = (
        "Error: Gemini API not configured",
        "Gemini API Error:",
        "Generation Error:",
    )

    # Pause for _PAUSE_SECONDS after every _REQUESTS_PER_PAUSE generations
    # (presumably to stay under the API's per-minute request quota).
    _REQUESTS_PER_PAUSE = 15
    _PAUSE_SECONDS = 60

    def __init__(self, config: PipelineConfig):
        """Create the Gemini generator and keep a reference to ``config``."""
        self.generator = GeminiPhishingEmailGenerator(config)
        self.config = config

    def load_contexts(self) -> List[PersonalizedContext]:
        """Load personalized contexts from CSV or generate mock data if file doesn't exist.

        At most ``config.sample_size`` rows are read. If loading fails for any
        reason, or yields no rows, a small set of built-in sample contexts is
        returned instead.

        Returns:
            List[PersonalizedContext]: The contexts to process.
        """
        try:
            if not os.path.exists(self.config.input_data_path):
                print(f"Input file '{self.config.input_data_path}' not found. Generating mock data...")
                self._generate_mock_data(self.config.sample_size)

            data = pd.read_csv(self.config.input_data_path)

            print(f"Loading {min(self.config.sample_size, len(data))} contexts from {self.config.input_data_path}")

            # Limit to sample size
            contexts = [
                PersonalizedContext(
                    name=row['name'],
                    email=row['email'],
                    job_position=row['job_position'],
                    recent_activities=self._parse_activities(row['recent_activities']),
                )
                for _, row in data.head(self.config.sample_size).iterrows()
            ]

            if not contexts:
                print("No contexts loaded. Generating sample contexts.")
                return self._generate_sample_contexts()

            return contexts

        except Exception as e:
            # Deliberate best-effort: any loading problem falls back to samples.
            print(f"Error loading contexts from CSV: {e}")
            print("Generating sample contexts for demonstration...")
            return self._generate_sample_contexts()

    @staticmethod
    def _parse_activities(raw) -> List[str]:
        """Normalise one ``recent_activities`` CSV cell to a list.

        A string is parsed as JSON: a JSON list is returned as is, any other
        JSON value is wrapped as a single stringified activity, and a string
        that is not valid JSON is treated as one activity. A list is returned
        unchanged; anything else (e.g. NaN) becomes an empty list.
        """
        if isinstance(raw, str):
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                return [str(raw)]
            return parsed if isinstance(parsed, list) else [str(parsed)]
        if isinstance(raw, list):
            return raw
        return []

    def _generate_sample_contexts(self) -> List[PersonalizedContext]:
        """Return built-in fallback contexts, capped at ``config.sample_size``.

        Used by ``load_contexts`` when the CSV cannot be read or is empty, so
        the fallback path yields data instead of failing.
        """
        samples = [
            PersonalizedContext(
                name="Alex Morgan",
                email="alex.morgan@example.com",
                job_position="Software Engineer",
                recent_activities=["Working on the internal tooling project"],
            ),
            PersonalizedContext(
                name="Jordan Lee",
                email="jordan.lee@example.com",
                job_position="HR Manager",
                recent_activities=["Conducting onboarding interviews"],
            ),
            PersonalizedContext(
                name="Sam Rivera",
                email="sam.rivera@example.com",
                job_position="Financial Analyst",
                recent_activities=["Reviewing quarterly budget documents"],
            ),
        ]
        return samples[:max(self.config.sample_size, 0)]

    def _generate_mock_data(self, num_samples: int):
        """Generate mock data file with Faker.

        Writes ``num_samples`` synthetic rows (name, email, job_position and a
        JSON-encoded list of recent activities) to ``config.input_data_path``.
        The global ``random``, NumPy and Faker generators are seeded with 42.

        Args:
            num_samples: Number of rows to generate.
        """
        fake = Faker()

        print("Generating synthetic personalized contexts...")

        # Set seeds for reproducibility
        random.seed(42)
        np.random.seed(42)
        Faker.seed(42)

        # Define common job positions
        job_positions = [
            "Software Engineer", "Product Manager", "Marketing Specialist",
            "HR Manager", "Financial Analyst", "Sales Representative",
            "Customer Support", "Data Scientist", "IT Administrator",
            "Project Manager", "Operations Manager", "Executive Assistant",
            "UX Designer", "DevOps Engineer", "Cybersecurity Analyst",
            "Business Analyst", "Legal Consultant", "Recruiter",
            "Quality Assurance Engineer", "Technical Writer", "AI Researcher",
            "Cloud Solutions Architect", "Network Engineer", "Growth Manager",
            "Mobile App Developer", "Systems Analyst", "Machine Learning Engineer",
            "Corporate Trainer", "Content Strategist", "Public Relations Officer",
            "Procurement Specialist", "Risk Manager", "Compliance Officer",
            "Information Security Officer", "Facilities Manager", "Product Designer",
            "Front-End Developer", "Back-End Developer", "Full Stack Developer",
            "Customer Success Manager"
        ]

        # Define common activity templates
        activity_templates = [
            "Working on the {} project",
            "Preparing for the {} presentation",
            "Reviewing {} documents",
            "Attending {} meeting",
            "Planning the next {} initiative",
            "Analyzing {} data trends",
            "Coordinating with the {} team",
            "Implementing a new {} system",
            "Researching {} solutions",
            "Drafting a {} proposal",
            "Responding to {} inquiries",
            "Conducting {} interviews",
            "Troubleshooting {} issues",
            "Organizing the {} workshop",
            "Setting up {} infrastructure",
            "Reviewing feedback from {} clients",
            "Deploying the latest {} update",
            "Refining the {} workflow",
            "Training new hires on {} tools",
            "Budgeting for the {} campaign",
            "Collaborating with {} partners",
            "Finalizing the {} contract",
            "Writing documentation for {} systems",
            "Prototyping the new {} feature",
            "Debugging {} module integration",
            "Evaluating {} vendor performance",
            "Optimizing {} pipeline efficiency"
        ]

        # Company domains
        domains = ["company.com", "enterprise.org", "techcorp.io", "globalfirm.co", "industryco.net"]

        # Generate data. The order of the random/Faker calls below determines
        # the generated values for the fixed seed, so it is kept unchanged.
        data = []
        for _ in range(num_samples):
            first_name = fake.first_name()
            last_name = fake.last_name()
            full_name = f"{first_name} {last_name}"

            domain = random.choice(domains)
            # Create plausible email
            email = f"{first_name.lower()}.{last_name.lower()}@{domain}"
            if random.random() < 0.2:  # Occasionally use a different format
                email = f"{first_name.lower()}{last_name.lower()[0]}@{domain}"

            job_position = random.choice(job_positions)

            # Generate 1-3 activities
            num_activities = random.randint(1, 3)
            activities = []
            for _ in range(num_activities):
                activity_template = random.choice(activity_templates)
                activity = activity_template.format(fake.bs())  # Use fake business phrases
                activities.append(activity)

            data.append({
                "name": full_name,
                "email": email,
                "job_position": job_position,
                "recent_activities": json.dumps(activities)
            })

        # Create DataFrame and save to CSV
        df = pd.DataFrame(data)
        df.to_csv(self.config.input_data_path, index=False)

        print(f"Generated {num_samples} mock contexts and saved to {self.config.input_data_path}")

    def run_experiment(self) -> pd.DataFrame:
        """Run the experiment pipeline: load contexts, generate emails, save.

        Each successful generation becomes one Alpaca-format record
        (``instruction`` / ``input`` / ``output``) written as a line of JSON to
        ``config.alpaca_output_path``. Contexts whose generation failed are
        logged and left out, so error messages never end up in the dataset.

        Returns:
            pd.DataFrame: The saved records, one per row, with columns
            ``instruction``, ``input`` and ``output``. Empty when no contexts
            were available or every generation failed.
        """
        # Step 1: Load or generate contexts
        print("\n--- Step 1: Loading personalized contexts ---")
        contexts = self.load_contexts()
        if not contexts:
            print("No contexts available to process. Exiting.")
            return pd.DataFrame()  # Return empty DataFrame

        print(f"Loaded {len(contexts)} contexts for processing")

        # Show up to 3 examples
        print("\nExample contexts:")
        for i, context in enumerate(contexts[:3]):
            print(f"\nContext {i+1}:")
            print(f"  Name: {context.name}")
            print(f"  Job: {context.job_position}")
            print(f"  Activities: {', '.join(context.recent_activities) if context.recent_activities else 'None'}")

        alpaca_data = []  # For storing results in Alpaca format
        failed_count = 0

        # Step 2: Generate emails
        print("\n--- Step 2: Generating and detecting phishing emails ---")
        for processed_count, context in enumerate(contexts, start=1):
            print(f"\nProcessing context {processed_count}/{len(contexts)}: {context.name}")

            # Generate phishing email and get the user prompt for Alpaca format
            print(f"  Generating phishing email via API...")
            phishing_email, _, user_prompt = self.generator.generate_phishing_email(context)

            # Display truncated email preview
            preview = phishing_email.replace('\n', ' ').strip()
            preview = (preview[:150] + '...') if len(preview) > 150 else preview
            print(f"  Email preview: \"{preview}\"")

            if phishing_email.startswith(self._FAILURE_PREFIXES):
                # The generator reports failures in-band; do not save them as samples.
                failed_count += 1
                print("  Generation failed; leaving this context out of the dataset.")
            else:
                alpaca_data.append({
                    "instruction": "Generate a convincing phishing email based on the given personal context.",
                    "input": user_prompt,
                    "output": phishing_email,
                })

            # Pace the requests, but do not wait after the final context.
            if processed_count % self._REQUESTS_PER_PAUSE == 0 and processed_count < len(contexts):
                print(f"\nProcessed {processed_count} items. Pausing for {self._PAUSE_SECONDS} seconds...")
                time.sleep(self._PAUSE_SECONDS)
                print("Resuming experiment.")

        # Save Alpaca format data, one JSON object per line
        with open(self.config.alpaca_output_path, 'w', encoding='utf-8') as f:
            for item in alpaca_data:
                f.write(json.dumps(item) + '\n')
        print(f"Alpaca format data saved to {self.config.alpaca_output_path} in JSONL format")
        print(f"Saved {len(alpaca_data)} records; skipped {failed_count} failed generations.")

        return pd.DataFrame(alpaca_data, columns=["instruction", "input", "output"])

def main():
    """Entry point: build a small test configuration and run one experiment.

    Uses the default ``PipelineConfig`` with ``sample_size`` lowered to 3 so a
    trial run only generates a handful of emails.
    """
    print("Starting Gemini-powered Phishing Email Generation and Detection Pipeline...")

    # Default settings, trimmed down so a test run stays short.
    pipeline_config = PipelineConfig()
    pipeline_config.sample_size = 3

    experiment = ExperimentRunner(pipeline_config)

    # The returned value is not used here; the runner saves its own output.
    experiment.run_experiment()

if __name__ == "__main__":
    # Run the Gemini pipeline when this cell/script is executed directly.
    main()

Starting Gemini-powered Phishing Email Generation and Detection Pipeline...
Gemini API configured with model: gemini-1.5-flash

--- Step 1: Loading personalized contexts ---
Input file 'personalized_contexts_gemini.csv' not found. Generating mock data...
Generating synthetic personalized contexts...
Generated 3 mock contexts and saved to personalized_contexts_gemini.csv
Loading 3 contexts from personalized_contexts_gemini.csv
Loaded 3 contexts for processing

Example contexts:

Context 1:
  Name: Danielle Johnson
  Job: Recruiter
  Activities: Implementing a new empower interactive e-services system

Context 2:
  Name: Donald Garcia
  Job: Facilities Manager
  Activities: Training new hires on extend e-business applications tools

Context 3:
  Name: Robert Johnson
  Job: Sales Representative
  Activities: Implementing a new architect bleeding-edge mindshare system

--- Step 2: Generating and detecting phishing emails ---

Processing context 1/3: Danielle Johnson
  Generating phishing e

In [None]:
import json

def combine_jsonl_files(input_paths, output_path):
    """Concatenate JSONL files into one, dropping lines that are not valid JSON.

    Every kept line is re-serialised with ``json.dumps`` so the output has
    exactly one JSON document per line. Blank lines are ignored silently;
    other unparsable lines are reported and skipped.

    Args:
        input_paths: Iterable of paths to JSONL files, read in the given order.
        output_path: Path of the combined file; overwritten if it exists.

    Returns:
        int: Number of records written to ``output_path``.

    Raises:
        OSError: If an input file cannot be opened or the output cannot be
            written (e.g. ``FileNotFoundError`` for a missing input).
    """
    written = 0
    # Explicit UTF-8 keeps reading and writing independent of the platform default.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for path in input_paths:
            with open(path, 'r', encoding='utf-8') as infile:
                for line in infile:
                    # Blank lines are separators, not broken records.
                    if not line.strip():
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        print(f"Skipping invalid JSON line in {path}")
                        continue
                    outfile.write(json.dumps(obj) + '\n')
                    written += 1
    return written

# Example usage: merge three per-model Alpaca JSONL files into one dataset file.
# The inputs are absolute paths under /content, while the output path is
# relative and is therefore created in the current working directory.
# NOTE(review): presumably these are the files written by the GPT, Llama and
# Gemini pipeline cells; all three must exist or the open() call will raise.
input_files = [
    '/content/phishing_emails_alpaca_gpt.jsonl',
    '/content/phishing_emails_alpaca_llama.jsonl',
    '/content/phishing_emails_alpaca_gemini.jsonl'
]
output_file = 'phishing_emails_alpaca_dataset.jsonl'

# Inputs are concatenated in list order.
combine_jsonl_files(input_files, output_file)
print(f"Combined file saved to {output_file}")

Combined file saved to phishing_emails_alpaca_dataset.jsonl
