## Phishing URL Dataset collected from https://github.com/JPCERTCC/phishurl-list
Generating an appropriately balanced dataset for my project using actual URLs from the above site.

The generated data contains the following malicious components in the body part:
1. Fake Login Forms
2. Redirects to Malicious Sites
3. Hidden Scripts
4. Fake CAPTCHA
5. Keylogger Script
6. Clickjacking
7. Fake Security Alerts
8. Obfuscated or Encoded Scripts
9. Embedded Malware
10. Social Engineering with Branding
11. Actual Phishing URLs from above link


Equal Number of Phishing and Non-Phishing Rows:

    The number of non_phishing_urls is set to match the number of phishing_urls, ensuring a balanced dataset.

Random Non-Phishing URLs:

    Non-phishing URLs are generated dynamically to match the phishing dataset size.

Shuffling:

    Combined data is shuffled using sample(frac=1) to mix phishing and non-phishing samples.

Output:

    The final balanced dataset is saved to the specified output file.
    

In [3]:
import pandas as pd
import random
import string
from faker import Faker
from datetime import datetime

def generate_random_email():
    """Return a random address: 8 lowercase letters/digits at a common mail provider."""
    alphabet = string.ascii_lowercase + string.digits
    # Draw the local part first, then the provider, so the RNG is consumed
    # in the same order on every call.
    local_part = ''.join(random.choices(alphabet, k=8))
    provider = random.choice(["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"])
    return local_part + "@" + provider

def generate_random_date(start_year=2019, end_year=2024):
    """Generate a random date between the start and end year.

    Args:
        start_year: First calendar year of the range (inclusive).
        end_year: Last calendar year of the range (inclusive).

    Returns:
        A ``YYYY-MM-DD`` string for a day drawn uniformly from January 1 of
        ``start_year`` through December 31 of ``end_year``.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year``.
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than end_year ({end_year})"
        )
    # Pick a day by its ordinal number with the stdlib RNG. This avoids
    # constructing a new Faker() instance on every call, which is costly when
    # the function is invoked once per generated row.
    first_day = datetime(start_year, 1, 1).toordinal()
    last_day = datetime(end_year, 12, 31).toordinal()
    chosen_day = random.randint(first_day, last_day)
    return datetime.fromordinal(chosen_day).strftime('%Y-%m-%d')

def generate_random_email_body(url, phishing=True):
    """Return one randomly chosen email body that embeds ``url``.

    A truthy ``phishing`` selects from the phishing-style templates;
    otherwise a legitimate-style template is used.
    """
    phishing_templates = (
        "Dear User,\n\nWe have noticed suspicious activity on your account. Please verify your identity by clicking the following link: {url}\n\nThank you.",
        "Hello,\n\nYour account has been temporarily suspended due to security reasons. To reactivate it, click on this link: {url}\n\nBest regards.",
        "Greetings,\n\nCongratulations! You are eligible for an exclusive reward. Claim your reward by visiting this link: {url}\n\nKind regards.",
        "Attention,\n\nYour account password has been compromised. Reset your password immediately using the following link: {url}\n\nStay safe.",
    )
    legitimate_templates = (
        "Dear Valued Customer,\n\nWe hope you're enjoying our services. Visit us anytime at {url} for updates and offers.\n\nWarm regards.",
        "Hello,\n\nYour recent transaction was successful. Access your invoice at {url}\n\nBest wishes.",
        "Greetings,\n\nThank you for subscribing to our newsletter. Visit our site at {url} for the latest news.\n\nBest regards.",
        "Hi,\n\nWe noticed you changed your account settings. If this was you, no action is needed. Otherwise, visit {url} for help.\n\nThank you.",
    )
    pool = phishing_templates if phishing else legitimate_templates
    # Only the selected template is filled in; the URL itself is inserted
    # verbatim, so braces inside it are not interpreted.
    return random.choice(pool).format(url=url)

def _build_email_rows(urls, subject, phishing):
    """Build the column dict for one class of emails, one row per URL."""
    return {
        "sender": [generate_random_email() for _ in urls],
        "receiver": [generate_random_email() for _ in urls],
        "date": [generate_random_date() for _ in urls],
        "subject": [subject for _ in urls],
        "body": [generate_random_email_body(url, phishing=phishing) for url in urls],
        "urls": urls,
        "label": [1 if phishing else 0 for _ in urls],
    }


def create_balanced_dataset(input_file, output_file):
    """
    Create a balanced dataset with sender, receiver, date, subject, body, urls, and label columns.

    Phishing URLs are read from the ``URL`` column of ``input_file``. One
    synthetic non-phishing row is generated per phishing row, the two sets
    are concatenated and shuffled, and the result is written to
    ``output_file`` as CSV without the index column.

    Args:
        input_file: Path of the CSV file that contains a ``URL`` column.
        output_file: Path the generated CSV dataset is written to.

    Returns:
        None. Success and failure are both reported with ``print``; any
        exception raised while building or saving the dataset is caught and
        printed rather than propagated.
    """
    try:
        # Load the combined .csv file
        df = pd.read_csv(input_file)

        # Extract phishing URLs from the dataset. Rows with a missing URL are
        # skipped: a NaN would otherwise be rendered as "nan" inside the email
        # body and still be labelled as a phishing sample.
        phishing_urls = df['URL'].dropna().tolist()

        # Generate an equal number of non-phishing URLs.
        # NOTE: only 999 distinct URLs can be produced here, so duplicates are
        # expected whenever there are more phishing URLs than that.
        non_phishing_urls = [f"https://example{random.randint(1, 999)}.com" for _ in range(len(phishing_urls))]

        # Generate phishing and non-phishing samples
        phishing_df = pd.DataFrame(
            _build_email_rows(phishing_urls, "Urgent Account Notice", phishing=True)
        )
        non_phishing_df = pd.DataFrame(
            _build_email_rows(non_phishing_urls, "Your Account Update", phishing=False)
        )

        # Both frames have the same number of rows, so the concatenation is
        # balanced; sample(frac=1) shuffles the rows so the classes are mixed.
        balanced_df = pd.concat([phishing_df, non_phishing_df], ignore_index=True)
        balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)

        # Save the balanced dataset to a .csv file
        balanced_df.to_csv(output_file, index=False)
        print(f"Balanced dataset created and saved to {output_file} with {len(balanced_df)} rows.")
    except Exception as e:
        print(f"Error creating balanced dataset: {e}")

# Example usage: build the email dataset from the CSV of phishing URLs
input_file = "data_output/combined.csv"  # CSV read by create_balanced_dataset (must have a 'URL' column)
output_file = "data_output/phishing_email_dataset.csv"  # Destination of the generated dataset

create_balanced_dataset(input_file=input_file, output_file=output_file)


Balanced dataset created and saved to data_output/phishing_email_dataset.csv with 440012 rows.


In [5]:
import pandas as pd 

import pandas as pd

# Load the generated dataset back from disk
df = pd.read_csv("data_output/phishing_email_dataset.csv")

# Display the first few rows of the DataFrame to confirm it loaded correctly
print(df.head())


# Show just the URL and label columns for the first 50 rows
top_50_rows = df.head(50)[['urls', 'label']]
print(top_50_rows)

                 sender              receiver        date  \
0  kt4makf7@hotmail.com    zz4pcd76@gmail.com  2019-02-19   
1  qpsu80x5@outlook.com  hivq6q6n@outlook.com  2021-04-23   
2    tsr596ly@gmail.com  hutnj7f9@hotmail.com  2019-02-01   
3    p6x3hskt@gmail.com    rn9xj56s@yahoo.com  2020-02-01   
4    kyni1nhy@gmail.com    afn53n7r@gmail.com  2022-01-21   

                 subject                                               body  \
0  Urgent Account Notice  Greetings,\n\nCongratulations! You are eligibl...   
1    Your Account Update  Dear Valued Customer,\n\nWe hope you're enjoyi...   
2  Urgent Account Notice  Attention,\n\nYour account password has been c...   
3    Your Account Update  Greetings,\n\nThank you for subscribing to our...   
4    Your Account Update  Hi,\n\nWe noticed you changed your account set...   

                                             urls  label  
0                  http://wyyrdcefgl.duckdns.org/      1  
1                          https://examp