In [17]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 2: Expanded dataset of emails, one (subject, body, label) row per email.
# Label 1 means phishing, 0 means non-phishing. Keeping each email's fields on
# one row makes it easy to see which label belongs to which message.
# NOTE(review): some labels look like they follow a repeating pattern rather
# than the email content (e.g. 'You have been selected for a cash prize' is
# labelled 0 while the similar 'Win a free vacation to Hawaii!' is labelled 1)
# -- confirm the labels against the intended ground truth.
_email_rows = [
    ('Congratulations! You won a $1,000 Gift Card',
     'Click here to claim your prize. Act fast!', 1),
    ('Your account has been compromised',
     'We noticed unusual login attempts. Please verify your identity.', 1),
    ('Meeting reminder: Project Updates',
     'This is a reminder for our project update meeting tomorrow.', 0),
    ('Reset your password immediately',
     'Someone tried to log in to your account. Reset your password now.', 1),
    ('Invoice for your recent purchase',
     'Please find attached the invoice for your recent purchase.', 0),
    ('Claim your prize now before it expires',
     'You are the lucky winner of a $1,000 Gift Card. Click the link to claim.', 1),
    ('Your subscription is about to expire',
     'Your subscription is about to expire. Renew now to avoid interruptions.', 0),
    ('Urgent: Verify your account details',
     'We need to verify your account details. Please update them here.', 1),
    ('Win a free vacation to Hawaii!',
     'You have been selected to win a free vacation! Just confirm your details.', 1),
    ('Important: Update your payment information',
     'Your payment information is out of date. Update it to avoid service interruptions.', 1),
    ('Reminder: Your package is arriving soon',
     'Your package is on the way! Track your shipment with the link provided.', 0),
    ('Verify your account for added security',
     'For added security, please verify your account. Click here to proceed.', 1),
    ('Exclusive offer: Get 50% off today!',
     'Limited time only! Get 50% off all products. Shop now!', 0),
    ('Your loan pre-approval is ready',
     'Your loan pre-approval is ready. Click here to finalize the process.', 1),
    ('Alert: Suspicious activity on your card',
     'We have detected suspicious activity on your card. Review your transactions here.', 0),
    ('Special promotion: Get a new credit card',
     'Apply today for a new credit card with low interest rates.', 1),
    ('Activate your new bank account now',
     'Activate your new bank account by clicking the link and verifying your details.', 1),
    ('Work from home: Earn $5,000 monthly',
     'Work from home and earn up to $5,000 monthly. Sign up now!', 1),
    ('Download the latest software update for free',
     'Update your software now to protect against the latest security threats. Free download!', 0),
    ('Immediate action required: Account suspended',
     'Your account has been suspended due to unusual activity. Verify your identity to restore access.', 1),
    ('Invitation: Join us for a networking event',
     'Join us for a networking event next week. RSVP today!', 0),
    ('Important security update for your account',
     'An important security update is available for your account. Click to update now.', 1),
    ('You have been selected for a cash prize',
     'You’ve been selected to win a cash prize. Confirm your information to claim it.', 0),
    ('Survey: Win an Amazon gift card for participating',
     'Complete our survey and win an Amazon gift card. Participation is free!', 1),
    ('Update your contact information now to avoid disruptions',
     'Update your contact details now to avoid losing access to your account.', 0),
    ('Appointment reminder',
     'This is a reminder for your upcoming appointment tomorrow.', 0),
    ('Your meeting is scheduled for tomorrow',
     'Your meeting is scheduled for tomorrow at 2 PM. Please confirm attendance.', 0),
    ('Reminder for your doctor appointment',
     'This is a reminder for your appointment with Dr. Smith tomorrow at 3 PM.', 0),
    ('Reminder: Your annual subscription renewal',
     'Your annual subscription will renew tomorrow. No action needed if you wish to continue.', 0),
    ('Meeting reminder: Monthly team sync',
     'This is a reminder for the monthly team sync tomorrow at 10 AM.', 0),
    ('Notification: Your online order has shipped',
     'Your online order has shipped. Track your package here.', 0),
]

# Column-oriented view of the rows above: three parallel lists of equal length.
data = {
    'subject': [row[0] for row in _email_rows],
    'body': [row[1] for row in _email_rows],
    'label': [row[2] for row in _email_rows],  # 1: Phishing, 0: Non-Phishing
}

# Step 3: Preprocessing - load the raw lists into a DataFrame and add a single
# 'text' column holding "<subject> <body>", which is what the model is fed.
df = pd.DataFrame(data).assign(
    text=lambda frame: frame['subject'] + ' ' + frame['body']
)

# Step 4: Train-Test Split
# Features are the combined text; targets are the 0/1 labels. 30% of the rows
# are held out, stratified on the label so both splits keep the class ratio.
X, y = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Step 5: Vectorize the text data using TF-IDF (English stop words removed,
# vocabulary capped at 3000 terms).
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
# The vocabulary and IDF weights are learned from the training split only ...
X_train_tfidf = tfidf.fit_transform(X_train)
# ... and the held-out split is merely projected onto that fitted vocabulary.
X_test_tfidf = tfidf.transform(X_test)

# Step 6: Train a Logistic Regression model on the TF-IDF training matrix.
# class_weight='balanced' is requested so the two classes are weighted by
# their frequency; fit() returns the estimator, so build and train in one go.
model = LogisticRegression(
    class_weight='balanced',
    random_state=42,
).fit(X_train_tfidf, y_train)

# Step 7: Make predictions on the held-out split
y_pred = model.predict(X_test_tfidf)

# Step 8: Evaluate the model
# All ratio metrics use zero_division=0. Previously only precision set it
# (to 1), so a model that predicted no phishing emails at all would have been
# reported with a perfect precision of 1.00, while recall and F1 fell back to
# 0 with a warning. Using 0 everywhere keeps the three metrics consistent and
# never rewards a degenerate prediction.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Step 9: User input for email subject and body
new_email_subject = input("Enter the email subject: ")
new_email_body = input("Enter the email body: ")
# Same "<subject> <body>" layout the training text was built with.
new_email = [new_email_subject + ' ' + new_email_body]

# Step 10: Vectorize and predict the new email
new_email_tfidf = tfidf.transform(new_email)
prediction = model.predict(new_email_tfidf)

if new_email_tfidf.nnz == 0:
    # No stored non-zero entries: the input was empty or shares no term with
    # the fitted vocabulary. The model then has no evidence from the text, so
    # say so instead of presenting the output as a verdict.
    print("Unable to classify: the email contains no words the model was trained on.")
elif prediction[0] == 1:
    print("This email is likely a phishing attempt.")
else:
    print("This email is likely not a phishing attempt.")


Accuracy: 0.70
Precision: 0.75
Recall: 0.60
F1 Score: 0.67
Enter the email subject: appointment reminder from Dr. Wilson
Enter the email body: You have an appointment tomorrow with  Dr. Wilson. If you need to reschedule, please call our office.
This email is likely not a phishing attempt.
