# In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report
from sklearn.utils import shuffle
import joblib
from sklearn.feature_extraction import text


# scikit-learn's built-in English stop-word collection; preprocess_email
# drops every token that appears in it.
stop_words = text.ENGLISH_STOP_WORDS


def preprocess_email(text):
    """Normalise a raw email string before it is vectorised.

    The message is lower-cased, every character that is not an ASCII letter
    or whitespace is replaced by a space, runs of whitespace are collapsed
    to a single space, and tokens found in the module-level ``stop_words``
    collection are discarded.

    Args:
        text: Raw message text (must provide ``str.lower``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    lowered = text.lower()
    # Digits, punctuation and symbols become spaces so the words on either
    # side of them are not glued together.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()
    kept_tokens = filter(lambda token: token not in stop_words, collapsed.split())
    return ' '.join(kept_tokens)


# Toy training corpus as (message, label) pairs: 0 = ham, 1 = unsafe.
#
# Each label is written next to its message so the two cannot drift apart.
# The previous layout kept two parallel lists, and from "Your OTP is 123456"
# onward the labels were shifted by one position (for example
# "Bank account suspended, verify now" was marked ham and
# "Reminder: Team meeting at 3pm" was marked unsafe). The pairs below
# restore the alignment.
# NOTE(review): "Your OTP is 123456" is assumed to be ham (0); confirm.
_labelled_emails = [
    ("Hello, meeting at 10am", 0),
    ("You have won $1000, click here!", 1),
    ("Verify your account now", 1),
    ("Your Amazon order has shipped", 0),
    ("Update your bank details immediately", 1),
    ("Lunch at 1pm?", 0),
    ("Congratulations, you are selected for a prize", 1),
    ("Please see attached invoice", 0),
    ("Reset your password now", 1),
    ("Let's catch up tomorrow", 0),
    ("Urgent: Your account is compromised", 1),
    ("Project deadline extended", 0),
    ("Claim your free vacation now", 1),
    ("Team meeting rescheduled", 0),
    ("Suspicious login detected in your account", 1),
    ("Introduction to Internet of Things - Week 07 Feedback Form", 0),
    ("Introduction to Internet of Things - Week 07 content is live now!!", 0),
    ("Introduction to Internet of Things", 0),
    ("Welcome to the IoT course", 0),
    ("Your OTP is 123456", 0),
    ("Bank account suspended, verify now", 1),
    ("Invoice for your recent purchase", 0),
    ("Congratulations! You have been selected", 1),
    ("Let's meet for coffee tomorrow", 0),
    ("Your password has been reset", 0),
    ("Urgent: Update your payment information", 1),
    ("Reminder: Team meeting at 3pm", 0),
    ("Claim your free gift card now", 1),
    ("Security alert: Suspicious activity detected", 1),
    ("Course update: New content available", 0),
    ("Feedback requested for IoT module", 0),
]

# Same two columns, in the same order, as before: 'text' then 'label'.
df_emails = pd.DataFrame(_labelled_emails, columns=['text', 'label'])



# Randomise the row order with a fixed seed, then renumber the index from 0.
df_emails = shuffle(df_emails, random_state=42)
df_emails = df_emails.reset_index(drop=True)

# Cleaned copy of every message; the raw 'text' column is kept as-is.
df_emails['text_clean'] = df_emails['text'].map(preprocess_email)

# 80/20 train/validation split, stratified on the label so both parts keep
# the same class proportions.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df_emails['text_clean'],
    df_emails['label'],
    test_size=0.2,
    random_state=42,
    stratify=df_emails['label'],
)

# TF-IDF over unigrams and bigrams. The vocabulary is learned from the
# training texts only; validation texts are merely projected onto it.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_val = vectorizer.transform(X_val_text)

# 100-tree random forest with class weights balanced against label frequency.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)


# Second column of predict_proba: with labels 0/1 this is the probability
# of class 1 for each validation message.
y_val_proba = model.predict_proba(X_val)[:, 1]


# Pick the decision threshold that maximises recall on the validation set.
#
# Thresholds are scanned from 0 upward. At 0 every message is predicted as
# class 1, so recall is already at its maximum there; with a strict ">" no
# later threshold could ever replace it and the result was always 0.00
# (flag everything). Accepting ties with ">=" keeps recall as the objective
# but moves to the HIGHEST threshold that still reaches the best recall.
# The "recall > 0" guard keeps the 0.5 default when no threshold finds any
# positive at all.
# NOTE(review): the threshold is tuned and then evaluated on the same
# validation split, so the reported metrics are optimistic.
best_thresh = 0.5
best_recall = 0.0
for thresh in np.linspace(0, 1, 101):
    y_pred_thresh = (y_val_proba >= thresh).astype(int)
    recall = recall_score(y_val, y_pred_thresh)
    if recall > 0 and recall >= best_recall:
        best_recall = recall
        best_thresh = thresh

print(f"Selected threshold: {best_thresh:.2f} with recall {best_recall:.2f}")


# Final validation predictions at the selected threshold.
y_pred = (y_val_proba >= best_thresh).astype(int)


# Validation metrics for the thresholded predictions in y_pred. Each metric
# is computed and printed in turn.
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", val_accuracy)

val_report = classification_report(y_val, y_pred, target_names=["Ham","Unsafe"])
print("Classification report:\n", val_report)

val_confusion = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:\n", val_confusion)


# Persist the fitted classifier and the fitted TF-IDF vectorizer.
# NOTE(review): best_thresh is not written anywhere, so code that loads
# these files has no record of the threshold used for the metrics above.
joblib.dump(model, 'binary_email_classifier.pkl')
joblib.dump(vectorizer, 'binary_email_tfidf_vectorizer.pkl')





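# Recorded cell output (the code above appears to parse cleanly, so this
# error most likely predates the current cell contents):
# SyntaxError: invalid syntax. Perhaps you forgot a comma? (3949779752.py, line 25)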