In [1]:
import kagglehub

# Kaggle handle of the Enron fraud e-mail dataset; the latest version is downloaded.
# NOTE(review): the output recorded for this cell shows the call failing with an
# "Unauthenticated" backend error — Kaggle credentials presumably need to be
# configured before running; confirm.
dataset_handle = "advaithsrao/enron-fraud-email-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

BackendError: POST failed with: {"errors":["Unauthenticated"],"error":{"code":16,"details":[]},"wasSuccessful":false}

In [None]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Read the labelled CSV from the directory held in `path`
# (point `path` at wherever the dataset files live).
csv_path = os.path.join(path, 'enron_data_fraud_labeled.csv')
df = pd.read_csv(csv_path)
# Preview of the first rows; this is not the last expression of the cell.
df.head()

# Step 1: Clean and preprocess the data
# Fetch the NLTK stopword corpus, then keep the English list as a set
# so membership checks during cleaning are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def preprocess_email(email, stopword_set=None):
    """Reduce an e-mail body to its alphabetic, non-stopword tokens.

    Letter runs are extracted from the text, so punctuation and digits are
    stripped from words instead of causing the whole word to be discarded
    (e.g. ``"fraud."`` yields ``"fraud"``). Original casing is kept; stopword
    matching is case-insensitive.

    Args:
        email: The raw body. ``None``/NaN is treated as an empty document;
            any other non-string value is converted with ``str()``.
        stopword_set: Optional collection of lower-case words to drop.
            Defaults to the module-level ``stop_words``.

    Returns:
        The remaining tokens joined by single spaces ("" when nothing is left).
    """
    import re  # local import keeps this block self-contained

    # Check for missing values before processing
    if pd.isnull(email):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    # [^\W\d_]+ matches runs of word characters that are neither digits nor
    # underscores, i.e. letter runs (Unicode-aware for str patterns).
    tokens = re.findall(r"[^\W\d_]+", str(email))
    return ' '.join(token for token in tokens if token.lower() not in stopword_set)

# Run the cleaner over every raw body (change 'Body' if the CSV names the column differently)
df['cleaned_body'] = df['Body'].apply(preprocess_email)

# Turn the values of the 'Label' column into integer codes.
# The encoder is kept so the original class names can be recovered later.
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['label_encoded'] = label_encoder.transform(df['Label'])

# Step 2: Tokenization and padding
# Define features and target
X = df['cleaned_body'].fillna("")  # Replace NaN with empty strings
y = df['label_encoded']

# Split the dataset: 20% held out for testing, fixed seed for repeatability.
# stratify=y keeps the class proportions the same in both splits, so a rare
# class is not under-represented (or absent) in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build the vocabulary from the training texts only, then convert both
# splits to integer sequences with that same tokenizer.
tokenizer = Tokenizer(num_words=10000)  # vocabulary size cap; tune as needed
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Give every sequence the same length; padding is added at the end ('post').
max_length = 100  # tune to the typical e-mail length in the dataset
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)

# Step 3: Build the model
# Layers are stacked one at a time, in the order the data flows through them.
model = Sequential()
# Token ids (vocabulary of 10000) -> 128-dimensional embeddings
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_length))
# Bidirectional LSTM that emits only its final output (return_sequences=False)
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit for binary classification
model.add(Dense(1, activation='sigmoid'))

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 4: Train the model
print("Starting training...")
# Training settings gathered in one place so they are easy to tune:
#   validation_split - fraction of the training data held out for validation
#   epochs           - adjust based on dataset size
#   batch_size       - adjust based on your hardware
fit_options = dict(validation_split=0.2, epochs=5, batch_size=32, verbose=1)
history = model.fit(X_train_padded, y_train, **fit_options)




In [None]:
# Step 5: Evaluate the model
print("Evaluating the model...")
# evaluate() yields the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test_padded, y_test, verbose=1)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# The classification report is generated in the next cell,
# where the target names are first converted to strings.


In [None]:
print("Original class names:", label_encoder.classes_)
# classification_report needs string target names, so convert each class to str.
class_names = list(map(str, label_encoder.classes_))

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions as a flat array.
predicted_probabilities = model.predict(X_test_padded)
y_pred = (predicted_probabilities > 0.5).astype("int32").flatten()

# Per-class summary of predictions against the held-out test labels.
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

In [None]:
# Optional: write the trained model to disk.
# NOTE(review): only the model is saved here; the fitted tokenizer and label
# encoder are not persisted by this cell — confirm whether they are needed for reuse.
model_output_path = 'email_classification_model.h5'
model.save(model_output_path)