**TF-IDF & Random Forest**

In [2]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
file_path = "/content/Augmented Cleaned Smart Email Dataset.csv"
df = pd.read_csv(file_path)

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
df['Cleaned_Content'] = df['Email Content'].apply(clean_text)

In [7]:
X = df['Cleaned_Content']
y = df['Category']

In [8]:
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [10]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.00
Classification Report:
                             precision    recall  f1-score   support

                         -       1.00      1.00      1.00      1053
                  Business       1.00      1.00      1.00         9
                   Finance       1.00      1.00      1.00         1
        Order Confirmation       1.00      1.00      1.00         2
Order Confirmation/Updates       1.00      0.67      0.80         6
                  Personal       1.00      1.00      1.00         3
               Promotional       1.00      1.00      1.00         7
             Subscriptions       1.00      1.00      1.00         1
    Transaction & Security       1.00      1.00      1.00         3

                  accuracy                           1.00      1085
                 macro avg       1.00      0.96      0.98      1085
              weighted avg       1.00      1.00      1.00      1085



In [13]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, RandomOverSampler


file_path = "/content/Augmented Cleaned Smart Email Dataset.csv"
df = pd.read_csv(file_path)


df = df[df['Category'] != '-']

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
    return text


df['Cleaned_Content'] = df['Email Content'].apply(clean_text)


X = df['Cleaned_Content']
y = df['Category']


vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(X)


if y.value_counts().min() < 2:
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
    print("Warning: A class has only one sample, stratification is disabled.")
else:
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)


class_counts = y_train.value_counts()
min_class_count = class_counts.min()


if min_class_count < 2:
    print("Using RandomOverSampler due to extremely small class sizes.")
    resampler = RandomOverSampler(random_state=42)
else:
    print("Using SMOTE for balancing.")
    resampler = SMOTE(random_state=42, k_neighbors=min(2, min_class_count - 1))


X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)


model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")
model.fit(X_train_resampled, y_train_resampled)


y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Using SMOTE for balancing.
Accuracy: 0.94
Classification Report:
                             precision    recall  f1-score   support

                  Business       0.80      1.00      0.89         8
                   Finance       1.00      1.00      1.00         2
        Order Confirmation       1.00      1.00      1.00         1
Order Confirmation/Updates       1.00      1.00      1.00         4
                  Personal       1.00      1.00      1.00         6
               Promotional       1.00      0.75      0.86         8
    Transaction & Security       1.00      1.00      1.00         4
                   Updates       1.00      1.00      1.00         1

                  accuracy                           0.94        34
                 macro avg       0.97      0.97      0.97        34
              weighted avg       0.95      0.94      0.94        34



In [14]:
import pandas as pd


file_path = "/content/Augmented Cleaned Smart Email Dataset.csv"
df = pd.read_csv(file_path)

df['Cleaned_Content'] = df['Email Content'].str.replace('-', '', regex=False)

df.to_csv("/content/Augmented Cleaned Smart Email Dataset.csv", index=False)

print(df[['Email Content', 'Cleaned_Content']].head())

                                       Email Content  \
0  [image: Google]\nArchive of Google data reques...   
1  [image: Google]\nArchive of Google data reques...   
2  ----------------------------------------------...   
3  ----------------------------------------------...   
4  CarGurus Check out these listings near you ---...   

                                     Cleaned_Content  
0  [image: Google]\nArchive of Google data reques...  
1  [image: Google]\nArchive of Google data reques...  
2  \nTemu\n\n\nTo properly view the full message ...  
3  \nTemu\n\n\nTo properly view the full message ...  
4  CarGurus Check out these listings near you  Ca...  


In [15]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE


file_path = "/content/Augmented Cleaned Smart Email Dataset.csv"
df = pd.read_csv(file_path)

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        text = text.replace("-", "")
    return text

df['Cleaned_Content'] = df['Email Content'].apply(clean_text)


def categorize_email(content):
    if any(word in content for word in ['discount', 'offer', 'sale', 'deal', 'promo', 'limited', 'coupon']):
        return 'Promotional'
    elif any(word in content for word in ['meeting', 'project', 'client', 'work', 'business', 'conference', 'office']):
        return 'Business'
    elif any(word in content for word in ['transaction', 'security', 'password', 'verification', 'login', 'fraud', 'alert']):
        return 'Transaction & Security'
    elif any(word in content for word in ['family', 'friend', 'personal', 'vacation', 'birthday', 'social']):
        return 'Personal'
    elif any(word in content for word in ['finance', 'bank', 'loan', 'credit', 'investment', 'money', 'billing']):
        return 'Finance'
    elif any(word in content for word in ['update', 'change', 'news', 'announcement', 'latest', 'info']):
        return 'Updates'
    elif any(word in content for word in ['subscription', 'renewal', 'membership', 'subscribe', 'unsubscribed']):
        return 'Subscription'
    else:
        return 'Updates'


df['Category'] = df['Cleaned_Content'].apply(categorize_email)

X = df['Cleaned_Content']
y = df['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


min_class_count = y_train.value_counts().min()
if min_class_count > 1:
    smote = SMOTE(random_state=42, k_neighbors=min(5, min_class_count - 1))
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
else:
    print("Class imbalance too severe. Skipping SMOTE.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train


model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")
model.fit(X_train_resampled, y_train_resampled)


y_pred = model.predict(X_test_tfidf)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.98
Classification Report:
                         precision    recall  f1-score   support

              Business       1.00      0.96      0.98       190
               Finance       1.00      0.92      0.96        26
              Personal       1.00      0.95      0.97        39
           Promotional       0.99      0.99      0.99       456
          Subscription       0.92      0.94      0.93        72
Transaction & Security       0.96      0.98      0.97       108
               Updates       0.95      1.00      0.97       194

              accuracy                           0.98      1085
             macro avg       0.97      0.96      0.97      1085
          weighted avg       0.98      0.98      0.98      1085



 **LSTM**

In [16]:
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

file_path = "/content/Augmented Cleaned Smart Email Dataset.csv"
df = pd.read_csv(file_path)


def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text


df['Cleaned_Content'] = df['Email Content'].fillna("").apply(clean_text)


label_encoder = LabelEncoder()
df['Category_Label'] = label_encoder.fit_transform(df['Category'])


X = df['Cleaned_Content'].tolist()
y = df['Category_Label'].values


max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post', truncating='post')


X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)


# Define the LSTM model
# Removed the duplicated model definition and corrected indentation
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


epochs = 5
batch_size = 32

history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1
)


loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


model_path = "/mnt/data/lstm_email_classifier.h5"
model.save(model_path)
print(f"LSTM model saved to {model_path}")

Epoch 1/5




[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 324ms/step - accuracy: 0.9281 - loss: 0.6927 - val_accuracy: 0.9705 - val_loss: 0.1820
Epoch 2/5
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 342ms/step - accuracy: 0.9703 - loss: 0.1834 - val_accuracy: 0.9705 - val_loss: 0.1823
Epoch 3/5
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 309ms/step - accuracy: 0.9645 - loss: 0.2103 - val_accuracy: 0.9705 - val_loss: 0.1745
Epoch 4/5
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 297ms/step - accuracy: 0.9732 - loss: 0.1578 - val_accuracy: 0.9705 - val_loss: 0.1744
Epoch 5/5
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 307ms/step - accuracy: 0.9709 - loss: 0.1662 - val_accuracy: 0.9705 - val_loss: 0.1626
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.9749 - loss: 0.1391




Test Accuracy: 0.97
LSTM model saved to /mnt/data/lstm_email_classifier.h5
