In [6]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [10]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv('emails.csv')

# Display dataset info
print(df.head(), '\n')
print("Shape of the dataframe:", df.shape, '\n')
print("Columns of the dataframe:", df.columns, '\n')
print("Summary of the data:")
df.info()

# Ensure text is string and clean
df['text'] = df['text'].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
df['text'] = df['text'].str.lower()
df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Remove stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Tokenization
df['tokenized_text'] = df['text'].apply(word_tokenize)

# POS tagging
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
df['pos_tagged'] = df['tokenized_text'].apply(nltk.pos_tag)

# Function to map POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized_text'] = df['pos_tagged'].apply(
    lambda tagged_words: [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged_words]
)

# Convert list of words into a single string per document
df['processed_text'] = df['lemmatized_text'].apply(lambda x: ' '.join(x))

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
X = vectorizer.fit_transform(df['processed_text'])
print("Selected features:", vectorizer.get_feature_names_out())

# Extract labels
y = df['spam']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}

# Hyperparameter tuning for Logistic Regression and Random Forest
param_grid = {
    "Logistic Regression": {'C': [0.1, 1, 10]},
    "Random Forest": {'n_estimators': [50, 100, 200]}
}

best_models = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    if model_name in param_grid:
        grid_search = GridSearchCV(model, param_grid[model_name], cv=3, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        best_models[model_name] = grid_search.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_models[model_name] = model

# Evaluate models
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")


                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1 

Shape of the dataframe: (5728, 2) 

Columns of the dataframe: Index(['text', 'spam'], dtype='object') 

Summary of the data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Selected features: ['2000' 'cc' 'com' 'ect' 'enron' 'hou' 'kaminski' 'pm' 'subject' 'vince']
Training Naive Bayes...
Training Logistic Regression...
Training SVM...
Training Random Forest...
Naive Bayes Accuracy: 0.8944
Logistic Regression Accuracy: 0.8953
SVM Accuracy: 0.8944
Random Forest Accuracy: 0.8927
