# 1. Import required libraries



In [9]:
import pandas as pd                                      # for data loading & manipulation
from collections import Counter
import re
from sklearn.model_selection import train_test_split     # to split data into train/test
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  
                                                       # to convert text → numeric features
from sklearn.naive_bayes import MultinomialNB           # Naïve Bayes classifier
from sklearn.linear_model import LogisticRegression     # Logistic Regression classifier
from sklearn.svm import LinearSVC                        # Support Vector Machine
from sklearn.ensemble import RandomForestClassifier      # Random Forest classifier
from sklearn.metrics import (accuracy_score,             # for measuring model performance
                             classification_report,
                             confusion_matrix)


# 2. Load the dataset


In [10]:
df = pd.read_csv('combined_data.csv')          # assumes your CSV sits in data/raw/
                                                       # and has columns "label" and "text"


# 3. Quick peek at size & first rows


In [11]:
print("Shape (rows, cols):", df.shape)
print("\nFirst 5 rows:")
print(df.head())

Shape (rows, cols): (83448, 2)

First 5 rows:
   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...


# 4. Check for missing or null values

In [12]:
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
label    0
text     0
dtype: int64


# 5. Class balance: how many spam vs. ham?

In [13]:
print("\nLabel distribution:")
print(df['label'].value_counts())


Label distribution:
label
1    43910
0    39538
Name: count, dtype: int64


# 6. Add basic text‐length features


In [15]:
df['char_count'] = df['text'].str.len()               # number of characters
df['word_count'] = df['text'].str.split().str.len()   # number of words
print("\nText length summary (chars and words):")
print(df[['char_count','word_count']].describe())


Text length summary (chars and words):
          char_count     word_count
count   83448.000000   83448.000000
mean     1662.952725     282.811775
std      4178.578068     724.818152
min         1.000000       1.000000
25%       449.000000      80.000000
50%       879.000000     152.000000
75%      1861.000000     312.000000
max    598705.000000  101984.000000


# 7. Top 10 most frequent words (simple stop‐word removal)


In [16]:
texts = (
    df['text']
    .str.lower()                                                   # lowercase
    .str.replace(r'[^a-z\s]', ' ', regex=True)                    # remove punctuation/numbers
    .str.split()                                                  # split into words
    .explode()                                                    # flatten to one word per row
)

stopwords = {
    'the','to','and','of','a','in','for','is','on','you','your',
    'this','i','it','we','that','with','our','be','are','from'
}

filtered = texts[~texts.isin(stopwords) & (texts.str.len() > 2)]
top10 = Counter(filtered).most_common(10)

print("\nTop 10 words (after basic cleaning):")
for word, cnt in top10:
    print(f"  {word!r}: {cnt}")



Top 10 words (after basic cleaning):
  'escapenumber': 1148051
  'escapelong': 227296
  'have': 85762
  'http': 82235
  'not': 76839
  'will': 70320
  'com': 69818
  'all': 57574
  'enron': 52861
  'can': 50285


# 8. Show 3 example messages of each class


In [19]:
df['label_str'] = df['label'].map({1: 'spam', 0: 'ham'})
print(df['label_str'].unique())  # ['spam', 'ham']
# Then your original code will work:
df[df['label_str']=='spam']['text'].sample(3, random_state=42)


['spam' 'ham']


73411    wireless optical\n3 - button scroll mouse\nlim...
27342    from the desk of philip moore\ndept credit con...
27216    submitting your website in search engines may ...
Name: text, dtype: object

# 9. Prepare inputs (X) and target (y)


In [20]:
X = df['text']                                         # X is the raw email text
y = df['label']                                        # y is either 'spam' or 'ham'


In [21]:
df.head()

Unnamed: 0,label,text,char_count,word_count,label_str
0,1,ounce feather bowl hummingbird opec moment ala...,148,20,spam
1,1,wulvob get your medircations online qnb ikud v...,808,103,spam
2,0,computer connection from cnn com wednesday es...,2235,337,ham
3,1,university degree obtain a prosperous future m...,592,76,spam
4,0,thanks for all your answers guys i know i shou...,1362,222,ham


# 10. Split into train/test sets


In [22]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,        # 20% of data for testing
    stratify=y,            # preserve the spam/ham ratio in both splits
    random_state=42        # for reproducibility
)

# 11. Choose your vectorizer (uncomment one):


In [23]:
vectorizer = CountVectorizer(lowercase=True,
                             stop_words='english',
                             token_pattern=r'\b\w+\b')
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english',
                             token_pattern=r'\b\w+\b')

In [24]:
df.head()

Unnamed: 0,label,text,char_count,word_count,label_str
0,1,ounce feather bowl hummingbird opec moment ala...,148,20,spam
1,1,wulvob get your medircations online qnb ikud v...,808,103,spam
2,0,computer connection from cnn com wednesday es...,2235,337,ham
3,1,university degree obtain a prosperous future m...,592,76,spam
4,0,thanks for all your answers guys i know i shou...,1362,222,ham


# 12. Fit the vectorizer on training text, then transform both train & test


In [25]:
X_train = vectorizer.fit_transform(X_train_raw)        # learns vocabulary & transforms train
X_test  = vectorizer.transform(X_test_raw)             # transforms test using same vocab


# 13. Define your classifiers


In [27]:
models = {
    'Naive Bayes':        MultinomialNB(alpha=1.0),     # Laplace smoothing α=1
    'Logistic Regression': LogisticRegression(max_iter=1000, n_jobs=-1),
    'Linear SVM':         LinearSVC(max_iter=10000),
    'Random Forest':      RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    }

# 14. Train each model & evaluate


In [29]:
for name, clf in models.items():
    print(f"\n=== {name} ===")
    clf.fit(X_train, y_train)                         # train on the TF-IDF/Count features
    y_pred = clf.predict(X_test)                      # predict labels on test set

    # 8a. Accuracy
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")

    # 8b. Precision / Recall / F1
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    # 8c. Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))



=== Naive Bayes ===
Accuracy: 0.9766

Classification Report:
              precision    recall  f1-score   support

           0     0.9642    0.9872    0.9756      7908
           1     0.9882    0.9670    0.9775      8782

    accuracy                         0.9766     16690
   macro avg     0.9762    0.9771    0.9765     16690
weighted avg     0.9768    0.9766    0.9766     16690

Confusion Matrix:
[[7807  101]
 [ 290 8492]]

=== Logistic Regression ===
Accuracy: 0.9859

Classification Report:
              precision    recall  f1-score   support

           0     0.9923    0.9779    0.9850      7908
           1     0.9803    0.9932    0.9867      8782

    accuracy                         0.9859     16690
   macro avg     0.9863    0.9855    0.9859     16690
weighted avg     0.9860    0.9859    0.9859     16690

Confusion Matrix:
[[7733  175]
 [  60 8722]]

=== Linear SVM ===
Accuracy: 0.9916

Classification Report:
              precision    recall  f1-score   support

        