In [87]:
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')

# Data Preparation

In [46]:
df = pd.read_csv("../data/Spam Email raw text for NLP.csv")
df

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CATEGORY   5796 non-null   int64 
 1   MESSAGE    5796 non-null   object
 2   FILE_NAME  5796 non-null   object
dtypes: int64(1), object(2)
memory usage: 136.0+ KB


In [48]:
type_mapping = {
    'MESSAGE': 'string',
    'FILE_NAME': 'string'
}
df = df.astype(type_mapping)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CATEGORY   5796 non-null   int64 
 1   MESSAGE    5796 non-null   string
 2   FILE_NAME  5796 non-null   string
dtypes: int64(1), string(2)
memory usage: 136.0 KB


In [49]:
df['CATEGORY'].value_counts()

CATEGORY
0    3900
1    1896
Name: count, dtype: int64

In [50]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/fawad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/fawad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
tokenizer = nltk.RegexpTokenizer(r"\w+")

sentences = "HEY <LADIES>, <drop> it down. Just want to see you touch the ground. Don't be shy girl, go <BONANZA>. Shake your body like a belly dancer"

tokenized_sentences = tokenizer.tokenize(sentences)
sentences, tokenized_sentences

("HEY <LADIES>, <drop> it down. Just want to see you touch the ground. Don't be shy girl, go <BONANZA>. Shake your body like a belly dancer",
 ['HEY',
  'LADIES',
  'drop',
  'it',
  'down',
  'Just',
  'want',
  'to',
  'see',
  'you',
  'touch',
  'the',
  'ground',
  'Don',
  't',
  'be',
  'shy',
  'girl',
  'go',
  'BONANZA',
  'Shake',
  'your',
  'body',
  'like',
  'a',
  'belly',
  'dancer'])

In [52]:
sentences_lower_cased = [t.lower() for t in tokenized_sentences]
sentences_lower_cased

['hey',
 'ladies',
 'drop',
 'it',
 'down',
 'just',
 'want',
 'to',
 'see',
 'you',
 'touch',
 'the',
 'ground',
 'don',
 't',
 'be',
 'shy',
 'girl',
 'go',
 'bonanza',
 'shake',
 'your',
 'body',
 'like',
 'a',
 'belly',
 'dancer']

In [53]:
from nltk import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_sentences = [wordnet_lemmatizer.lemmatize(token) for token in sentences_lower_cased]
lemmatized_sentences

['hey',
 'lady',
 'drop',
 'it',
 'down',
 'just',
 'want',
 'to',
 'see',
 'you',
 'touch',
 'the',
 'ground',
 'don',
 't',
 'be',
 'shy',
 'girl',
 'go',
 'bonanza',
 'shake',
 'your',
 'body',
 'like',
 'a',
 'belly',
 'dancer']

In [54]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
useful_tokens = [stopword for stopword in lemmatized_sentences if stopword not in stopwords_list]
useful_tokens

['hey',
 'lady',
 'drop',
 'want',
 'see',
 'touch',
 'ground',
 'shy',
 'girl',
 'go',
 'bonanza',
 'shake',
 'body',
 'like',
 'belly',
 'dancer']

In [55]:
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

def message_tokenizer(message):
    reg_tokenizer = RegexpTokenizer(r'\w+')
    tokenized_message = reg_tokenizer.tokenize(message)
    lower_cased = [t.lower() for t in tokenized_message]
    stop_words = stopwords.words('english')

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [wordnet_lemmatizer.lemmatize(lower_case) for lower_case in lower_cased]

    required_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    return required_tokens

In [56]:
message_tokenizer(sentences)

['hey',
 'lady',
 'drop',
 'want',
 'see',
 'touch',
 'ground',
 'shy',
 'girl',
 'go',
 'bonanza',
 'shake',
 'body',
 'like',
 'belly',
 'dancer']

# Train/Test split

In [57]:
df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True)

split_index = int(len(df) * 0.8)
train_df, test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df, test_df

(      CATEGORY                                            MESSAGE  \
 0            1  
 
 <HTML><FONT  BACK="#ffffff" style="BACKGROUN...   
 1            1  <html><body bgColor="#CCCCCC" topmargin=1 onMo...   
 2            0  Quoting Paul Linehan (plinehan@yahoo.com):
 
 
 
 ...   
 3            0  <a href=http://www.aaronsw.com/weblog/>
 
 Aaron...   
 4            0  Oh yeah, the link for more info:
 
 
 
 http://www...   
 ...        ...                                                ...   
 4631         0  Gregory Alan Bolcer:
 
 >I'm not sure since I ha...   
 4632         1  New Account For: zzzz@spamassassin.taint.org
 
 ...   
 4633         0  >>>>> "O" == Owen Byrne <owen@permafrost.net> ...   
 4634         0  This is an automated response to a message you...   
 4635         0  http://www.ouchytheclown.com/welcome.html
 
 
 
 
 ...   
 
                                    FILE_NAME  
 0     00118.141d803810acd9d4fc23db103dddfcd9  
 1     00463.0bc4e08af0529dd773d9f10f9225

In [90]:
train_df['CATEGORY'].value_counts()

CATEGORY
0    3112
1    1524
Name: count, dtype: int64

In [91]:
test_df['CATEGORY'].value_counts()

CATEGORY
0    788
1    372
Name: count, dtype: int64

# Data preprocessing

In [58]:
token_count = {}

for message in train_df['MESSAGE']:
    tokenized_message = message_tokenizer(message)

    for token in tokenized_message:
        if token in token_count:
            token_count[token] += 1
        else:
            token_count[token] = 1

token_count, len(token_count)

({'html': 4175,
  'font': 35005,
  'back': 1055,
  'ffffff': 2535,
  'style': 3349,
  'background': 789,
  'color': 9642,
  'size': 13107,
  '3': 3581,
  'ptsize': 450,
  '12': 985,
  'b': 12856,
  'viagra': 66,
  '000000': 1923,
  '2': 7993,
  '10': 2182,
  'family': 1491,
  'sansserif': 314,
  'face': 9950,
  'arial': 6187,
  'lang': 419,
  '0': 9445,
  'br': 16013,
  'breakthrough': 22,
  'medication': 50,
  'impotence': 13,
  'delivered': 79,
  'mailbox': 71,
  'without': 658,
  'leaving': 50,
  'computer': 640,
  'simply': 377,
  'click': 2144,
  'href': 3875,
  'http': 14926,
  'host': 158,
  '1bulk': 12,
  'email': 4015,
  'software': 1129,
  'com': 11675,
  'ch4': 12,
  'pharm': 12,
  'blue': 181,
  'le': 680,
  '5': 2932,
  'minute': 366,
  'complete': 403,
  'line': 1307,
  'consultation': 68,
  'many': 1004,
  'case': 681,
  '24': 575,
  'nbsp': 9732,
  'hour': 589,
  'gt': 108,
  'website': 488,
  'treatment': 33,
  'compromised': 12,
  'sexual': 120,
  'function': 202,
  '

In [59]:
def keep_token(processed_token, threshold=10000):
    if processed_token not in token_count:
        return False
    else:
        return token_count[processed_token] > threshold

In [60]:
keep_token('quick', 100)

True

In [61]:
features = set()

for token in token_count:
    if keep_token(token, 9981) == True:
        features.add(token)

features = list(features)
features

['com', 'td', 'b', 'http', 'p', '3d', 'font', 'tr', 'br', 'size']

In [62]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'com': 0,
 'td': 1,
 'b': 2,
 'http': 3,
 'p': 4,
 '3d': 5,
 'font': 6,
 'tr': 7,
 'br': 8,
 'size': 9}

In [63]:
message_tokenizer('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

## Bag of word approach

**"Bag of Words" (count vector)**

**-> T_s = [http  tr  size  3d  font  br  com  td   p   b]**

**-> I_s = [0      1    2    3    4    5    6   7   8   9]**

**-> V_s = [0,   0,   0,   1,  2,   1,   2,   0,  0,  1]**

*Res*: `[0.,  0.,  0.,   1., 2.,  1., 2.,  0., 0., 1.]`

In [64]:
import numpy as np

def message_to_count_vector(message):
    count_vector = np.zeros(len(features))
    useful_tokens = message_tokenizer(message)
    for token in useful_tokens:
        if token not in features:
            continue
        token_index = token_to_index_mapping[token]
        count_vector[token_index] += 1
    return count_vector.astype('int64')

In [65]:
message_to_count_vector(train_df['MESSAGE'].iloc[3])

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [66]:
message_to_count_vector(train_df['MESSAGE'].iloc[10])

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [67]:
def df_to_X_Y(df: pd.DataFrame):
    y = df['CATEGORY'].to_numpy().astype('int64')
    X = []
    for message in df['MESSAGE']:
        count_vector = message_to_count_vector(message=message)
        X.append(count_vector)
    return np.array(X).astype(int), np.array(y).astype(int)

In [68]:
X_train, y_train = df_to_X_Y(df=train_df)
X_test, y_test = df_to_X_Y(df=test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4636, 10), (4636,), (1160, 10), (1160,))

## Scaling the `X_train` and `X_test`

In [69]:
from sklearn.preprocessing import StandardScaler

s_scaler = StandardScaler()
X_train_scaled, X_test_scaled = s_scaler.fit_transform(X_train), s_scaler.fit_transform(X_test)
X_train_scaled, X_test_scaled

(array([[ 0.84303159, -0.2408693 , -0.077147  , ..., -0.26232183,
          1.37277016, -0.07924315],
        [-0.06741672,  0.00967368,  0.92077014, ..., -0.14917297,
          0.86168575,  0.39972848],
        [-0.19748076, -0.2408693 , -0.27673043, ..., -0.26232183,
         -0.16048307, -0.2708318 ],
        ...,
        [-0.06741672, -0.2408693 , -0.27673043, ..., -0.26232183,
         -0.16048307, -0.2708318 ],
        [-0.06741672, -0.2408693 , -0.27673043, ..., -0.26232183,
         -0.16048307, -0.2708318 ],
        [ 0.19271137, -0.2408693 , -0.27673043, ..., -0.26232183,
         -0.16048307, -0.2708318 ]]),
 array([[-0.08846002, -0.24243477, -0.28549788, ..., -0.25474501,
         -0.16872934, -0.14736084],
        [-0.2626767 , -0.24243477, -0.28549788, ..., -0.25474501,
         -0.16872934, -0.14736084],
        [-0.08846002, -0.24243477, -0.28549788, ..., -0.25474501,
         -0.16872934, -0.14736084],
        ...,
        [-0.17556836, -0.24243477, -0.28549788, ..., -

# Training classification models

In [92]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svc = SVC(kernel='linear', gamma='scale', class_weight='balanced')
svc.fit(X=X_train_scaled, y=y_train)
predictions_svc = svc.predict(X=X_test_scaled)
print(classification_report(y_pred=predictions_svc, y_true=y_test))

              precision    recall  f1-score   support

           0       0.81      0.99      0.90       788
           1       0.98      0.52      0.68       372

    accuracy                           0.84      1160
   macro avg       0.90      0.76      0.79      1160
weighted avg       0.87      0.84      0.83      1160



In [93]:
svc = SVC(kernel='rbf', gamma='scale', class_weight='balanced')
svc.fit(X=X_train_scaled, y=y_train)
predictions_svc = svc.predict(X=X_test_scaled)
print(classification_report(y_pred=predictions_svc, y_true=y_test))

              precision    recall  f1-score   support

           0       0.81      1.00      0.90       788
           1       0.98      0.51      0.67       372

    accuracy                           0.84      1160
   macro avg       0.90      0.75      0.79      1160
weighted avg       0.87      0.84      0.82      1160



In [94]:
from sklearn.linear_model import LogisticRegression

l_regression = LogisticRegression(penalty='l2', random_state=1, solver='saga', class_weight='balanced')
pred_logistic = l_regression.fit(X=X_train_scaled, y=y_train).predict(X_test_scaled)
print(classification_report(y_pred=pred_logistic, y_true=y_test))

              precision    recall  f1-score   support

           0       0.82      0.99      0.90       788
           1       0.98      0.53      0.69       372

    accuracy                           0.85      1160
   macro avg       0.90      0.76      0.79      1160
weighted avg       0.87      0.85      0.83      1160



In [116]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_jobs=300,
    criterion='log_loss',
    class_weight='balanced',
    random_state=1,
    max_features=len(features),
    warm_start=True,
).fit(X_train, y_train)
print(classification_report(y_true=y_test, y_pred=rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       788
           1       0.81      0.65      0.72       372

    accuracy                           0.84      1160
   macro avg       0.83      0.79      0.80      1160
weighted avg       0.83      0.84      0.83      1160



If you see it in all of those, you can see the class imbalance. Therefore, there has to be augmented data for non-spam (1) category.

## Balanced data model training

We'll create synthetic data by generating new samples based on the existing data points in the minority class. The **SMOTE (Synthetic Minority Over-sampling Technique)** algorithm is popular for generating synthetic data by interpolating between existing minority class samples.

In [100]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=1)

X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
X_test_balanced, y_test_balanced = smote.fit_resample(X_test_scaled, y_test)

l_regression = LogisticRegression(penalty='l2', random_state=1, solver='saga', class_weight='balanced')
pred_logistic_smote = l_regression.fit(X=X_train_balanced, y=y_train_balanced).predict(X_test_balanced)

print(classification_report(y_pred=pred_logistic_smote, y_true=y_test_balanced))

              precision    recall  f1-score   support

           0       0.67      0.99      0.80       788
           1       0.99      0.52      0.68       788

    accuracy                           0.76      1576
   macro avg       0.83      0.76      0.74      1576
weighted avg       0.83      0.76      0.74      1576



It looks like use of SMOTE to balance the classes had a noticeable impact on model's performance. Here’s what the output tells us:

### Observations:
- **Precision and Recall Trade-off**: Your recall for class `0` (majority class) is very high (0.99), indicating that the model is very good at identifying `0` instances. However, the precision for class `0` has decreased to 0.67, which means that when the model predicts `0`, it's correct about 67% of the time.
- **Class `1` (minority class) Performance**: The precision for class `1` has improved to 0.99, which is excellent, but the recall is lower at 0.52. This indicates that while the model is very good at identifying when it predicts class `1`, it misses half of the actual class `1` instances.
- **Overall Accuracy**: Your model's accuracy has dropped to 0.76, which may be a result of the imbalance correction affecting how well it generalizes to both classes.

### Key Takeaways:
1. **SMOTE has helped with balancing** the dataset by creating synthetic instances, improving the model's ability to identify the minority class (`1`) when it's predicted.
2. **Precision and recall** are not in perfect balance; improving one reduces the other, so consider the specific use case and which metric matters more for your objectives.
3. **Overfitting Potential**: You may be at risk of overfitting due to the synthetic data generated by SMOTE. This is especially true if the number of synthetic samples is high.

### Recommendations for Improvement:
- **Try Different Sampling Techniques**: You could experiment with **SMOTE-NC** (SMOTE for mixed-type data) or **ADASYN** for more adaptive oversampling.
- **Use Ensemble Models**: Consider using ensemble methods like **Random Forest** or **Gradient Boosting** (e.g., `XGBoost` or `LightGBM`) with **balanced class weights**.
- **Tune Hyperparameters**: Adjust hyperparameters for `LogisticRegression` (e.g., `C` value for regularization) and try techniques like **grid search** or **random search** for optimal results.
- **Evaluate with Different Metrics**: If you're dealing with an imbalanced problem, focus more on metrics like **precision-recall AUC**, **F1-score**, and **confusion matrix** rather than just accuracy.

### Next Steps:
1. **Plot Precision-Recall Curve**: Visualize how precision and recall change with different threshold values.
2. **Cross-Validation**: Ensure you're validating your model using cross-validation to get a more robust estimate of performance.
3. **Try Class Weights**: Combining SMOTE with `class_weight='balanced'` may help optimize results without losing generality.

In [118]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_jobs=300,
    criterion='entropy',
    class_weight='balanced',
    random_state=1,
    max_features=len(features),
    warm_start=True,
)\
    .fit(X_train_balanced, y_train_balanced)
print(classification_report(y_true=y_test_balanced, y_pred=rf.predict(X_test_balanced)))

              precision    recall  f1-score   support

           0       0.69      0.61      0.65       788
           1       0.65      0.73      0.69       788

    accuracy                           0.67      1576
   macro avg       0.67      0.67      0.67      1576
weighted avg       0.67      0.67      0.67      1576



In [119]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X_train_balanced, y_train_balanced, cv=5, scoring='f1')
print(f'Cross-Validation F1 Score: {scores.mean()}')

Cross-Validation F1 Score: 0.7725919260601385


In [132]:
import xgboost as xgb

xgb_classifier = xgb.XGBClassifier(
    n_estimators=1000,
    n_jobs=200,
    grow_policy='lossguide',
    learning_rate=0.001,
    booster='gbtree',
    random_state=1,
    tree_method='exact',
    eval_metric='logloss'
).fit(X=X_train_balanced, y=y_train_balanced)

print(classification_report(y_true=y_test_balanced, y_pred=xgb_classifier.predict(X_test_balanced)))

              precision    recall  f1-score   support

           0       0.67      0.92      0.78       788
           1       0.88      0.56      0.68       788

    accuracy                           0.74      1576
   macro avg       0.78      0.74      0.73      1576
weighted avg       0.78      0.74      0.73      1576



In [131]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(xgb_classifier, X_train_balanced, y_train_balanced, cv=5, scoring='f1')
print(f'Cross-Validation F1 Score: {scores.mean()}')

Cross-Validation F1 Score: 0.74703133779682


# Ensembling based model training

As of now, our best models with balanced dataset of being trained and tested are that of `l_regression` and `xgb_classifier`. We'll combine them up for the best of the results

## VotingClassifier

A simple way to combine models by averaging their predictions. This method combines the predictions from multiple models and selects the most frequent class (hard voting) or the average of probabilities (soft voting) as the final prediction. Here's how you can do it:

In [133]:
from sklearn.ensemble import VotingClassifier

ensemble_model = VotingClassifier(
    estimators=[
        ('log_reg', l_regression),
        ('xgb', xgb_classifier)
    ],
    voting='soft'
)

ensemble_model.fit(X_train_balanced, y_train_balanced)

print(classification_report(y_true=y_test_balanced, y_pred=ensemble_model.predict(X_test_balanced)))

              precision    recall  f1-score   support

           0       0.68      0.98      0.80       788
           1       0.96      0.53      0.68       788

    accuracy                           0.75      1576
   macro avg       0.82      0.75      0.74      1576
weighted avg       0.82      0.75      0.74      1576



In [134]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(ensemble_model, X_train_balanced, y_train_balanced, cv=5, scoring='f1')
print(f'Cross-Validation F1 Score: {scores.mean()}')

Cross-Validation F1 Score: 0.7066587760971305


## StackingClassifier

Use the predictions of one set of models as input features for another model to learn how to combine them optimally. A stacking classifier combines the predictions of base models and uses a meta-model to find the optimal combination of them. This is helpful when you want a model to learn how to best combine the predictions from LogisticRegression and XGBoost.

In [136]:
from sklearn.ensemble import StackingClassifier

base_models = [('l_regression', l_regression), ('xgb_classifier', xgb_classifier)]

stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=rf
)
stacked_model.fit(X_train_balanced, y_train_balanced)
print(classification_report(y_true=y_test_balanced, y_pred=stacked_model.predict(X_test_balanced)))

              precision    recall  f1-score   support

           0       0.68      0.90      0.78       788
           1       0.86      0.58      0.69       788

    accuracy                           0.74      1576
   macro avg       0.77      0.74      0.73      1576
weighted avg       0.77      0.74      0.73      1576



In [137]:
scores = cross_val_score(stacked_model, X_train_balanced, y_train_balanced, cv=5, scoring='f1')
print(f'Cross-Validation F1 Score: {scores.mean()}')

Cross-Validation F1 Score: 0.740311276852135


# End notes
While doing this project, I did the following mistakes:
 * **Not identifying the patterns of the spam and non-spam emails**: I should've first identified the pattern of token(s)' occurrences with both spam and non-spam kind of emails.
 * **Simulating randomness**: Instead of randomness by going for `df.sample`, I should've gone for suitable data augmentation technique.
 * **A good dataset**: This dataset is way too small for email classification, for spam classification, there're also emoji, phishing techniques involved as well, and other vice versa.

Here are a few additional points where improvements could be made or explored further:

---

### **1. Lack of Data Preprocessing Evaluation**
   - **Possible Issue**: Preprocessing choices like tokenization, lemmatization, and threshold-based inclusion may not align optimally with the nature of the dataset.
   - **Suggested Improvement**: Evaluate whether preprocessing steps:
     - Retain meaningful information.
     - Don't inadvertently remove critical spam indicators (e.g., numbers, special characters, or URLs).
     - Include domain-specific stopwords (e.g., "click", "free", "offer" for spam classification).
   - Experiment with different preprocessing pipelines and validate their impact on model performance.

---

### **2. Overlooking Model Interpretability**
   - **Possible Issue**: The project could benefit from analyzing which features or tokens contribute most to predictions. Without this, we're working in a "black box" mode.
   - **Suggested Improvement**:
     - Use **SHAP (SHapley Additive exPlanations)** or **LIME (Local Interpretable Model-agnostic Explanations)** to identify the importance of specific tokens.
     - Analyze token contributions for spam vs. non-spam predictions to uncover new insights into patterns and biases.

---

### **3. Focusing Solely on SMOTE Oversampling**
   - **Possible Issue**: While SMOTE balances the dataset, it creates synthetic samples that might not accurately represent the data distribution, especially for high-dimensional data like text.
   - **Suggested Improvement**:
     - Compare SMOTE results with **undersampling**, **class-weight adjustments**, or **other oversampling techniques** like ADASYN.
     - Incorporate **data augmentation** methods to increase diversity without relying solely on resampling algorithms.

---

### **4. Model-Specific Overfitting Risk**
   - **Possible Issue**: With ensemble models like XGBoost and RandomForest, there's a chance of overfitting to the balanced dataset if hyperparameters aren’t carefully tuned.
   - **Suggested Improvement**:
     - Use a **validation set** alongside cross-validation to detect overfitting.
     - Regularize the model using appropriate parameters (e.g., `gamma` for XGBoost, `min_samples_split` for RandomForest).

---

### **5. Overlooking Ensemble Diversity**
   - **Possible Issue**: While the ensemble approach with logistic regression and XGBoost is promising, both models might share similar biases.
   - **Suggested Improvement**:
     - Increase diversity in your ensemble by including fundamentally different algorithms (e.g., Naïve Bayes or SVM with custom kernels).
     - Use a meta-model (e.g., stacking with a Logistic Regression or a LightGBM as a meta-learner) to combine predictions more effectively.

---

### **6. Overreliance on Accuracy and F1 Scores**
   - **Possible Issue**: Metrics like accuracy and F1-score might not fully capture the model's ability to differentiate between spam and non-spam emails.
   - **Suggested Improvement**:
     - Incorporate other evaluation metrics like:
       - **Precision-Recall AUC**: Especially important for imbalanced datasets.
       - **False Positive Rate (FPR)**: To check if the spam filter mistakenly classifies valid emails as spam.

---

### **7. Dataset Size vs. Complexity Trade-off**
   - **Possible Issue**: Applying highly complex models like XGBoost on a small dataset might not fully leverage their capabilities.
   - **Suggested Improvement**:
     - Simplify the model if scaling up the dataset isn’t an option (e.g., using simpler classifiers like Logistic Regression or Decision Trees).
     - Alternatively, explore transfer learning to reduce dependency on dataset size.

---

### **8. Limited Real-World Email Features**
   - **Possible Issue**: Spam classification often involves metadata beyond email text, such as:
     - Sender reputation.
     - Email headers (e.g., "From", "Reply-To").
     - Attached links and domains.
   - **Suggested Improvement**:
     - Enrich the dataset with such features if possible.
     - Use a multi-modal approach where text data and metadata are both included as inputs.

---

### **9. Lack of Robustness Testing**
   - **Possible Issue**: Without robustness testing, the model might perform poorly when applied to unseen, real-world datasets.
   - **Suggested Improvement**:
     - Test the model on an **external dataset** or simulate real-world scenarios, such as:
       - Emails with heavy use of emojis, URLs, or phishing techniques.
       - Emails in different languages or with mixed character sets.

---

### **10. Not Exploring Sequential Patterns**
   - **Possible Issue**: Emails have natural sequences of words and phrases that models like Bag-of-Words or TF-IDF might ignore.
   - **Suggested Improvement**:
     - Experiment with sequential models like **LSTMs**, **GRUs**, or **Transformers** to capture temporal dependencies.
     - Alternatively, explore **n-gram features** for a middle ground.

---

By addressing these points, you can further refine your project and make it robust, scalable, and insightful.

| Platform   | Link                                           |
|------------|------------------------------------------------|
| GitHub     | [JackTheProgrammer](https://github.com/JackTheProgrammer) |
| LinkedIn   | [Fawad Awan](https://www.linkedin.com/in/fawad-awan-893a58171/) |