<a href="https://colab.research.google.com/github/Moostafaaa/CognoRise-InfoTech/blob/main/SPAM_EMAIL_DETECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# for EDA and Visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# for preprocessing and modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d mfaisalqureshi/spam-email
! unzip spam-email.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/mfaisalqureshi/spam-email
License(s): CC0-1.0
spam-email.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  spam-email.zip
replace spam.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [4]:
df = pd.read_csv('spam.csv')

In [5]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import re
import string
import nltk


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
df.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


# Text Preprocessing

## Cleaning HTML tags and punctuations

In [11]:

def clean_html(sentence):
    clean_tags = re.compile('<.*?>')
    clean_text = re.sub(clean_tags, ' ', sentence)
    return clean_text

def clean_punc(sentence):
    one = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',one)
    return  cleaned

In [12]:
df['clean_html'] = df.Message.apply(clean_html)
df['clean_html_punc'] = df.clean_html.apply(clean_punc)
df['clean_html_punc'][10]

'Im gonna be home soon and i dont want to talk about this stuff anymore tonight  k Ive cried enough today '

In [13]:
df['clean_html_punc'][1400]

'You have registered Sinco as Payee  Log in at icicibank com and enter URN  &lt;&gt;  to confirm  Beware of frauds  Do NOT share or disclose URN to anyone '

## Removing Stopwords

In [14]:

def remove_stop(text):
  # Tokenize the text into words
  words = word_tokenize(text)

  # Filter out the stop words from the text
  filtered_words = [word for word in words if not word in stopwords.words('english')]

  # Join the filtered words into a string
  filtered_text = ' '.join(filtered_words)

  # return the filtered text
  return(filtered_text)

In [15]:
df['no_stop'] = df.clean_html_punc.apply(remove_stop)
df['no_stop'][1400]

'You registered Sinco Payee Log icicibank com enter URN & lt ; & gt ; confirm Beware frauds Do NOT share disclose URN anyone'

## Lemmatization

In [16]:

# prepare spacy model for lemmatization
import spacy.cli
spacy.cli.download("en_core_web_lg")



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [17]:
nlp = spacy.load("en_core_web_lg")

In [19]:
# defining lemmatization function
def lemm(text):
    lemme=[]
    for token in nlp(text):
        lemme.append(token.lemma_)

    return " ".join(lemme)

In [20]:

df['lemma'] = df.no_stop.apply(lemm)
df['lemma'][1400]

'you register Sinco Payee Log icicibank com enter URN & lt ; & gt ; confirm Beware fraud do not share disclose URN anyone'

# Feature Extraction : TF-IDF Vectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [22]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,3),sublinear_tf=False)

In [23]:
x = df.lemma

In [24]:
df.Category = df.Category.apply(lambda x: 1 if x == 'spam' else 0)

In [25]:
y = df.Category

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True,stratify=y)


In [29]:
tfidf.fit(x_train)
x_train_tfidf = tfidf.transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
print(x_train_tfidf.shape)
print(x_test_tfidf.shape)

(4457, 5000)
(1115, 5000)


# Model Selection

In [36]:
lr_tf = LogisticRegression(random_state=42, class_weight='balanced')
lr_tf.fit(x_train_tfidf, y_train)
y_pred1 = lr_tf.predict(x_test_tfidf)
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.91      0.92      0.91       149

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [42]:
LR_tf = LogisticRegression(max_iter=500, C=2.1, penalty='elasticnet', solver='saga', l1_ratio= 0.5, class_weight='balanced')
LR_tf.fit(x_train_tfidf, y_train)
y_pred = LR_tf.predict(x_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       966
           1       0.90      0.92      0.91       149

    accuracy                           0.97      1115
   macro avg       0.94      0.95      0.95      1115
weighted avg       0.98      0.97      0.98      1115



In [43]:
RF = RandomForestClassifier()
DT = DecisionTreeClassifier()
GB = GradientBoostingClassifier()
XGB = XGBClassifier()
KNN = KNeighborsClassifier()

In [45]:
models = [LR_tf, RF, DT, GB, XGB, KNN]

for model in models:
    model.fit(x_train_tfidf, y_train)
    y_pred = model.predict(x_test_tfidf)
    print(f"Model: {model.__class__.__name__}")
    print(classification_report(y_test, y_pred))

Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       966
           1       0.90      0.92      0.91       149

    accuracy                           0.97      1115
   macro avg       0.94      0.95      0.95      1115
weighted avg       0.98      0.97      0.98      1115

Model: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Model: DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       966
           1       0.90      0.81      0.86       149

    accuracy                           0.96      1115
   macro avg       0.94      0.90      0.9