In [181]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data = pd.read_csv('data_spam.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zahran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zahran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zahran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [182]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [183]:
# Dropping the null columns
data.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
# Renamining the columns to meaningful names
data = data.rename(columns={'v1': 'label', 'v2': 'text'})

In [184]:
duplicates = data.duplicated()
duplicates.sum()

403

In [185]:
# Dropping the duplicates
data = data.drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5169 non-null   object
 1   text    5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


In [186]:
# Lowering all the characters in the text column
data['text'] = data['text'].str.lower()

In [187]:
# Keeping only english letters
for index, row in data.iterrows():
    row['text'] = re.sub(r"http\S+|www\S+", "", row['text'])
    row['text'] = re.sub('[\W_]+', ' ', row['text'])
    row['text'] = re.sub(r'[0-9]', '', row['text'])

In [188]:
# Removing the stop words in the text column (the, at, etc...)
stop = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [189]:
# Tokenization
data['tokens'] = data['text'].apply(nltk.word_tokenize)

In [190]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
data['lemmatizeList']= ''
for index, row in data.iterrows():
    lemmatizelist = []
    for i in range(len(row['tokens'])):
         lemmatizelist.append(lemmatizer.lemmatize(row['tokens'][i]))
    row['lemmatizeList'] = lemmatizelist

In [191]:
# Making a string column and putting the lemmatized words in the lemmatizeList column in each row separated with a space to be passed to the TF-IDF.
data['lemmatizeString'] = ''
for index, row in data.iterrows():
    lemmatizeString = ''
    for i in range(len(row['lemmatizeList'])):
        if i == len(row['lemmatizeList']) - 1:
            lemmatizeString += row['lemmatizeList'][i]
        else:
            lemmatizeString += row['lemmatizeList'][i] + " "
    row['lemmatizeString'] = lemmatizeString

In [192]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

TfIdf = TfidfVectorizer()
x = TfIdf.fit_transform(data['lemmatizeString'])

tfidf_tokens = TfIdf.get_feature_names()
df_tfidfvect = pd.DataFrame(data = x.toarray(), columns = tfidf_tokens)
df_tfidfvect

Unnamed: 0,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,abel,...,zed,zero,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [193]:
import sklearn
from sklearn.model_selection import train_test_split

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

label_map = {'spam': 1, 'ham': 0}


# Use map function to apply label_map to label_column
data['label'] = data['label'].map(label_map)
data['label'] = data['label'].astype(int)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, data['label'], test_size=0.2, random_state=42)

In [194]:
data.head(5)

Unnamed: 0,label,text,tokens,lemmatizeList,lemmatizeString
0,0,go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...,"[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,...",free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c already say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,0,nah think goes usf lives around though,"[nah, think, goes, usf, lives, around, though]","[nah, think, go, usf, life, around, though]",nah think go usf life around though


In [195]:
# Saving the dataframe after the preprocessing to a csv file.
data.to_csv("test.csv")

In [196]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Train a Naive Bayes classifier on the training data
clf = MultinomialNB().fit(X_train, y_train)

# Use the classifier to predict the labels for the testing data
y_pred = clf.predict(X_test)

# Compute the accuracy of the classifier on the testing data
target_names = ['spam', 'ham']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        spam       0.96      1.00      0.98       889
         ham       0.99      0.74      0.85       145

    accuracy                           0.96      1034
   macro avg       0.98      0.87      0.91      1034
weighted avg       0.96      0.96      0.96      1034



In [197]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, confusion_matrix

# Train a random forest classifier on the training data
clf = RandomForestClassifier().fit(X_train, y_train)

# Use the classifier to predict the labels for the testing data
y_pred = clf.predict(X_test)

# Compute the accuracy of the classifier on the testing data
target_names = ['spam', 'ham']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        spam       0.97      1.00      0.98       889
         ham       1.00      0.79      0.88       145

    accuracy                           0.97      1034
   macro avg       0.98      0.90      0.93      1034
weighted avg       0.97      0.97      0.97      1034



In [198]:
# Support Vector Machine SVM
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, confusion_matrix

# Train an SVM classifier on the training data
clf = SVC().fit(X_train, y_train)

# Use the classifier to predict the labels for the testing data
y_pred = clf.predict(X_test)

# Compute the accuracy of the classifier on the testing data
target_names = ['spam', 'ham']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        spam       0.97      1.00      0.98       889
         ham       0.98      0.81      0.89       145

    accuracy                           0.97      1034
   macro avg       0.98      0.91      0.94      1034
weighted avg       0.97      0.97      0.97      1034



In [199]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, confusion_matrix

# Train a decision tree classifier on the training data
clf = DecisionTreeClassifier().fit(X_train, y_train)

# Use the classifier to predict the labels for the testing data
y_pred = clf.predict(X_test)

# Compute the accuracy of the classifier on the testing data
target_names = ['spam', 'ham']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        spam       0.97      0.97      0.97       889
         ham       0.82      0.82      0.82       145

    accuracy                           0.95      1034
   macro avg       0.89      0.90      0.89      1034
weighted avg       0.95      0.95      0.95      1034

