In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import nltk
import os
import warnings

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

%matplotlib inline
warnings.filterwarnings('ignore')

os.environ["LOKY_MAX_CPU_COUNT"] = "4"  # Adjust the number based on your system

In [2]:
# Download NLTK data files (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Soban\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Mohammad
[nltk_data]     Soban\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Mohammad
[nltk_data]     Soban\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [3]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Now finding the count of duplicates in the dataset
print(f"The number of duplicate rows is {df.duplicated().sum()} from a total of {df.shape[0]} rows")
      
# Dropping the duplicates
df.drop_duplicates(inplace=True, keep='first')

# Checking the shape of the dataset after dropping duplicates
print(f"The number of duplicate rows is {df.duplicated().sum()} from a total of {df.shape[0]} rows after dropping duplicates")

The number of duplicate rows is 403 from a total of 5572 rows
The number of duplicate rows is 0 from a total of 5169 rows after dropping duplicates


In [5]:
# Now dropping the unwanted columns which starts with 'Unnamed'
df.drop(df.columns[df.columns.str.contains('Unnamed')], axis=1, inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Renaming the columns
df.rename(columns={'v1': 'label', 'v2': 'text'}, inplace=True)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Label Encoding
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Applying porter stemmer
ps = PorterStemmer()

def clean_text(text):
    text = text.lower()  # Converting to lowercase

    # Tokenizing the text
    text = word_tokenize(text)
    
    # Removing special characters
    y = [word for word in text if word.isalnum()]

    text = y[:]
    y.clear()

    # Removing stopwords and punctuation
    for word in text:
        if word not in stopwords.words('english') and word not in string.punctuation:
            y.append(word)
        
    text = y[:]
    y.clear()

    for word in text:
        y.append(ps.stem(word))

    return " ".join(y)

In [9]:
df['transformed_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,label,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [10]:
tfid = TfidfVectorizer(max_features = 500)

X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['label'].values

In [11]:
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 42)

In [12]:
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier(n_jobs=4)
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1', n_jobs=4)
rfc = RandomForestClassifier(n_estimators=50, random_state=2, n_jobs=4)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2, n_jobs=4)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2, n_jobs=4)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2, n_jobs=4)

In [13]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
    
}

In [14]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train,y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

In [None]:
accuracy_scores = []
precision_scores = []
for name , clfs in clfs.items():
    current_accuracy, current_precision = train_classifier(clfs, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9709864603481625
Precision:  0.952755905511811

For:  KNN
Accuracy:  0.9294003868471954
Precision:  0.9736842105263158

For:  NB
Accuracy:  0.9758220502901354
Precision:  0.9838709677419355

For:  DT
Accuracy:  0.9313346228239845
Precision:  0.8363636363636363

For:  LR
Accuracy:  0.9564796905222437
Precision:  0.8968253968253969

For:  RF
Accuracy:  0.971953578336557
Precision:  0.953125

For:  Adaboost
Accuracy:  0.9535783365570599
Precision:  0.8702290076335878
