Importing the Dependencies

In [80]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#classification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#importing the performance evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Data Collection and Preprocessing

In [81]:
#loading the dataset into a pandas DataFrame
# NOTE(review): with sep="\t" every row is parsed into ONE column named
# 'v1,v2,,,' (see the printed frame below), i.e. the file looks
# comma-separated. Later cells rely on this single-column layout and split
# label/text with regexes, so changing the separator means updating them too.
# NOTE(review): '/content/spam.csv' is an absolute path (presumably Colab) —
# confirm it exists in the target environment.
df = pd.read_csv('/content/spam.csv',sep = "\t", encoding='latin')

In [82]:
# Plain-text dump of the raw frame to inspect how the file was parsed.
print(df)

                                               v1,v2,,,
0     ham,"Go until jurong point, crazy.. Available ...
1                  ham,Ok lar... Joking wif u oni...,,,
2     spam,Free entry in 2 a wkly comp to win FA Cup...
3     ham,U dun say so early hor... U c already then...
4     ham,"Nah I don't think he goes to usf, he live...
...                                                 ...
5569  spam,"This is the 2nd time we have tried 2 con...
5570       ham,Will Ì_ b going to esplanade fr home?,,,
5571  ham,"Pity, * was in mood for that. So...any ot...
5572  ham,The guy did some bitching but I acted like...
5573                  ham,Rofl. Its true to its name,,,

[5574 rows x 1 columns]


In [83]:
# Substitute an empty string for every missing (NaN/None) entry.
df = df.fillna('')

In [84]:
# Preview the first five rows of the DataFrame.
df.head()

Unnamed: 0,"v1,v2,,,"
0,"ham,""Go until jurong point, crazy.. Available ..."
1,"ham,Ok lar... Joking wif u oni...,,,"
2,"spam,Free entry in 2 a wkly comp to win FA Cup..."
3,"ham,U dun say so early hor... U c already then..."
4,"ham,""Nah I don't think he goes to usf, he live..."


In [85]:
# Preview the last five rows of the DataFrame.
df.tail()

Unnamed: 0,"v1,v2,,,"
5569,"spam,""This is the 2nd time we have tried 2 con..."
5570,"ham,Will Ì_ b going to esplanade fr home?,,,"
5571,"ham,""Pity, * was in mood for that. So...any ot..."
5572,"ham,The guy did some bitching but I acted like..."
5573,"ham,Rofl. Its true to its name,,,"


In [86]:
# Size of the DataFrame as a (rows, columns) tuple.
df.shape

(5574, 1)

In [87]:
# Create the label column from the leading "ham"/"spam" tag of each raw row.
# The alternation is grouped so that the ^ anchor applies to BOTH words: an
# ungrouped "^ham|spam" parses as (^ham)|(spam) and could pick up the word
# "spam" from the middle of a row instead of its tag.
# A row that does not start with either tag raises IndexError.
df['label'] = df['v1,v2,,,'].apply(lambda x: re.findall(r"^(?:ham|spam)", x)[0])

In [88]:
# Preview the first five rows again to confirm the new label column.
df.head()

Unnamed: 0,"v1,v2,,,",label
0,"ham,""Go until jurong point, crazy.. Available ...",ham
1,"ham,Ok lar... Joking wif u oni...,,,",ham
2,"spam,Free entry in 2 a wkly comp to win FA Cup...",spam
3,"ham,U dun say so early hor... U c already then...",ham
4,"ham,""Nah I don't think he goes to usf, he live...",ham


In [89]:
# Give the single raw column 'v1,v2,,,' the descriptive name 'message'.
df = df.rename(columns={'v1,v2,,,': 'message'})

In [90]:
# Confirm the column rename.
df.head()

Unnamed: 0,message,label
0,"ham,""Go until jurong point, crazy.. Available ...",ham
1,"ham,Ok lar... Joking wif u oni...,,,",ham
2,"spam,Free entry in 2 a wkly comp to win FA Cup...",spam
3,"ham,U dun say so early hor... U c already then...",ham
4,"ham,""Nah I don't think he goes to usf, he live...",ham


In [91]:
# Stemmer instance used by the cleaning function defined in the next cell.
port_stem = PorterStemmer()
# nltk is already imported in the first cell; this repeated import is harmless.
import nltk
# Fetch the stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [92]:
#creating a function for cleaning and stemming the textual feature
def cleaning(message):
  """Strip the label prefix from a raw row, then normalise and stem its text.

  Steps: drop the leading ``ham,`` / ``spam,`` tag, replace every non-letter
  character with a space, lowercase, split on whitespace, drop English stop
  words, Porter-stem the remaining words and join them with single spaces.

  Args:
    message: Raw row text, expected to start with ``ham,`` or ``spam,``.

  Returns:
    The cleaned, stemmed message as one space-separated string.
  """
  # Build the stop-word set once per call instead of calling
  # stopwords.words() for every word inside the comprehension.
  english_stop_words = set(stopwords.words('english'))

  # Group the alternation so ^ anchors both tags; an ungrouped
  # '^ham,|spam,' would also delete "spam," anywhere inside the message.
  cleaned_message = re.sub(r'^(?:ham|spam),', '', message)
  cleaned_message = re.sub(r'[^a-zA-Z]', ' ', cleaned_message)
  words = cleaned_message.lower().split()
  stemmed_words = [port_stem.stem(word) for word in words if word not in english_stop_words]

  return ' '.join(stemmed_words)

In [93]:
#applying the cleaning function to the textual feature
# NOTE(review): this overwrites the raw 'message' column in place, so
# re-running the cell cleans already-cleaned text.
df['message'] = df['message'].apply(cleaning)

In [94]:
# Inspect the cleaned and stemmed messages.
df.head()

Unnamed: 0,message,label
0,go jurong point crazi avail bugi n great world...,ham
1,ok lar joke wif u oni,ham
2,free entri wkli comp win fa cup final tkt st m...,spam
3,u dun say earli hor u c alreadi say,ham
4,nah think goe usf live around though,ham


Label Encoding

In [95]:
#label spam mail as 0; ham mail as 1;
#using the mapping method
# The dict is deliberately not named `map`, which would shadow Python's
# built-in map() for every later cell.
label_map = {'spam':0,'ham':1}
df['label']  = df['label'].map(label_map)

In [96]:
# Cast the label column to pandas' nullable Int64 dtype.
df['label'] = df['label'].astype('Int64')

Spam - 0

Ham - 1

In [97]:
# Check the numerically encoded labels.
df.head()

Unnamed: 0,message,label
0,go jurong point crazi avail bugi n great world...,1
1,ok lar joke wif u oni,1
2,free entri wkli comp win fa cup final tkt st m...,0
3,u dun say earli hor u c alreadi say,1
4,nah think goe usf live around though,1


In [98]:
# Pull out the text feature (X) and the encoded target (Y) from the frame.
X, Y = df['message'], df['label']

In [99]:
# Class distribution of the encoded labels (shows the ham/spam imbalance).
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,4827
0,747


In [53]:
# Confirm features and labels have the same number of rows.
X.shape, Y.shape

((5574,), (5574,))

In [28]:
# Plain-text view of the cleaned messages that will be vectorized.
print(X)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5569    nd time tri contact u u pound prize claim easi...
5570                                b go esplanad fr home
5571                                    piti mood suggest
5572    guy bitch act like interest buy someth els nex...
5573                                       rofl true name
Name: message, Length: 5574, dtype: object


In [101]:
# Turn the cleaned messages into a TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on the full dataset, before the
# train/test split performed in a later cell, so the vocabulary and IDF
# weights are learned from rows that end up in the test set. Consider
# splitting first, then fit_transform on the training rows and transform on
# the test rows.
# NOTE(review): X is overwritten with the matrix, so this cell cannot be
# re-run on its own without first re-running the cell that defines X.
feature_extraction = TfidfVectorizer(min_df=1, stop_words = 'english', lowercase= True)

X = feature_extraction.fit_transform(X)

**Data Sampling**:
 Oversampling

In [105]:
#oversampling the minority class so both classes have the same number of rows
#importing the RandomOverSampler
from imblearn.over_sampling import RandomOverSampler

# NOTE(review): resampling is applied to the full dataset, before the
# train/test split in the next cell. RandomOverSampler balances the classes
# by repeating minority-class rows, so copies of the same message can land in
# both the training and the test set, which inflates the reported test scores.
# Consider splitting first and resampling the training portion only.
# Define the oversampler (sampling_strategy=1 requests a 1:1 class ratio)
ros = RandomOverSampler(sampling_strategy=1, random_state=42)
# Perform the oversampling
X, Y = ros.fit_resample(X, Y)

# Verify the class distribution
Y.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,4827
0,4827


Splitting into Train and Test

In [106]:
#splitting the data into train (80%) and test (20%) sets with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 3)

In [107]:
# Shapes of the train/test feature matrices and label vectors.
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((7723, 6166), (1931, 6166), (7723,), (1931,))

Model Training and Prediction

In [108]:
# Create a dictionary to store the classification models, keyed by the
# display name used when printing results.
models = {
    #'SVC': SVC(kernel='poly'),
    # Seed the forest so its random sampling, and therefore the reported
    # metrics, are reproducible across runs.
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(max_iter= 1000)
}

In [109]:
# Loop through the models; for each one fit on the training split, predict on
# the test split and print accuracy, classification report and confusion matrix.
# NOTE(review): the test split is drawn from data that was oversampled before
# splitting (see the oversampling cell above), so these scores are likely
# optimistic.
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, Y_train)

    # Predict labels for the held-out test rows
    predictions = model.predict(X_test)

    # Calculate accuracy for the test data
    accuracy = accuracy_score(Y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy}")

    # Per-class precision / recall / f1 via the classification report
    print(f'{model_name}: Classification Report')
    print(classification_report(Y_test, predictions))

    # Confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(Y_test, predictions)
    print(f"{model_name} Confusion Matrix:")
    print(np.array2string(cm, separator=', '))

    # Visual separator between models
    print("\n" + "="*40 + "\n")

RandomForest Accuracy: 0.9994821336095288
RandomForest: Classification Report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       972
         1.0       1.00      1.00      1.00       959

    accuracy                           1.00      1931
   macro avg       1.00      1.00      1.00      1931
weighted avg       1.00      1.00      1.00      1931

RandomForest Confusion Matrix:
[[972,   0],
 [  1, 958]]


KNeighbors Accuracy: 0.9870533402382186
KNeighbors: Classification Report
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99       972
         1.0       0.97      1.00      0.99       959

    accuracy                           0.99      1931
   macro avg       0.99      0.99      0.99      1931
weighted avg       0.99      0.99      0.99      1931

KNeighbors Confusion Matrix:
[[947,  25],
 [  0, 959]]


LogisticRegression Accuracy: 0.9922320041429311
LogisticRegression: Classifica