In [69]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
# Load and preprocess data
df = pd.read_csv('C:\\Users\\qhaas\\Desktop\\REPO\\SpamSMSDetection\\spam.csv', encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [15]:
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [16]:
#Handling Missing and duplicate value
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [17]:
df.duplicated().sum()

403

In [18]:
df = df.drop_duplicates(keep='first')

In [19]:
df.shape

(5169, 2)

In [21]:
#Label Encoding

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['v1'] = encoder.fit_transform(df['v1'])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['v1'] = encoder.fit_transform(df['v1'])


Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [22]:
df['v1'].value_counts()

v1
0    4516
1     653
Name: count, dtype: int64

In [32]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qhaas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qhaas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [35]:
#Remove special characters and punctuation, Convert text to lowercase,Remove stop words & Perform stemming .

def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in filtered_text]
    return ' '.join(stemmed_text)
df['v2'] = df['v2'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['v2'] = df['v2'].apply(preprocess_text)


In [36]:
#Splitting data
X = df['v2']
y = df['v1']
X_train1, X_test1, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train1)
X_test = vectorizer.transform(X_test1)

In [41]:
X_train

<4135x6354 sparse matrix of type '<class 'numpy.int64'>'
	with 34237 stored elements in Compressed Sparse Row format>

In [43]:
y_train

2228    0
5529    0
2149    0
5058    1
5051    0
       ..
4740    0
474     0
3266    0
4016    0
879     1
Name: v1, Length: 4135, dtype: int64

In [61]:
#NAIVE BAYES
nb_classifier=MultinomialNB()

In [62]:
#train data on training data
nb_classifier.fit(X_train, y_train)

In [65]:
y_pred_nb= nb_classifier.predict(X_test)

In [68]:
#Accuracy by NB
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f'Accuracy by NB: {accuracy_nb:.2f}')

# Classification report by NB
report_nb = classification_report(y_test, y_pred_nb)
print(f'Classification Report by NB:\n{report_nb}')

# Confusion matrix by NB
cm_nb = confusion_matrix(y_test, y_pred_nb)
print(f'Confusion Matrix by NB:\n{cm_nb}')

Accuracy by NB: 0.98
Classification Report by NB:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       889
           1       0.96      0.92      0.94       145

    accuracy                           0.98      1034
   macro avg       0.97      0.96      0.96      1034
weighted avg       0.98      0.98      0.98      1034

Confusion Matrix by NB:
[[883   6]
 [ 12 133]]


In [47]:
#LOGISTIC REGRESSION
model= LogisticRegression()

In [48]:
model.fit(X_train, y_train)

In [50]:
X_train_predict_lr= model.predict(X_train)
train_data_acc_lr=accuracy_score(y_train, X_train_predict_lr)

In [70]:
#SVM
model= SVC(kernel='linear')

In [71]:
model.fit(X_train, y_train)

In [76]:
#predtion of test data by SVM
y_pred_svm= model.predict(X_test)

#accuracy score by SVM
accuracy_svm=accuracy_score(y_test, y_pred_svm)
print("Accuracy by SVM=", accuracy_svm)

#classification report by SVM
report_svm=classification_report(y_test, y_pred_svm)
print("Classification Report by SVM=\n", report_svm)

#Confusion matrix by SVM
cm_svm=confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix by SVM:")
print(cm_svm)

Accuracy by SVM= 0.9845261121856866
Classification Report by SVM=
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       889
           1       1.00      0.89      0.94       145

    accuracy                           0.98      1034
   macro avg       0.99      0.94      0.97      1034
weighted avg       0.98      0.98      0.98      1034

Confusion Matrix by SVM:
[[889   0]
 [ 16 129]]
