In [1]:
import seaborn as sb
import pandas as pd
import numpy as np
import nltk 
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the SMS spam dataset into `df`. The file is decoded as ISO-8859-1
# via the `encoding` argument instead of pandas' default UTF-8.
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

# The path is relative, so 'spam.csv' is resolved against the kernel's working directory.


In [3]:
# Peek at the first rows to see the raw column layout before any cleanup.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Discard the three empty 'Unnamed' columns that come with the CSV.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [5]:
# Show the frame to confirm only the label (v1) and message (v2) columns remain.
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Give the two remaining columns descriptive names: v1 -> spam (label), v2 -> text (message).
df = df.rename(
    columns={
        'v1': 'spam',
        'v2': 'text',
    }
)


In [7]:
# Encode the label column as integers: ham -> 0, spam -> 1.
# Only the 'spam' column is touched; a frame-wide replace would also rewrite
# any message whose entire text is "ham" or "spam" into an int.
# Values that are already 0/1 pass through unchanged, so the cell can be
# re-run, and the explicit cast raises on any unexpected label instead of
# silently leaving a string in the target column.
label_to_int = {'ham': 0, 'spam': 1}
df['spam'] = (
    df['spam']
    .map(lambda label: label_to_int.get(label, label))
    .astype('int64')
)

In [8]:
# Show the frame again to check that the labels are now 0 (ham) / 1 (spam).
df

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [9]:
# Work on an independent copy of the label/message columns. Without .copy()
# the later `new_data['text'] = ...` assignments write to a slice of `df`
# and trigger pandas' SettingWithCopyWarning.
new_data = df[['spam', 'text']].copy()
new_data

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [10]:
# Count missing values per column before running the text cleaning steps.
new_data.isna().sum()

spam    0
text    0
dtype: int64

In [11]:
def clean_text(message):
    """Normalize one raw SMS message before tokenization.

    Removes ``@handle`` mentions and ``http...`` links, turns every remaining
    non-word character into a space, then collapses whitespace runs into a
    single space and trims both ends.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message (word characters separated by single spaces).
    """
    message = re.sub(r'@\w+', '', message)
    message = re.sub(r'http\S+', '', message)
    message = re.sub(r'\W', ' ', message)
    # `\W` has already turned every whitespace character into a plain space,
    # so a per-character `\s` -> ' ' substitution would change nothing.
    # Collapse the runs left behind by removed punctuation instead.
    return re.sub(r'\s+', ' ', message).strip()


new_data['text'] = new_data['text'].apply(clean_text)

In [12]:
# Fetch the NLTK resources used below: the stop-word lists and the Punkt
# tokenizer models that word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hiten\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hiten\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# NLTK's English stop-word list as a set; the filtering step below tests
# each token for membership in it.
stop_words = set(stopwords.words('english'))

In [14]:
def _tokenize_without_stop_words(message):
    """Lower-case and tokenize one message, keeping only tokens not in `stop_words`."""
    return [
        token
        for token in word_tokenize(message.lower())
        if token not in stop_words
    ]


# After this cell each entry of 'text' is a list of tokens, not a string.
new_data['text'] = new_data['text'].apply(_tokenize_without_stop_words)

In [15]:
# Inspect the token lists produced by the tokenization / stop-word step.
new_data

Unnamed: 0,spam,text
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, goes, usf, lives, around, though]"
...,...,...
5567,1,"[2nd, time, tried, 2, contact, u, u, å, 750, p..."
5568,0,"[ì_, b, going, esplanade, fr, home]"
5569,0,"[pity, mood, suggestions]"
5570,0,"[guy, bitching, acted, like, interested, buyin..."


In [16]:
# Glue each token list back into one space-separated string.
new_data['text'] = new_data['text'].apply(' '.join)

In [17]:
# Inspect the cleaned messages now that they are plain strings again.
new_data

Unnamed: 0,spam,text
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah think goes usf lives around though
...,...,...
5567,1,2nd time tried 2 contact u u å 750 pound prize...
5568,0,ì_ b going esplanade fr home
5569,0,pity mood suggestions
5570,0,guy bitching acted like interested buying some...


In [18]:
 X_train, X_test, y_train,y_test = train_test_split(new_data['text'], new_data['spam'], test_size=0.20, random_state=42)

In [19]:
# Class balance over the whole dataset (computed on `new_data`, not on the
# training split). Despite its name, `sentiment_counts` holds the number of
# rows per spam label (0 = ham, 1 = spam).
sentiment_counts = new_data['spam'].value_counts()
print(sentiment_counts)

0    4825
1     747
Name: spam, dtype: int64


# 5. Feature Extraction

In [20]:
# Turn the cleaned messages into TF-IDF feature matrices.
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
# NOTE(review): `X_text_vec` holds the TEST matrix (the name looks like a typo
# for `X_test_vec`); it is left as is because later cells refer to it.
X_text_vec = vectorizer.transform(X_test)


In [21]:
# Show the shape and sparsity summary of the vectorized test set.
X_text_vec

<1115x7578 sparse matrix of type '<class 'numpy.float64'>'
	with 8469 stored elements in Compressed Sparse Row format>

# 6. Model Training

In [22]:
# Linear-kernel SVM with the regularization strength C set to 1.0.
# SVC is already imported in the notebook's import cell, so it is not
# re-imported here.
svm_model = SVC(kernel='linear', C=1.0)

# Fit the classifier on the TF-IDF features of the training messages.
svm_model.fit(X_train_vec, y_train)


SVC(kernel='linear')

# 7. Model Evaluation

In [23]:
# Predict labels for the held-out messages with the trained SVM
# (`X_text_vec` is the vectorized test set).
y_pred = svm_model.predict(X_text_vec)

In [24]:
# Compare the predictions with the true test labels: overall accuracy first,
# then per-class precision / recall / F1 via classification_report.
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)
print(classification_report(y_test,y_pred))

Accuracy: 0.9802690582959641
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

