## Import Libraries

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

## Data Pre-processing

In [4]:
# Load the data set into a pandas DataFrame.
# The CSV location can be overridden through the SPAMHAM_CSV environment
# variable so the notebook also runs on machines that lack the original
# author's directory layout; when the variable is unset the original path
# is used unchanged.
DEFAULT_CSV_PATH = 'C:/Users/HP/Desktop/ML and AI/Perfect_Plan_B/spamham.csv'
raw_mail_data = pd.read_csv(os.environ.get('SPAMHAM_CSV', DEFAULT_CSV_PATH))

In [5]:
# Preview the first 10 rows of the loaded DataFrame
raw_mail_data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
# Check the dimensions of the DataFrame as a (rows, columns) tuple
raw_mail_data.shape

(5572, 2)

In [9]:
# Count the missing values in each column
raw_mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [10]:
# Encode the target class as integers: spam -> 0, ham -> 1.
# The whole column is rebuilt in a single assignment instead of writing
# integers into a text column through masked .loc updates; this does not
# depend on the column dtype accepting mixed str/int values and yields a
# numeric column when every label is recognised.
# Values missing from the mapping (including labels already encoded by an
# earlier run of this cell) pass through unchanged, so the cell can be re-run.
label_codes = {'spam': 0, 'ham': 1}
raw_mail_data['Category'] = raw_mail_data['Category'].map(
    lambda label: label_codes.get(label, label)
)

In [11]:
# Preview the first rows again to inspect the encoded Category column
raw_mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Separate the frame into the message text (x) and its label (y)
x, y = raw_mail_data['Message'], raw_mail_data['Category']

## Train_test_split

In [13]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split identical from run to run
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=3,
)

## Feature Extraction

In [14]:
# Cast the labels to integers (independent of the vectorisation below)
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# Turn the raw text into TF-IDF feature vectors usable as input to the SVM model.
# English stop words are dropped; lowercasing is the vectorizer's default and
# is spelled out here so the intent is visible.
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english')

# Fit on the training text only, then reuse the fitted vectorizer for the test text
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

## Building the Model

In [15]:
# Train a linear support-vector classifier on the TF-IDF features.
# LinearSVC's solver uses a pseudo-random number generator, so without a
# fixed seed repeated runs can produce slightly different models; the seed
# is pinned to 3, the same value used for the train/test split.
model = LinearSVC(random_state=3)
model.fit(x_train_features, y_train)

LinearSVC()

## Evaluation of the Model

In [17]:
# Accuracy on the same data the model was fitted on
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9993269015032533


In [18]:
# Accuracy on the held-out test data
prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9820627802690582


In [21]:
# Per-class precision, recall and F1 on the held-out test set.
# classification_report is imported with the other libraries at the top of
# the notebook rather than in this late cell.
# The heading and the report are joined into one string: passing them as
# separate print() arguments inserts a separator space in front of the report
# and shifts its header row one column out of alignment with the value rows.
test_report = classification_report(y_test, prediction_on_test_data)
print('Classification report : \n\n' + test_report)

Classification report : 

               precision    recall  f1-score   support

           0       0.99      0.88      0.93       155
           1       0.98      1.00      0.99       960

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

