# Training SVM model to detect spam texts

In [16]:
import pandas as pd

df = pd.read_csv('spam.csv', delimiter=',', encoding='latin-1')

df.head(n=10)

X = df['Text'].values
y = df['Label'].values

In [17]:
# pre-processing stop words
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')


stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def pre_process(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(ps.stem(w))
    return " ".join(filtered_sentence)

X = [pre_process(text) for text in X]

[nltk_data] Downloading package stopwords to /home/viking/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/viking/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# vectorization
# TF-IDF (Term Frequency - Inverse Document Frequency)
# TF = (Number of times term t appears in a document)/(Number of terms in the document)
# IDF = log_e(Total number of documents/Number of documents with term t in it)
# TF-IDF = TF * IDF
# TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# split our data from training & testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
# train our model
from sklearn.svm import SVC

clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# test our model
y_pred = clf.predict(X_test)

In [20]:
# evaluate our model
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       949
        spam       0.99      0.84      0.91       166

    accuracy                           0.97      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.97      0.97      1115

0.9748878923766816
