# Spam Detector

## Imports

In [None]:
import numpy as np
import pandas as pd
import nltk, string
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Load the spam CSV from GitHub, keep only the two columns that are used,
# and give them descriptive names.
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/spam.csv"
data = (
    pd.read_csv(file_name, encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
data.head()

In [None]:
def review_messages(msg):
    """Return *msg* converted to lowercase."""
    lowered = msg.lower()
    return lowered

def alternative_review_messages(msg):
    """Normalise a message before it is vectorised.

    The message is lowercased, each space-separated token is lemmatized with
    WordNet (default part of speech), punctuation characters are deleted and
    English stopwords are dropped.

    Args:
        msg: Raw message text.

    Returns:
        The cleaned message as a single space-joined string.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once per call instead of re-evaluating
    # stopwords.words() for every token; set membership is also O(1).
    english_stopwords = set(stopwords.words('english'))

    # str.lower() returns a new string, so the result has to be rebound;
    # otherwise capitalised stopwords ("The", "You") survive the filter below.
    msg = msg.lower()

    # Lemmatize token by token. NOTE: punctuation is still attached at this
    # point, so a token such as "cars," reaches the lemmatizer unchanged.
    tokens = [lemmatizer.lemmatize(word) for word in msg.split(' ')]

    # Delete every punctuation character in a single pass.
    msg = " ".join(tokens).translate(str.maketrans('', '', string.punctuation))

    # Drop stopwords and rebuild the message.
    kept = [word for word in msg.split(' ') if word not in english_stopwords]
    return " ".join(kept)

In [None]:
# Run every message through the cleaning function and overwrite the raw
# text column with the cleaned version.
cleaned_text = data['text'].apply(alternative_review_messages)
data['text'] = cleaned_text
data.head()

In [None]:
# Hold out 30% of the messages for testing.
# NOTE: no random_state is passed, so the split differs on every run.
x = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Fit the vectorizer on the training messages only, so that the vocabulary
# and IDF weights are not influenced by the held-out test set (fitting on
# the full corpus leaks test information into the features).
Tfidf_vect = TfidfVectorizer(max_features=5000)
Train_X_Tfidf = Tfidf_vect.fit_transform(X_train)

# The test messages are only transformed with the already-fitted vectorizer.
Test_X_Tfidf = Tfidf_vect.transform(X_test)

In [None]:
# Build a linear-kernel support vector classifier and fit it on the
# TF-IDF features of the training split.
SVM = svm.SVC(
    kernel='linear',
    C=1.5,
    tol=0.1,
)

SVM.fit(X=Train_X_Tfidf, y=y_train)

In [None]:
# Evaluate the fitted classifier on the held-out test split.
y_pred = SVM.predict(Test_X_Tfidf)

print(f"Matrice de confusion: \n{confusion_matrix(y_test, y_pred)}\n")
# The value printed under the "Precision" label is the overall accuracy.
# Formatting with ".1%" avoids float artefacts such as 98.30000000000001
# that multiplying a rounded float by 100 can produce.
print(f"Precision: {accuracy_score(y_test, y_pred):.1%}\n")
print(classification_report(y_test, y_pred))

In [None]:
# Evaluate the fitted classifier on messages loaded from a second file.
# NOTE(review): if SMSSpamCollection.txt contains the same messages as the
# CSV used for training, this is not an evaluation on unseen data; confirm.

# The file is tab-separated with no header row. It is read once with pandas;
# a separate manual open()/read() is unnecessary because its result would be
# overwritten immediately.
# NOTE: this rebinds the global `data` used by the earlier cells.
data = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['label', 'text'])
data['text'] = data['text'].apply(alternative_review_messages)

X_new = data['text']
y_new = data['label']

# Reuse the already-fitted vectorizer: transform only, no refit.
New_X_Tfidf = Tfidf_vect.transform(X_new)

y_pred = SVM.predict(New_X_Tfidf)

print(f"Matrice de confusion: \n{confusion_matrix(y_new, y_pred)}\n")
# The value printed under the "Precision" label is the overall accuracy.
# Formatting with ".1%" avoids float artefacts such as 98.30000000000001
# that multiplying a rounded float by 100 can produce.
print(f"Precision: {accuracy_score(y_new, y_pred):.1%}\n")
print(classification_report(y_new, y_pred))