In [2]:
import nltk
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import string


data = pd.read_csv("../Data/SMSSpamCollection.txt", sep="\t", header=None)
data.columns = ["label", "Content"]

en_stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()

data["Content_len"] = data["Content"].apply(lambda x: len(x) - x.count(" "))


def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation]
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation / (total), 4) * 100


data["punctuation_rate"] = data["Content"].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split(r"\W+", result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text


vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data["Content"])


all_data = pd.concat(
    [pd.DataFrame(vect_final.toarray()), data["Content_len"], data["punctuation_rate"]],
    axis=1,
)

In [3]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [5]:
alg_svm = svm.SVC()
params = {"kernel": ["linear", "rbf"], "gamma": [0.01, 0.001]}

all_data.columns = all_data.columns.astype(str)

hyper_params_grid = GridSearchCV(alg_svm, params, cv=4, n_jobs=-1)
hyper_params_models = hyper_params_grid.fit(all_data, data["label"])

In [6]:
pd.DataFrame(hyper_params_models.cv_results_).sort_values(
    "mean_test_score", ascending=False
)[0:6]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,398.312823,50.104848,12.526991,2.421152,0.01,linear,"{'gamma': 0.01, 'kernel': 'linear'}",0.985642,0.984207,0.979182,0.983489,0.98313,0.002408,1
2,270.121352,25.874294,8.195622,2.271578,0.001,linear,"{'gamma': 0.001, 'kernel': 'linear'}",0.985642,0.984207,0.979182,0.983489,0.98313,0.002408,1
1,193.088623,4.111605,41.896946,0.890931,0.01,rbf,"{'gamma': 0.01, 'kernel': 'rbf'}",0.901651,0.910266,0.902369,0.906676,0.90524,0.00348,3
3,44.522196,4.118259,22.698029,4.38126,0.001,rbf,"{'gamma': 0.001, 'kernel': 'rbf'}",0.898062,0.911701,0.897344,0.903805,0.902728,0.005754,4
