In [2]:
import pandas as pd
import numpy as np

## 1. Reading the data

In [4]:
# Location of the tab-separated SMS dataset, relative to this notebook.
path = '../TextFiles/smsspamcollection.tsv'
# sep='\t' because the file is TSV, not comma-separated.
df = pd.read_csv(path, sep='\t')
# Preview the first rows (columns: label, message, length, punct).
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
# Summary statistics for the numeric columns (length, punct).
df.describe()

Unnamed: 0,length,punct
count,5572.0,5572.0
mean,80.48995,4.177495
std,59.942907,4.623919
min,2.0,0.0
25%,36.0,2.0
50%,62.0,3.0
75%,122.0,6.0
max,910.0,133.0


In [8]:
# Count missing values per column to confirm no imputation/dropping is needed.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [9]:
# Class balance of the target: how many messages carry each label.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
# (rows, columns) of the loaded frame.
df.shape

## 2. Preprocessing

In [38]:
import spacy

# Load the English model by its full package name: the 'en' shortcut link was
# deprecated in spaCy 2.x and removed in 3.0, where spacy.load('en') fails.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (the default target of
# `python -m spacy download en`) — confirm if a larger model was linked.
# The parser and NER are disabled because the preprocessing below only reads
# lemma_, is_punct and is_stop; skipping them makes each call much cheaper.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def TextProcessor(atext):
    """Normalise a raw message into a space-separated string of lemmas.

    The text is lower-cased, run through the module-level spaCy pipeline
    ``nlp``, and every token flagged as punctuation or as a stop word is
    dropped. The lemmas of the remaining tokens are joined with single spaces.

    Parameters
    ----------
    atext : str
        Raw message text.

    Returns
    -------
    str
        Lemmas of the kept tokens, in original order, separated by spaces.
    """
    kept_lemmas = []
    for tok in nlp(atext.lower()):
        # Skip punctuation and stop words; keep everything else as its lemma.
        if tok.is_punct or tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Sanity check on a sample sentence: punctuation and stop words should vanish,
# leaving only the lower-cased lemmas of the remaining tokens.
testtext = "Hello, this is Jay! What are you doing"
TextProcessor(testtext)

'hello jay'

## 3. Extracting TF-IDF features and Model training

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, make_scorer, confusion_matrix
# The pipeline defined below removes stop words, tokenizes the text, and extracts TF-IDF features.

# Hold out 20% of the messages for testing; random_state pins the split so the
# same rows land in train/test on every run.
xtrain,xtest, ytrain, ytest = train_test_split(list(df['message']),list(df['label']), test_size=0.2, random_state= 42)

# Text -> TF-IDF features -> linear SVM.
# Supplying `preprocessor` replaces TfidfVectorizer's built-in preprocessing
# stage (lower-casing / accent stripping), so TextProcessor is responsible for
# all normalisation before tokenisation.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=TextProcessor) ),
    # LinearSVC's solver shuffles data internally; without a seed the fitted
    # model (and therefore the reported metrics) can differ between runs.
    ('clf', LinearSVC(random_state=42))
])

In [66]:
# Peek at the first training label to confirm targets are plain label strings.
ytrain[0]

'spam'

In [67]:
# Baseline: train with default hyper-parameters, then predict the held-out set.
# Pipeline.fit returns the fitted pipeline itself, so predict can be chained.
ypred = pipeline.fit(xtrain, ytrain).predict(xtest)

# True labels vs. predictions on the test split; shown as the cell output.
confusion_matrix(y_true=ytest, y_pred=ypred)

  if LooseVersion(joblib_version) < '0.12':


array([[964,   2],
       [  9, 140]], dtype=int64)

In [69]:
# Support-weighted F1 of the baseline model on the test split
# (per-class F1 averaged by class frequency).
test_f1 = f1_score(y_true=ytest, y_pred=ypred, average='weighted')
test_f1

0.9900336799911834

In [52]:
# Hyper-parameter grid. Keys follow Pipeline's `<step>__<param>` convention and
# every key must name a real parameter of that step's estimator.
# The `clf` step is a LinearSVC, which has no `kernel` or `gamma` (those belong
# to SVC); keeping them makes GridSearchCV.fit fail with "Invalid parameter",
# so only the regularisation strength C is searched for the classifier.
param_grid = {
    'tfidf__ngram_range': [(1,2)],
    'clf__C': [0.1,1,10],
}

# Score candidates with the same weighted F1 used for the baseline evaluation.
scorer = make_scorer(f1_score, average = 'weighted')
gridSearch = GridSearchCV(pipeline, param_grid, cv=5, scoring= scorer, verbose= 1)

# 3 candidates x 5 folds = 15 fits; each fit re-runs the spaCy preprocessing on
# its training fold, so this cell is the slow one in the notebook.
gridSearch.fit(xtrain,ytrain)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  if LooseVersion(joblib_version) < '0.12':
  'precision', 'predicted', average, warn_for)
  if LooseVersion(joblib_version) < '0.12':
  if LooseVersion(joblib_version) < '0.12':
  if LooseVersion(joblib_version) < '0.12':


KeyboardInterrupt: 

In [None]:
# Evaluate the tuned model on the held-out test split.
# NOTE: this overwrites `ypred` and `test_f1` from the baseline cells above, so
# after running this cell those names refer to the grid-searched model.
# best_estimator_ is presumably the pipeline refit on all of xtrain with the
# best-scoring parameters (GridSearchCV's default refit behaviour) — confirm.
best_model = gridSearch.best_estimator_
ypred = best_model.predict(xtest)
test_f1 = f1_score(ytest,ypred, average= 'weighted')

In [None]:
# Display the most recently computed weighted F1 on the test split.
test_f1

In [None]:
# confusion_matrix is already imported in the feature-extraction cell; this
# re-import is redundant but harmless and lets the cell run on its own.
from sklearn.metrics import confusion_matrix
# True labels vs. the current `ypred` on the test split.
confusion_matrix(ytest,ypred)