In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 500)

In [11]:
ds = pd.read_csv("SMSSpamCollection.txt", delimiter="\t", names=("is_spam", "text"))
ds['is_spam'].replace({'spam': 1, 'ham': 0}, inplace=True)

In [12]:
ds.head()

Unnamed: 0,is_spam,text
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


In [13]:
ds.is_spam.value_counts()

0    4825
1     747
Name: is_spam, dtype: int64

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(ds.text)
y = ds.is_spam

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

cls = LogisticRegression()
res = cross_val_score(cls, X, y, scoring="f1", cv=10)
print(np.mean(res))

0.932640298361


In [18]:
model = cls.fit(X, y)

In [19]:
messages = ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
"FreeMsg: Txt: claim your reward of 3 hours talk time",
"Have you visited the last lecture on physics?",
"Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
"Only 99$"]

In [23]:
predictions = model.predict(vectorizer.transform(messages))
print(' '.join(map(str, predictions)))

1 1 0 0 0


In [26]:
ngram_ranges = ((2,2), (3,3), (1,3))
for ngram_range in ngram_ranges:
    res = cross_val_score(cls, CountVectorizer(ngram_range=ngram_range).fit_transform(ds.text), y, scoring="f1", cv=10)
    print('{0:.2f}'.format(np.mean(res)), end=' ', )

0.82 0.73 0.93 

In [27]:
from sklearn.naive_bayes import MultinomialNB
mb_clf = MultinomialNB()
for ngram_range in ngram_ranges:
    res = cross_val_score(mb_clf, CountVectorizer(ngram_range=ngram_range).fit_transform(ds.text), y, scoring="f1", cv=10)
    print('{0:.2f}'.format(np.mean(res)), end=' ', )

0.65 0.38 0.89 

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
res = cross_val_score(cls, TfidfVectorizer().fit_transform(ds.text), y, scoring="f1", cv=10)
print(np.mean(res))

0.852859955417


In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords

tfidf_svm_pipeline = Pipeline([
        ('Vectorizer', TfidfVectorizer()),
        ('Classifier', LogisticRegression())
    ])

sw = stopwords.words("english")
params = {
    'Vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4)],
    'Vectorizer__stop_words' : [None, sw[:10], sw[:50], sw[:100], sw],
    'Classifier__C': [500, 1000, 10000, 15000, 20000],
    'Classifier__penalty': ['l1', 'l2']
}
clf_cv = GridSearchCV(tfidf_svm_pipeline, params, cv = 10, n_jobs=4, verbose=1, scoring="f1")
clf_ = clf_cv.fit(ds.text, y)

Fitting 10 folds for each of 150 candidates, totalling 1500 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  7.2min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 13.4min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 21.7min
[Parallel(n_jobs=4)]: Done 1500 out of 1500 | elapsed: 26.3min finished


In [45]:
print(clf_.best_params_)
print(clf_.best_score_)

{'Vectorizer__ngram_range': (1, 3), 'Classifier__C': 500, 'Classifier__penalty': 'l2', 'Vectorizer__stop_words': None}
0.953307716972
