In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 500)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Задание 1. Классификация	текстов:	спам-фильтр	для	SMS.

### 1,2)

In [2]:
df = pd.read_csv('smsspamcollection/SMSSpamCollection.txt', delimiter='\t', names = ['label', 'sms'])
df

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


### 3)

In [3]:
X = df.sms
Y = np.array([1 if e == "spam" else 0 for e in df.label])

In [4]:
print("spam : %d, non-spam : %d" % (len(Y[Y==1]), len(Y[Y==0])))

spam : 747, non-spam : 4825


### 4)

In [5]:
vectorizer = CountVectorizer()#ngram_range=(1,2))
X = vectorizer.fit_transform(X)

### 5)

In [6]:
clf = LogisticRegression()
res = cross_val_score(clf, X, Y, scoring="f1", cv=10)
print("mean f1-score: ", np.mean(res))
print("std f1-score:", np.std(res))

mean f1-score:  0.932640298361
std f1-score: 0.0195638214575


### 6)

In [7]:
clf.fit(X, Y)
sms_to_predict = ["FreeMsg: Txt: CALL to  No: 86888 & claim your reward of 3 hours talk time to use from your phone   now! Subscribe6GB",
"FreeMsg: Txt: claim your reward of 3 hours talk time",
"Have you visited the last lecture on physics?",
"Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
"Only 99$"]
for prediction in zip(sms_to_predict, clf.predict(vectorizer.transform(sms_to_predict))):
    print(*prediction)

FreeMsg: Txt: CALL to  No: 86888 & claim your reward of 3 hours talk time to use from your phone   now! Subscribe6GB 1
FreeMsg: Txt: claim your reward of 3 hours talk time 1
Have you visited the last lecture on physics? 0
Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$ 0
Only 99$ 0


### 7)

In [8]:
clf = LogisticRegression()
for n_gram_range in [(1,1), (2,2), (3,3), (1,3)]:
    vectorizer = CountVectorizer(ngram_range = n_gram_range)
    X = df.sms
    X = vectorizer.fit_transform(X)
    res = cross_val_score(clf, X, Y, scoring="f1", cv=10)
    print("ngram range: ", n_gram_range)
    print("mean f1-score: ", np.mean(res))
    print("std f1-score:", np.std(res), "\n")

ngram range:  (1, 1)
mean f1-score:  0.932640298361
std f1-score: 0.0195638214575 

ngram range:  (2, 2)
mean f1-score:  0.822422066419
std f1-score: 0.0256477437263 

ngram range:  (3, 3)
mean f1-score:  0.725016155547
std f1-score: 0.0172974110334 

ngram range:  (1, 3)
mean f1-score:  0.925138255865
std f1-score: 0.0176059719963 



### 8)

In [9]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
for n_gram_range in [(1,1), (2,2), (3,3), (1,3)]:
    vectorizer = CountVectorizer(ngram_range = n_gram_range)
    X = df.sms
    X = vectorizer.fit_transform(X)
    res = cross_val_score(clf, X, Y, scoring="f1", cv=10)
    print("ngram range: ", n_gram_range)
    print("mean f1-score: ", np.mean(res))
    print("std f1-score:", np.std(res), "\n")

ngram range:  (1, 1)
mean f1-score:  0.927730355685
std f1-score: 0.0173219255889 

ngram range:  (2, 2)
mean f1-score:  0.645501517799
std f1-score: 0.0207880972743 

ngram range:  (3, 3)
mean f1-score:  0.378719485246
std f1-score: 0.0094506008815 

ngram range:  (1, 3)
mean f1-score:  0.888485965606
std f1-score: 0.0228955703578 



### 9)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.sms)
clf = LogisticRegression()
res = cross_val_score(clf, X, Y, scoring="f1", cv=10)
print("mean f1-score: ", np.mean(res))
print("std f1-score:", np.std(res))

mean f1-score:  0.852859955417
std f1-score: 0.0238364215221


### 10)

In [11]:
from stemming.porter2 import stem
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
import re

Применим стемминг (который, на самом деле, почти не влияет на качество здесь), предварительно заменив числа любого вида на 'number', а последовательность знаков $ на 'dollar'.

In [12]:
def stem_sms(sms):
    sms = re.sub('[0-9]+', 'number', sms)
    sms = re.sub('[$]+', ' dollar ', sms)
    sms = stem(sms)
    return sms

stemmed_X = list()
for sms in list(df.sms):
    stemmed_X.append(stem_sms(sms))
stemmed_X = np.array(stemmed_X)

Будем игнорировать стоп-слова. (это тоже почти не влияет на качество)

In [13]:
vectorizer = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1))
X = vectorizer.fit_transform(stemmed_X)

Найдем лучшее параметры и соответствующее качество, с помощью кросс-валидации. 

In [14]:
clf = LogisticRegression(max_iter=-1)
params = {'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'C' : [0.01, 0.1, 0.5, 1, 5, 10, 100, 200, 500, 1000, 10000, 15000, 20000, 100000, 200000, 500000, 1000000]}
clf = GridSearchCV(LogisticRegression(), params, cv=10, verbose=1, scoring="f1", n_jobs=-1)
clf.fit(X, Y)
print("baseline in 5th paragraph: 0.932640298361")
print("best score:", clf.best_score_)
print("best_params:", clf.best_params_)

Fitting 10 folds for each of 68 candidates, totalling 680 fits


[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 350 tasks      | elapsed:   26.7s


baseline in 5th paragraph: 0.932640298361
best score: 0.955120304834
best_params: {'C': 1000000, 'solver': 'lbfgs'}


[Parallel(n_jobs=-1)]: Done 680 out of 680 | elapsed:   55.9s finished


Удалось немного улучшить качество, основоной вклад в улучшение внесла замена классов слов на одно слово. Но все же на примерах из 6-ого пункта результат тот же:

In [16]:
best_clf = clf.best_estimator_
best_clf.fit(X, Y)
for sms in sms_to_predict:
    print(sms, best_clf.predict(vectorizer.transform([stem_sms(sms)])))

FreeMsg: Txt: CALL to  No: 86888 & claim your reward of 3 hours talk time to use from your phone   now! Subscribe6GB [1]
FreeMsg: Txt: claim your reward of 3 hours talk time [1]
Have you visited the last lecture on physics? [0]
Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$ [0]
Only 99$ [0]


### 11)  Выводы

Основной посыл, которое несет это задание, на мой взгляд, что не всегда более сложный подход ведет к лучшему качеству, бывает что наоборот. Иногда для улучшения качества необходимо исходить из конкретной задачи и придумывать что-то свое, какую-нибудь эвристику. Ну и нужно не забывать о подборе гиперпараметров.