In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['label', 'text'])
data.label = data.label.apply(lambda x : 1 if x == 'spam' else 0)

In [3]:
data.describe()

Unnamed: 0,label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [4]:
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
vectorizer = CountVectorizer()
y_true = data.label
X = vectorizer.fit_transform(data.text.values)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

In [7]:
log_reg_clf = LogisticRegression()
cv_score = cross_val_score(log_reg_clf, X, y_true, cv=10, scoring='f1')
print (cv_score)
print (np.mean(cv_score))

[ 0.95890411  0.89855072  0.91549296  0.95833333  0.93706294  0.91304348
  0.94444444  0.92753623  0.92198582  0.95104895]
0.932640298361


In [8]:
test_text = [
"FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours\
 talk time to use from your phone now! Subscribe6GB",
"FreeMsg: Txt: claim your reward of 3 hours talk time",
"Have you visited the last lecture on physics?",
"Have you visited the last lecture on physics? Just buy this book\
 and you will have all materials! Only 99$",
"Only 99$"]

In [9]:
log_reg_clf.fit(X, y_true)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
test_text = vectorizer.transform(test_text)

In [11]:
y_pred = log_reg_clf.predict(test_text)

In [12]:
print (*y_pred)

1 1 0 0 0


In [13]:
for ngramm in [(2,2), (3,3), (1,3)]:
    vectorizer.set_params(ngram_range=ngramm)
    X = vectorizer.fit_transform(data.text.values)
    cv_score = np.mean(cross_val_score(log_reg_clf, X, y_true, cv=10, scoring='f1'))
    print (np.mean(cv_score))

0.822422066419
0.725016155547
0.925138255865


In [14]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
nb_clf = MultinomialNB()
for ngramm in [(2,2), (3,3), (1,3)]:
    vectorizer.set_params(ngram_range=ngramm)
    X = vectorizer.fit_transform(data.text.values)
    cv_score = np.mean(cross_val_score(nb_clf, X, y_true, cv=10, scoring='f1'))
    print (np.mean(cv_score))

0.645501517799
0.378719485246
0.888485965606


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
tf_idf = TfidfVectorizer(ngram_range=(1,1))
X = tf_idf.fit_transform(data.text.values)
cv_score = np.mean(cross_val_score(log_reg_clf, X, y_true, cv=10, scoring='f1'))
print (np.mean(cv_score))

0.852859955417


# Повышение качества на cv

In [18]:
from sklearn.grid_search import GridSearchCV

In [19]:
gs_params = {
    'penalty' : ['l1', 'l2'],
    'C' : [0.1, 1., 10., 100.],
    'fit_intercept' : [0, 1]
}
clf = LogisticRegression(max_iter=1000, random_state=427, n_jobs=-1, class_weight='balanced')

In [20]:
gs = GridSearchCV(clf, gs_params, scoring='f1', cv=5)

In [21]:
vectorizer = CountVectorizer(ngram_range=(1,3), stop_words='english')
y_true = data.label
X = vectorizer.fit_transform(data.text.values)

In [22]:
gs.fit(X, y_true)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=427,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0], 'fit_intercept': [0, 1]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [23]:
gs.grid_scores_

[mean: 0.83156, std: 0.02212, params: {'penalty': 'l1', 'C': 0.1, 'fit_intercept': 0},
 mean: 0.88337, std: 0.01090, params: {'penalty': 'l2', 'C': 0.1, 'fit_intercept': 0},
 mean: 0.87787, std: 0.01175, params: {'penalty': 'l1', 'C': 0.1, 'fit_intercept': 1},
 mean: 0.91963, std: 0.01566, params: {'penalty': 'l2', 'C': 0.1, 'fit_intercept': 1},
 mean: 0.88028, std: 0.01165, params: {'penalty': 'l1', 'C': 1.0, 'fit_intercept': 0},
 mean: 0.90772, std: 0.00960, params: {'penalty': 'l2', 'C': 1.0, 'fit_intercept': 0},
 mean: 0.92579, std: 0.01025, params: {'penalty': 'l1', 'C': 1.0, 'fit_intercept': 1},
 mean: 0.92143, std: 0.01380, params: {'penalty': 'l2', 'C': 1.0, 'fit_intercept': 1},
 mean: 0.90034, std: 0.01317, params: {'penalty': 'l1', 'C': 10.0, 'fit_intercept': 0},
 mean: 0.89486, std: 0.00939, params: {'penalty': 'l2', 'C': 10.0, 'fit_intercept': 0},
 mean: 0.92421, std: 0.00545, params: {'penalty': 'l1', 'C': 10.0, 'fit_intercept': 1},
 mean: 0.91729, std: 0.01436, params: {'

In [24]:
gs.best_params_, gs.best_score_

({'C': 100.0, 'fit_intercept': 1, 'penalty': 'l1'}, 0.93704928903258045)

# Выводы

В задачах, связанных с распознаваниями текстов, лучше делать некую фильтрацию признаков (благодаря этому мы сможем избавиться от очень частых или очень редких слов, например). Если использовать в качестве признаков не сами слова, а n-граммы, это дает как правило лучшее качество, т.к. изначально мы имеем больше информации о том, какие слова встречаются рядом, например.