In [1]:
import sklearn

from nltk.corpus import reuters
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
label_binarizer = MultiLabelBinarizer()
stop_words = stopwords.words("english")
count_vectorizer = CountVectorizer(stop_words = stop_words)
hashing_vectorizer = HashingVectorizer(stop_words = stop_words, alternate_sign = False)
tfidf_vectorizer = TfidfVectorizer(stop_words = stop_words)

In [3]:
doc_id = reuters.fileids()
train_id = [d for d in doc_id if d.startswith('training/')]
test_id = [d for d in doc_id if d.startswith('test/')]
    
train_data = [reuters.raw(doc_id) for doc_id in train_id]
test_data = [reuters.raw(doc_id) for doc_id in test_id]

train_label = [reuters.categories(doc_id) for doc_id in train_id]
test_label = [reuters.categories(doc_id) for doc_id in test_id]

train_label = label_binarizer.fit_transform(train_label)
test_label = label_binarizer.transform(test_label)

In [4]:
train_hashing = hashing_vectorizer.fit_transform(train_data)
test_hashing = hashing_vectorizer.transform(test_data)

train_tfidf = tfidf_vectorizer.fit_transform(train_data)
test_tfidf = tfidf_vectorizer.transform(test_data)

train_bow = count_vectorizer.fit_transform(train_data)
test_bow = count_vectorizer.transform(test_data)

In [5]:
KNN_hashing = KNeighborsClassifier()
KNN_hashing.fit(train_hashing, train_label)

DT_hashing = DecisionTreeClassifier()
DT_hashing.fit(train_hashing, train_label)

print 'KNN: ', KNN_hashing.score(test_hashing, test_label)
print 'DT: ', DT_hashing.score(test_hashing, test_label)

KNN:  0.7416363034117257
DT:  0.7502484266313348


In [6]:
KNN_tfidf = KNeighborsClassifier()
KNN_tfidf.fit(train_tfidf, train_label)

DT_tfidf = DecisionTreeClassifier()
DT_tfidf.fit(train_tfidf, train_label)

print 'KNN: ', KNN_tfidf.score(test_tfidf, test_label)
print 'DT: ', DT_tfidf.score(test_tfidf, test_label)

KNN:  0.7303742961245445
DT:  0.7360052997681351


In [7]:
KNN_bow = KNeighborsClassifier()
KNN_bow.fit(train_bow, train_label)

DT_bow = DecisionTreeClassifier()
DT_bow.fit(train_bow, train_label)

print 'KNN: ', KNN_tfidf.score(test_bow, test_label)
print 'DT: ', DT_tfidf.score(test_bow, test_label)

KNN:  0.7257369990062935
DT:  0.665120900960583
