# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer) in classification of text documents

### Import all the necessary libraries

In [41]:
from pprint import pprint
from time import time
import logging
from gensim.models import KeyedVectors, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [30]:
#!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-11-08 03:25:11--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolviendo s3.amazonaws.com (s3.amazonaws.com)... 52.216.142.38
Conectando con s3.amazonaws.com (s3.amazonaws.com)[52.216.142.38]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 1647046227 (1.5G) [application/x-gzip]
Grabando a: «GoogleNews-vectors-negative300.bin.gz»


2021-11-08 03:27:43 (10.4 MB/s) - «GoogleNews-vectors-negative300.bin.gz» guardado [1647046227/1647046227]



In [2]:
#!gzip -d GoogleNews-vectors-negative300.bin.gz

In [3]:
w2v_model = KeyedVectors.load_word2vec_format("/Users/jjimenez/Documents/Estudio/College/George Brown/Semester I/ML-II/Course Notes/Module 3 - Text Classification/GoogleNews-vectors-negative300.bin", binary=True)

In [24]:
# Creating a feature vector by averaging all embeddings for all sentences 
def embedding_feats(list_of_lists):
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    feats = []
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSION)
        count_for_this = 0
        for token in tokens:
            if token in w2v_model:
                feat_for_this += w2v_model[token]
                count_for_this += 1
        feats.append(feat_for_this/count_for_this)
    return feats

### Choose a few categories for the entire 20 categories

In [7]:
# Load some categories from the training set
categories = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware']

In [8]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']


### Fetch documents for these 2 categories

In [9]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")

2823 documents
5 categories


In [42]:
test = fetch_20newsgroups(subset='test', categories=categories)
print(f"{len(test.filenames)} documents")
print(f"{len(test.target_names)} categories")

1879 documents
5 categories


In [10]:
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [28]:
# Training for w2v
train_vectors = embedding_feats(data.data)
print(len(train_vectors))

2823


## CountVectorizer

In [98]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data.data)
X_train_counts.shape

(2823, 66090)

In [99]:
count_vect.vocabulary_.get(u'algorithm')

15212

## TF-IDF

In [100]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2823, 66090)

In [102]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2823, 66090)

## Training a classifier

In [104]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, data.target)

In [109]:
docs_new = ['the Apple Ergo-Mouse', 'Alpha is WAY slow at 66 MHz']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, data.target_names[category]))

'the Apple Ergo-Mouse' => comp.sys.mac.hardware
'Alpha is WAY slow at 66 MHz' => comp.sys.ibm.pc.hardware


In [110]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [111]:
text_clf.fit(data.data, data.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [112]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8435337945715806

In [114]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='hinge', penalty='l2',
                      alpha=1e-3, random_state=42,
                      max_iter=5, tol=None)),
                      ])

text_clf.fit(data.data, data.target)

predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8531133581692389

In [115]:
from sklearn.linear_model import LogisticRegression
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression()),
])

text_clf.fit(data.data, data.target)

predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8435337945715806

In [116]:
from sklearn.tree import DecisionTreeClassifier
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', DecisionTreeClassifier()),
])

text_clf.fit(data.data, data.target)

predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.6599254922831294

## Doc2Vec

In [225]:
# Prepare training data in doc2vec format:
d2vtrain = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(data.data)]
# Train a doc2vec model to learn tweet representations
model = Doc2Vec(dm=1, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, alpha=0.025, min_alpha=0.001)
model.build_vocab(d2vtrain)
model.train(d2vtrain, total_examples=model.corpus_count, epochs=model.epochs)

In [226]:
d2vtest = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(data.target)]

In [227]:
def vector_for_learning(model, input_docs):
    sents = input_docs
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector([doc.words])) for doc in sents])
    return targets, feature_vectors

In [228]:
from sklearn import utils
train_documents  = utils.shuffle(d2vtrain)
model.train(train_documents,total_examples=len(train_documents), epochs=5)

In [229]:
test_documents = utils.shuffle(d2vtest)

In [230]:
y_train, X_train = vector_for_learning(model, train_documents)

In [None]:
y_test, X_test = vector_for_learning(model, test_documents)

In [None]:
logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=2000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [218]:
print('Accuracy: %s' % accuracy_score(y_test, y_pred))

Accuracy: 0.0


### Define a pipeline combining a text feature extractor with a simple classifier

In [128]:
# Change classifiers to Random Forest, Logistic Regression, Naive Bayes
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()) 
])

### Specify parameter grid
- 'vect__max_df': (0.5, 0.75, 1.0)
- 'vect__max_features': (None, 5000, 10000, 50000)
- 'vect__ngram_range': ((1, 1), (1, 2))
- 'tfidf__use_idf': (True, False)
- 'tfidf__norm': ('l1', 'l2')
- 'clf__max_iter': (20,)
- 'clf__alpha': (0.00001, 0.000001)
- 'clf__penalty': ('l2', 'elasticnet')
- 'clf__max_iter': (10, 50, 80)

In [136]:
parameters = {
    'tfidf__use_idf': (True, False),  
    'tfidf__sublinear_tf': (True, False),  
    'tfidf__norm': ('l1', 'l2'),  
    'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
    } 

### Find the best parameters for both the feature extraction and the classifier

### Build a GridSearch with the pipeline and parameter grid

In [137]:
grid = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

### Start the grid search

In [138]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 1e-05),
                         'tfidf__norm': ('l1', 'l2'),
                         'tfidf__sublinear_tf': (True, False),
                         'tfidf__use_idf': (True, False)},
             verbose=1)

### Best Score

In [140]:
print("Best score: %0.3f" % grid.best_score_)

Best score: 0.910


### Best Parameter

In [141]:
print("Best parameters set:")
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set:
	clf__alpha: 0.001
	tfidf__norm: 'l1'
	tfidf__sublinear_tf: True
	tfidf__use_idf: False


In [147]:
# Change classifiers to Random Forest, Logistic Regression, Naive Bayes
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge',  penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)) 
])

In [148]:
grid = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

In [149]:
grid.fit(data.data, data.target)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        SGDClassifier(alpha=0.001, max_iter=5,
                                                      random_state=42,
                                                      tol=None))]),
             n_jobs=-1,
             param_grid={'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 1e-05),
                         'tfidf__norm': ('l1', 'l2'),
                         'tfidf__sublinear_tf': (True, False),
                         'tfidf__use_idf': (True, False)},
             verbose=1)

In [150]:
print("Best score: %0.3f" % grid.best_score_)

Best score: 0.919


In [151]:
print("Best parameters set:")
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set:
	clf__alpha: 0.001
	tfidf__norm: 'l2'
	tfidf__sublinear_tf: True
	tfidf__use_idf: True


### Choose the best model

In [12]:
grid.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.75)),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(max_iter=20))])

### Use the model to classify a piece of text

In [152]:
docs_new = ['the Apple Ergo-Mouse', 'Alpha is WAY slow at 66 MHz']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, data.target_names[category]))

'the Apple Ergo-Mouse' => comp.sys.mac.hardware
'Alpha is WAY slow at 66 MHz' => comp.sys.ibm.pc.hardware
