# Application - Working with Text

Goal: Assign each document to the appropriate newsgroup by clustering a bag-of-words-style representation of its text.

We expect certain words to capture the semantic properties of a document. Word frequency is one way of quantifying this. For example, a document about the Python programming language might contain words like 'class', 'def', and 'init', whereas a document about snakes might contain words like 'afraid', 'slither', or 'bite'.

In [3]:
import nltk
import numpy as np
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer


def _known_names():
    """
    obj: return the NLTK names corpus as a frozenset, loading it only once
    """
    cached = getattr(_known_names, '_cache', None)
    if cached is None:
        # Reading the corpus is expensive; keep the set on the function
        # object so that every later call reuses it.
        cached = frozenset(names.words())
        _known_names._cache = cached
    return cached


def lemmatize_and_clean(doc):
    """
    obj: tokenize, filter, and lemmatize document

    Tokens that are not purely alphabetic, or that appear in the NLTK names
    corpus, are dropped. The remaining tokens are lowercased, lemmatized, and
    joined back together with single spaces.

    doc: raw document text (str)
    returns: the cleaned document as one space-separated string
    """
    known_names = _known_names()
    lemmatizer = WordNetLemmatizer()
    # The names corpus stores capitalized entries (e.g. 'John'), so the name
    # check must run on the original token *before* it is lowercased;
    # lowercasing first would make the filter a no-op.
    kept = [
        token.lower()
        for token in nltk.tokenize.word_tokenize(doc)
        if token.isalpha() and token not in known_names
    ]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)


# One seed shared by every stochastic step so that reruns print the same
# scores. The shuffle below already used 42; the SVD solver, the k-means
# initialisation and the silhouette subsample were previously unseeded.
RANDOM_STATE = 42

# fetch and load the data
dataset = fetch_20newsgroups(subset='all', shuffle=True,
                             random_state=RANDOM_STATE)
labels = dataset.target

# we happen to know the true labels in advance
true_num_clusters = np.unique(labels).shape[0]

# data cleaning and preprocessing
# NOTE: `dataset` is rebound here from the fetched bunch to a plain list of
# cleaned strings; `labels` was captured above, before the rebinding.
dataset = [lemmatize_and_clean(post) for post in dataset.data]

# transform text into numerical features and run dimensionality reduction
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
# NOTE(review): TruncatedSVD is left at its default number of components,
# which is very low for separating this many clusters -- confirm this is
# intended rather than an omitted n_components argument.
svd = TruncatedSVD(random_state=RANDOM_STATE)
normalizer = Normalizer(copy=False)
pl = make_pipeline(vectorizer, svd, normalizer)
X = pl.fit_transform(dataset)

km = KMeans(n_clusters=true_num_clusters, random_state=RANDOM_STATE)
km.fit(X)

# Compare the cluster assignments with the known newsgroup labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette score is computed on a random subsample of 1000 points, so
# the sampling is seeded as well.
print("Silhouette Score: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000,
                                 random_state=RANDOM_STATE))
