# Document Clustering by Topic

In [77]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from nltk.stem.snowball import SnowballStemmer
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from __future__ import print_function

import matplotlib.pyplot as plt
import matplotlib as mpl
from nltk.tag import pos_tag
from sklearn.manifold import MDS
from sklearn.externals import joblib

In [78]:
# Directory that holds one plain-text (.txt) file per novel
path = "./Novels"

# Titles (file names without the extension) and full text of every novel.
# The two lists are filled in lock-step, so position k in one matches position k in the other.
textName = []
textContent = []

# os.listdir returns entries in arbitrary order; sorting keeps the document order
# (and therefore every row index used later) identical from run to run.
for filename in sorted(os.listdir(path)):
    filepath = os.path.join(path, filename)

    # Skip sub-directories and stray non-text entries (e.g. .DS_Store, .ipynb_checkpoints)
    if not os.path.isfile(filepath) or not filename.lower().endswith(".txt"):
        continue

    # Read the whole novel into memory
    with open(filepath, "r") as novel_file:
        fileContent = novel_file.read()

    # Record title and content only after a successful read so the lists cannot drift apart.
    # splitext drops the extension without assuming it is exactly four characters long.
    textName.append(os.path.splitext(filename)[0])
    textContent.append(fileContent)
    

In [79]:
# Sanity check: number of titles collected from the novels directory
len(textName)

24

In [80]:
# Sanity check: number of texts read; should equal the number of titles
len(textContent)

24

In [81]:
# Language resources used by the tokenizing helpers

# Snowball stemmer (English) that reduces each word to its root form
stemmer = SnowballStemmer("english")

# NLTK's English stop-word list, stored in a variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')


In [82]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return the stem of each one.

    The text is tokenized by sentence first and then by word, so punctuation
    ends up as its own token. Tokens that contain no ASCII letter (numbers,
    raw punctuation) are dropped; every remaining token is passed through the
    module-level ``stemmer``. Reducing words to their stems makes things
    simpler for the algorithm that consumes them later.

    Returns a list of stems in document order.
    """
    has_letter = re.compile('[a-zA-Z]').search

    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Keep only tokens with at least one letter, stemmed to their root
            if has_letter(token):
                stems.append(stemmer.stem(token))
    return stems

def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens without stemming them.

    Applies the same sentence-then-word tokenization and the same
    "must contain a letter" filter as ``tokenize_and_stem``, so the result can
    be used to translate stemmed words back to their original form.

    Returns a list of lower-cased tokens in document order.
    """
    # Tokenize by sentence, then by word, so punctuation becomes its own token
    lowered = (
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
    )

    # Drop tokens without any letter (e.g. numeric tokens, raw punctuation)
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [83]:
# Corpus-wide token lists: stems in one, the matching unstemmed tokens in the other
text_stemmed, text_tokenized = [], []

# Walk through every novel and grow both lists
for work in textContent:
    # Stemmed and plain (unstemmed) tokens of this novel
    alltext_stemmed = tokenize_and_stem(work)
    alltext_tokenized = tokenize_only(work)

    # Append this novel's tokens to the corpus-wide lists
    text_stemmed += alltext_stemmed
    text_tokenized += alltext_tokenized

In [84]:
# Lookup table from stem to original word: the index holds the stemmed tokens and the
# single 'words' column holds the matching unstemmed tokens, so that words such as
# "happened" and "happening" are all found under the same index label "happen"
vocab_frame = pd.DataFrame(text_tokenized, index=text_stemmed, columns=['words'])

In [109]:
# max_df: max cut off for how frequent a term appears in the collection
# min_df: min start point to be considered a feature
# max_features: maximum amount of features that can exist
# Tokenizer: gave the previously defined tokenizer and stem program
# ngram_range: Declare that I want to look at unigrams, bigrams and trigrams

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(textContent)

print(tfidf_matrix.shape)

CPU times: user 2min 18s, sys: 172 ms, total: 2min 18s
Wall time: 2min 18s
(24, 28124)


In [110]:
# Feature names (terms) learned by the vectorizer
terms = tfidf_vectorizer.get_feature_names()

# Pairwise cosine distance between documents: 1 - cosine similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [111]:
num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 7.89 s, sys: 84 ms, total: 7.98 s
Wall time: 4.05 s


In [112]:
# Save the fitted model to disk, then read it straight back and refresh the labels.
# NOTE: joblib.load unpickles the file, so only load model files you created yourself.
model_file = 'doc_cluster.pkl'
joblib.dump(km, model_file)
km = joblib.load(model_file)
clusters = km.labels_.tolist()

In [113]:
# Per-novel record: title, full text and assigned cluster id
books = {
    'title': textName,
    'content': textContent,
    'cluster': clusters,
}

# Table of novels indexed by cluster id, with the columns in display order
frame = pd.DataFrame(
    books,
    index=[clusters],
    columns=['title', 'cluster', 'content'],
)


In [114]:
# Number of novels assigned to each cluster
frame['cluster'].value_counts()

4    8
3    8
2    3
1    3
0    2
Name: cluster, dtype: int64

In [119]:
print("Top terms per cluster:")
print()

# For every centroid, the feature indices sorted from highest to lowest weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# One comma-separated string of top terms per cluster, in cluster order
clusterTerm = []

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    term = ''
    for ind in order_centroids[i, :6]:
        # A feature can be an n-gram of stems. Look its stems up in vocab_frame and show
        # the original word stored in the first matching row.
        # .loc is the label-based replacement for the deprecated/removed .ix indexer.
        word = vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
        print(" " + word + ",")
        term = term + word + ", "
    print("\n\n")
    print("Cluster %d titles:" % i, end='')

    # Select by the 'cluster' column instead of by index label: the result is always a
    # list (possibly with one or zero titles), so no catch-all except is needed to cope
    # with a lookup that returns a scalar or a missing label.
    for title in frame.loc[frame['cluster'] == i, 'title'].tolist():
        print(' %s,' % title, end='')

    clusterTerm.append(term)
    print("\n\n")

Top terms per cluster:

Cluster 0 words: catherine,
 s,
 elizabeth,
 jane,
 henry,
 charlotte,



Cluster 0 titles: Northanger Abbey, Pride and Prejudice,


Cluster 1 words: holmes,
 's,
 watson,
 said,
 n't,
 sherlock,



Cluster 1 titles: The Hound of the Baskervilles, The Adventures of Sherlock Holmes, The Sign of the Four,


Cluster 2 words: holmes,
 s,
 t,
 watson,
 mr.,
 lestrade,



Cluster 2 titles: The Valley of Fear, The Return of Sherlock Holmes, A Study In Scarlet,


Cluster 3 words: 's,
 n't,
 anne,
 susan,
 olive,
 professor,



Cluster 3 titles: Sense and Sensibility, Persuasion, Tales of Terror and Mystery, The Lost World, Bleak House, Oliver Twist, Lady Susan, A Christmas Carol,


Cluster 4 words: s,
 t,
 don,
 don,
 fanny,
 emma,



Cluster 4 titles: Emma, The Pickwick Papers, Our Mutual Friend, Mansfield Park, A Tale of Two Cities, Great Expectations, The Life And Adventure Of Nicholas Nickleby, David Copperfield,




In [120]:
# Show the comma-separated top terms collected for each cluster
clusterTerm

['catherine, s, elizabeth, jane, henry, charlotte, ',
 "holmes, 's, watson, said, n't, sherlock, ",
 'holmes, s, t, watson, mr., lestrade, ',
 "'s, n't, anne, susan, olive, professor, ",
 's, t, don, don, fanny, emma, ']

In [121]:
# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# x / y coordinate of every document in the 2-D embedding
xs, ys = pos[:, 0], pos[:, 1]

In [122]:
def strip_proppers_POS(text):
    """Return the whitespace-separated words of ``text`` that are not proper nouns.

    The words are tagged with NLTK's part-of-speech tagger; any word tagged
    'NNP' or 'NNPS' is left out of the returned list.
    """
    kept = []
    for token, tag in pos_tag(text.split()):
        # Skip proper nouns
        if tag in ('NNP', 'NNPS'):
            continue
        kept.append(token)
    return kept

In [123]:
# Set up one colour per cluster using a dict.
# There must be an entry for every cluster id in range(num_clusters); the fifth colour
# was missing, so any lookup for cluster 4 would raise KeyError.
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

# Set up cluster names using a dict: label each cluster with its own top terms.
# clusterTerm holds one ", "-terminated string per cluster, so strip the trailing
# separator. Deriving the labels from the data keeps them correct if the clustering
# changes; the earlier hand-written labels did not describe these novels, and the
# scratch loop that filled an undefined `dicts` from a four-item list stopped the
# cell with an error before any names were assigned.
keys = range(len(clusterTerm))
cluster_names = {i: clusterTerm[i].rstrip(', ') for i in keys}