In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [2]:
def process_data(filename, themes_list, filepath):
    """Load labelled reviews from an Excel workbook and encode their themes as class indices.

    The workbook ``<filepath>/<filename>.xlsx`` supplies two sheets:
    ``TrainingData`` (one row per review) and ``Classes`` (column ``classes``
    holding the ordered list of class labels).

    Parameters
    ----------
    filename : str
        Workbook name without the ``.xlsx`` extension.
    themes_list : list of str
        Names of the ``TrainingData`` columns that hold a theme label.
    filepath : str
        Directory containing the workbook.

    Returns
    -------
    cleaned_x : list
        UTF-8 encoded ``review`` text of every row that has at least one
        theme found in the class list.
    cleaned_themes : list of list of int
        For each kept row, the positions of its themes in ``df_classes``.
    df_classes : list
        Class labels in the order given by the ``Classes`` sheet.
    """
    workbook = os.path.join(filepath, filename + ".xlsx")
    df = pd.read_excel(workbook, 'TrainingData')
    df_classes = pd.read_excel(workbook, 'Classes')['classes'].tolist()

    cleaned_themes = []
    cleaned_x = []
    for _, row in df.iterrows():
        themes = []
        for th in themes_list:
            # Looked up outside the try so a missing column is reported as such
            label = row[th]
            try:
                themes.append(df_classes.index(label))
            except ValueError:
                # Label is not on the Classes sheet: show it and skip it
                print(label)
        # Rows without any known theme are dropped
        if themes:
            cleaned_x.append(row['review'].encode('utf-8'))
            cleaned_themes.append(themes)
    return cleaned_x, cleaned_themes, df_classes

In [None]:
# Read the training workbook: review texts, their 'hl_themes' class indices, and the class list.
X_train, y_train, list_classes = process_data(
    filename="Foresee_hl_clorox_train_data_white_paper",
    themes_list=["hl_themes"],
    filepath="../../Downloads",
)

In [None]:
# English stop-word list taken from the NLTK stopwords corpus.
stopwords = nltk.corpus.stopwords.words("english")


In [None]:
from nltk.stem.snowball import SnowballStemmer

# Module-level English Snowball stemmer; tokenize_and_stem reads this global.
stemmer = SnowballStemmer(language="english")

In [None]:
import pdb
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every token that contains a letter.

    Parameters
    ----------
    text : str or bytes
        Document to process. Bytes are decoded as UTF-8 first.

    Returns
    -------
    list
        Stems, produced by the module-level ``stemmer``, of the tokens that
        contain at least one ASCII letter, in document order.
    """
    # process_data returns UTF-8 encoded bytes; decode so the tokenizer,
    # the regex and the stemmer all operate on text rather than raw bytes
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    return [stemmer.stem(t) for t in filtered_tokens]

def tokenize_only(text):
    """Tokenize ``text`` and return the lower-cased tokens that contain a letter.

    Parameters
    ----------
    text : str or bytes
        Document to process. Bytes are decoded as UTF-8 first.

    Returns
    -------
    list
        Lower-cased tokens containing at least one ASCII letter, in
        document order.
    """
    # process_data returns UTF-8 encoded bytes; decode so the tokenizer
    # and the regex operate on text rather than raw bytes
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    return [token for token in tokens if re.search('[a-zA-Z]', token)]

In [None]:
# Flat, document-ordered lists of every stem and every plain token in X_train.
totalvocab_stemmed = []
totalvocab_tokenized = []
for review in X_train:
    # stems of this review's tokens
    totalvocab_stemmed += tokenize_and_stem(review)
    # the same tokens, lower-cased but unstemmed
    totalvocab_tokenized += tokenize_only(review)

In [None]:
# One row per token occurrence: the index holds the stem, 'words' the lower-cased token.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
# print(...) with a single argument gives the same output under Python 2 and 3
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

In [None]:
# Preview the first rows of the stem -> word lookup table.
print(vocab_frame.head())
# Four blank lines of spacing, written so it prints the same on Python 2 and 3
print('\n' * 3)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=1, max_features=200000,
                                 min_df=0, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
print(tokenize_and_stem(synopses[0]))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

In [None]:
# Vocabulary of the fitted vectorizer, indexable by tf-idf column number.
# Newer scikit-learn replaces get_feature_names() with get_feature_names_out().
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between all tf-idf rows (1 minus similarity).
dist = 1 - cosine_similarity(X=tfidf_matrix)
# Show the tf-idf row of the first document.
print(tfidf_matrix[0])

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
# Prefer the standalone joblib package; sklearn.externals.joblib only exists
# in older scikit-learn releases.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the fitted model and read it straight back.
# To reuse an existing pickle without refitting, comment out the dump line.
# NOTE: joblib.load unpickles the file; only load pickles you created yourself.
joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [None]:
# titles = range(len(synopses))
titles = [str(x) for x in range(len(synopses))]

In [None]:
# Per-review record: positional title, review text and assigned cluster id.
films = { 'title': titles, 'synopsis': X_train, 'cluster': clusters }

# Indexed by cluster id; only 'title' and 'cluster' are kept as columns.
frame = pd.DataFrame(films, index = [clusters] , columns = ['title', 'cluster'])

In [None]:
# Number of reviews assigned to each cluster.
frame["cluster"].value_counts()

In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# for every centroid, order the term indices from highest to lowest weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        # A term can be an n-gram of stems. Look the stems up in vocab_frame
        # and show the first matching original word. .loc is used because
        # .ix no longer exists in current pandas.
        word = vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
        if not isinstance(word, (str, bytes)):
            # Python 2 unicode: encode for printing, as the original did
            word = word.encode('utf-8', 'ignore')
        print(' %s' % word, end=',')
    print() #add whitespace
    print() #add whitespace
    
