# topic model


Document Clustering with Python
ref: http://brandonrose.org/clustering

In [10]:
import numpy as np
import pandas as pd
# import nltk
import re
import os
import codecs
from sklearn import feature_extraction
# import mpld3

import nltk
# Punkt tokenizer data; the tokenizer helpers in this notebook call
# nltk.sent_tokenize / nltk.word_tokenize.
nltk.download('punkt')

# Pipe-delimited report; dtype='unicode' asks pandas to read every column as text.
df = pd.read_csv(
    'data/treated_data/TerrestrialReport_lc_ld_mammals.txt',
    sep="|",
    dtype='unicode',
)

[nltk_data] Downloading package punkt to /Users/zireael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
from nltk.stem.snowball import SnowballStemmer

# Shared English Snowball stemmer used by tokenize_and_stem.
stemmer = SnowballStemmer(language="english")

In [12]:
# Column labels of the loaded report, as a plain list.
df.columns.tolist()

['CAS Number ',
 ' Chemical Name',
 'Chemical Grade',
 ' Chemical Purity Mean Op ',
 ' Chemical Purity Mean(%) ',
 ' Chemical Purity Min Op ',
 ' Chemical Purity Min(%) ',
 ' Chemical Purity Max Op ',
 ' Chemical Purity Max(%)',
 'Species Group',
 'Species Scientific Name ',
 ' Species Common Name',
 'Organism Lifestage',
 ' Organism Age Mean Op ',
 ' Organism Age Mean ',
 ' Organism Age Min Op ',
 ' Organism Age Min ',
 ' Organism Age Max Op ',
 ' Organism Age Max ',
 ' Organism Age Units',
 'Media Type',
 'Test Location',
 'Exposure Type',
 'Dose Number',
 'Endpoint',
 'Effect',
 'Effect Measurement',
 'Response Site',
 'Chemical Analysis Method',
 'Conc 1 Type (Author) ',
 ' Conc 1 Mean Op (Author) ',
 ' Conc 1 Mean (Author) ',
 ' Conc 1 Min Op (Author) ',
 ' Conc Min 1 (Author) ',
 ' Conc 1 Max Op (Author) ',
 ' Conc 1 Max (Author) ',
 ' Conc 1 Units (Author) ',
 ' Conc 2 Type (Author) ',
 ' Conc 2 Mean Op (Author) ',
 ' Conc 2 Mean (Author) ',
 ' Conc 2 Min Op (Author) ',
 ' Conc 

In [21]:
# Distinct values found in the 'Endpoint' column.
endpoints = df['Endpoint'].unique()
endpoints

array(['LD05', 'LD50', 'LC50', 'LD0.1', 'LD01', 'LD16', 'LD10', 'LD20',
       'LD80', 'LD90', 'LD0', 'LD100', 'LC100', 'LC80', 'LD50/', 'LD75/',
       'LD90/', 'LD95/', 'LD99/', 'LD95', 'LD99', 'LD84'], dtype=object)

In [14]:
# Define a tokenizer/stemmer that returns the list of stems in the text it is passed

def tokenize_and_stem(text):
    """Return the stems of every letter-bearing token in ``text``.

    The text is split into sentences and then into words, so punctuation
    comes out as its own token. Tokens with no ASCII letter (numbers, bare
    punctuation) are dropped; the rest are stemmed with the module-level
    ``stemmer``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # keep only tokens that contain at least one letter
            if re.search('[a-zA-Z]', word) is not None:
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Return the lower-cased, letter-bearing tokens of ``text`` (no stemming).

    The text is split into sentences and then into words, so punctuation
    comes out as its own token. Each token is lower-cased, and tokens with
    no ASCII letter (numbers, bare punctuation) are dropped.
    """
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered_words if re.search('[a-zA-Z]', word)]

In [15]:
# Build two flat vocabularies from the distinct Endpoint values:
# one of stems and one of plain lower-cased tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []
for endpoint in df.Endpoint.unique():
    # in-place += keeps each vocabulary a single flat list
    totalvocab_stemmed += tokenize_and_stem(endpoint)
    totalvocab_tokenized += tokenize_only(endpoint)

In [18]:
# vocab_frame is a second name for df; no copy is made.
vocab_frame = df
print('there are {} items in vocab_frame'.format(len(vocab_frame)))

there are 1787 items in vocab_frame


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.045, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(endpoints) #fit the vectorizer to synopses

print(df.shape)

CPU times: user 87.2 ms, sys: 2.31 ms, total: 89.5 ms
Wall time: 92 ms
(1787, 66)


In [34]:
# Vocabulary learned by the fitted vectorizer, as a plain list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an ndarray, hence .tolist().
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    # older scikit-learn: only the legacy accessor exists
    terms = tfidf_vectorizer.get_feature_names()
terms

['lc100',
 'lc50',
 'lc80',
 'ld0',
 'ld0.1',
 'ld01',
 'ld05',
 'ld10',
 'ld100',
 'ld16',
 'ld20',
 'ld50',
 'ld50/',
 'ld75/',
 'ld80',
 'ld84',
 'ld90',
 'ld90/',
 'ld95',
 'ld95/',
 'ld99',
 'ld99/']

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between the TF-IDF rows: 1 - cosine similarity.
# NOTE(review): in the saved output every off-diagonal entry shown is 1, i.e. no
# two endpoint values share a term - confirm this is the intended clustering input.
pairwise_similarity = cosine_similarity(tfidf_matrix)
dist = 1 - pairwise_similarity
dist

array([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,

In [56]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 81.3 ms, sys: 13.2 ms, total: 94.4 ms
Wall time: 122 ms


In [59]:
# sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone
# joblib package and fall back to the vendored copy only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# The DataFrame pickle and the KMeans dump used to share one path, so the model
# dump silently overwrote the DataFrame pickle. The model keeps the original
# path (that is what was left on disk before); the DataFrame gets its own file.
model_path = "data/treated_data/TerrestrialReport_lc_ld_mammals.pkl"
frame_path = "data/treated_data/TerrestrialReport_lc_ld_mammals_df.pkl"

# DataFrame.to_pickle returns None, so doc_cluster is always None; the name is
# kept only in case later cells still reference it.
doc_cluster = df.to_pickle(frame_path)

# Persist the fitted model, then reload it so the labels come from the saved artifact.
joblib.dump(km, model_path)

# joblib.load unpickles the file: only load files this notebook wrote itself.
km = joblib.load(model_path)
clusters = km.labels_.tolist()

[2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [62]:
# frame = pd.DataFrame(df, index = [clusters])
# frame['cluster'].value_counts()

Unnamed: 0,CAS Number,Chemical Name,Chemical Grade,Chemical Purity Mean Op,Chemical Purity Mean(%),Chemical Purity Min Op,Chemical Purity Min(%),Chemical Purity Max Op,Chemical Purity Max(%),Species Group,...,Observed Duration Min (Days),Observed Duration Max Op (Days),Observed Duration Max (Days),Observed Duration Units (Days),Author,Reference Number,Title,Source,Publication Year,Unnamed: 65
0,50000,Formalin,"Technical grade, technical product, technical ...",,96.6,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Tsuda,S., T. Miyaoka, M. Iwasaki, and Y. Shirasu",104102,Pharmacokinetic Analysis of Increased Toxicity...,Fundam. Appl. Toxicol.4(5): 724-730,1984,
1,50000,Formalin,"Technical grade, technical product, technical ...",,96.6,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Tsuda,S., T. Miyaoka, M. Iwasaki, and Y. Shirasu",104102,Pharmacokinetic Analysis of Increased Toxicity...,Fundam. Appl. Toxicol.4(5): 724-730,1984,
2,50000,Formalin,"Technical grade, technical product, technical ...",,96.6,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Tsuda,S., T. Miyaoka, M. Iwasaki, and Y. Shirasu",104102,Pharmacokinetic Analysis of Increased Toxicity...,Fundam. Appl. Toxicol.4(5): 724-730,1984,
3,50000,Formalin,"Technical grade, technical product, technical ...",,96.6,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Tsuda,S., T. Miyaoka, M. Iwasaki, and Y. Shirasu",104102,Pharmacokinetic Analysis of Increased Toxicity...,Fundam. Appl. Toxicol.4(5): 724-730,1984,
4,50000,Formalin,"Technical grade, technical product, technical ...",,96.6,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Tsuda,S., T. Miyaoka, M. Iwasaki, and Y. Shirasu",104102,Pharmacokinetic Analysis of Increased Toxicity...,Fundam. Appl. Toxicol.4(5): 724-730,1984,
5,50000,Formalin,"Technical grade, technical product, technical ...",,96.6,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Tsuda,S., T. Miyaoka, M. Iwasaki, and Y. Shirasu",104102,Pharmacokinetic Analysis of Increased Toxicity...,Fundam. Appl. Toxicol.4(5): 724-730,1984,
6,50293,"1,1'-(2,2,2-Trichloroethylidene)bis[4-chlorobe...",Not reported,,NR,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Ahdaya,S.M., P.V. Shah, and F.E. Guthrie",35005,Thermoregulation in Mice Treated with Parathio...,Toxicol. Appl. Pharmacol.35:575-580,1976,
7,50293,"1,1'-(2,2,2-Trichloroethylidene)bis[4-chlorobe...",Not reported,,NR,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Ahdaya,S.M., P.V. Shah, and F.E. Guthrie",35005,Thermoregulation in Mice Treated with Parathio...,Toxicol. Appl. Pharmacol.35:575-580,1976,
8,50293,"1,1'-(2,2,2-Trichloroethylidene)bis[4-chlorobe...",Not reported,,NR,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Ahdaya,S.M., P.V. Shah, and F.E. Guthrie",35005,Thermoregulation in Mice Treated with Parathio...,Toxicol. Appl. Pharmacol.35:575-580,1976,
9,50351,"2-(2,6-Dioxo-3-piperidinyl)-1H-isoindole-1,3(2...",Not reported,,NR,,NR,,NR,Mammals,...,NR,,NR,Day(s),"Fickentscher,K., A. Kirfel, G. Will, and F. Ko...",91811,Stereochemical Properties and Teratogenic Acti...,Mol. Pharmacol.13(1): 133-141,1977,
