In [6]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.decomposition import TruncatedSVD,NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans
import gensim
import numpy as np

In [3]:
# Load word2vec vectors in binary format from a path relative to the notebook's
# working directory (the file name suggests the GoogleNews 300-d vectors).
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../../../word2vec/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [7]:
def generate_clearcut_topics():
    """Return 2000 toy documents: 1000 copies of each of two phrases sharing only "we".

    The phrases are for demonstration purposes only - don't take them personally : )
    """
    phrases = np.array(["we love bergers", "we hate sandwiches"])
    copies_per_phrase = [1000, 1000]
    return phrases.repeat(copies_per_phrase)

def generate_unbalanced_topics():
    """Return 1010 toy documents: 10 copies of one phrase followed by 1000 of the other."""
    phrases = np.array(["we love bergers", "we hate sandwiches"])
    copies_per_phrase = [10, 1000]
    return phrases.repeat(copies_per_phrase)

def generate_semantic_context_topics():
    """Return 4000 toy documents: 1000 consecutive copies of each love/hate x food phrase."""
    phrases = np.array([
        "we love bergers",
        "we hate bergers",
        "we love sandwiches",
        "we hate sandwiches",
    ])
    # A scalar repeat count applies to every phrase in turn.
    return phrases.repeat(1000)

def generate_noisy_topics(n=15, seed=None):
    """Return toy documents whose topic word carries a single random typo.

    n: number of documents per phrase (default 15, so 30 documents in total)
    seed: optional seed for a private numpy RandomState, making the typos
          reproducible; when None (the default) the global numpy random state
          is used, so the function behaves as it did before the parameter existed

    Returns an array holding n "we love <berger-typo>" strings followed by
    n "we hate <sandwich-typo>" strings.
    """
    # Both the np.random module and a RandomState instance expose randint().
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _random_typos(word, count):
        # Overwrite one randomly chosen character of the word with "X", count times.
        typo_index = rng.randint(0, len(word), count)
        return [word[:i] + "X" + word[i + 1:] for i in typo_index]

    t1 = ["we love %s" % w for w in _random_typos("bergers", n)]
    t2 = ["we hate %s" % w for w in _random_typos("sandwiches", n)]
    return np.r_[t1, t2]

# Corpus description -> array of toy documents, one entry per generator above.
sample_texts = {
    "clearcut topics": generate_clearcut_topics(),
    "unbalanced topics": generate_unbalanced_topics(),
    "semantic topics": generate_semantic_context_topics(),
    "noisy topics": generate_noisy_topics(),
}

In [8]:
# Show the generated corpora (numpy abbreviates the long arrays in its repr).
sample_texts

{'clearcut topics': array(['we love bergers', 'we love bergers', 'we love bergers', ...,
        'we hate sandwiches', 'we hate sandwiches', 'we hate sandwiches'],
       dtype='<U18'),
 'unbalanced topics': array(['we love bergers', 'we love bergers', 'we love bergers', ...,
        'we hate sandwiches', 'we hate sandwiches', 'we hate sandwiches'],
       dtype='<U18'),
 'semantic topics': array(['we love bergers', 'we love bergers', 'we love bergers', ...,
        'we hate sandwiches', 'we hate sandwiches', 'we hate sandwiches'],
       dtype='<U18'),
 'noisy topics': array(['we love bergXrs', 'we love berXers', 'we love bergerX',
        'we love bergeXs', 'we love bergXrs', 'we love Xergers',
        'we love berXers', 'we love berXers', 'we love bergerX',
        'we love bergerX', 'we love Xergers', 'we love bXrgers',
        'we love bergeXs', 'we love Xergers', 'we love bergeXs',
        'we hate sanXwiches', 'we hate sandwicheX', 'we hate Xandwiches',
        'we hate sandwicX

In [10]:
from collections import Counter
# Summarise each toy corpus: its description, then every distinct document with
# its count (most frequent first), then a blank separator line.
for corpus_name, corpus in sample_texts.items():
    frequencies = Counter(corpus).most_common()
    print(corpus_name, frequencies, "", sep="\n")

clearcut topics
[('we love bergers', 1000), ('we hate sandwiches', 1000)]

unbalanced topics
[('we hate sandwiches', 1000), ('we love bergers', 10)]

semantic topics
[('we love bergers', 1000), ('we hate bergers', 1000), ('we love sandwiches', 1000), ('we hate sandwiches', 1000)]

noisy topics
[('we love berXers', 3), ('we love bergerX', 3), ('we love bergeXs', 3), ('we love Xergers', 3), ('we hate sanXwiches', 3), ('we hate sandwicXes', 3), ('we love bergXrs', 2), ('we hate sandwicheX', 2), ('we hate Xandwiches', 2), ('we hate sandwichXs', 2), ('we love bXrgers', 1), ('we hate sandXiches', 1), ('we hate sXndwiches', 1), ('we hate saXdwiches', 1)]



In [11]:
def find_topic(texts, topic_model, n_topics, vec_model="tf", thr=1e-2, **kwargs):
    """Return the keywords of each topic found in texts - for demonstration on simple data.

    texts: array-like of strings
    topic_model: {"nmf", "svd", "lda", "kmeans"} for LSA_NMF, LSA_SVD, LDA, KMEANS (not actually a topic model)
    n_topics: # of topics in texts
    vec_model: "tf" for term_freq; any other value selects term_freq_inverse_doc_freq
    thr: threshold on the normalised absolute weight for a word to count as a keyword
    **kwargs: forwarded to the constructor of the selected model

    Returns one string with a "Topic i: w1 | w2 | ..." line per topic; words whose
    weight is not positive are prefixed with "^".
    Raises KeyError if topic_model is not one of the supported names.
    """
    ## 1. vectorization
    vectorizer = CountVectorizer() if vec_model == "tf" else TfidfVectorizer()
    text_vec = vectorizer.fit_transform(texts)
    # get_feature_names() was removed from recent scikit-learn releases; prefer the
    # replacement and fall back only when it does not exist.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = np.asarray(vectorizer.get_feature_names_out())
    else:
        words = np.asarray(vectorizer.get_feature_names())

    ## 2. topic finding
    topic_models = {"nmf": NMF, "svd": TruncatedSVD, "lda": LatentDirichletAllocation, "kmeans": KMeans}
    topicfinder = topic_models[topic_model](n_topics, **kwargs).fit(text_vec)
    # Compare strings by value: an identity test against a literal depends on interning.
    if topic_model != "kmeans":
        raw_dists = topicfinder.components_
    else:
        raw_dists = topicfinder.cluster_centers_
    # Scale each row by its largest magnitude so signs survive even when the dominant
    # weight is negative (possible for SVD); all-zero rows are left untouched instead
    # of dividing by zero. A new array is built so the fitted model is not modified.
    scale = np.abs(raw_dists).max(axis=1, keepdims=True)
    scale[scale == 0] = 1.0
    topic_dists = raw_dists / scale

    ## 3. keywords for topics
    ## Unlike the other models, LSA_SVD produces both positive and negative values in the
    ## topic-word distribution, which makes choosing keywords more ambiguous. The sign of
    ## each weight is kept with its word (as a "^" prefix) for demonstration here.
    def _topic_keywords(topic_dist):
        keywords_index = np.abs(topic_dist) >= thr
        keywords_prefix = np.where(topic_dist > 0, "", "^")[keywords_index]
        return " | ".join(
            prefix + word for prefix, word in zip(keywords_prefix, words[keywords_index])
        )

    return "\n".join(
        "Topic %i: %s" % (i, _topic_keywords(dist)) for i, dist in enumerate(topic_dists)
    )

In [12]:
# Ask SVD for 4 topics on the clear-cut corpus, vectorised with raw term counts.
print(
    find_topic(
        sample_texts["clearcut topics"],
        topic_model="svd",
        n_topics=4,
        vec_model="tf",
    )
)

Topic 0: bergers | hate | love | sandwiches | we
Topic 1: ^bergers | hate | ^love | sandwiches
Topic 2: ^bergers | ^hate | ^love | ^sandwiches | we
Topic 3: bergers | hate | ^love | ^sandwiches


In [13]:
# Same 4-topic SVD run on the clear-cut corpus, this time with tf-idf weighting.
print(
    find_topic(
        sample_texts["clearcut topics"],
        topic_model="svd",
        n_topics=4,
        vec_model="tfidf",
    )
)

Topic 0: bergers | hate | love | sandwiches | we
Topic 1: bergers | ^hate | love | ^sandwiches
Topic 2: bergers | hate | love | sandwiches | ^we
Topic 3: bergers | hate | ^love | ^sandwiches
