In [3]:
import numpy as np
import gensim
import smart_open
import os
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
from sklearn import metrics
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import PCA
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
from palmettopy.palmetto import Palmetto
palmetto = Palmetto()

In [5]:
docs = []
labels = []

with open('./cluster/SearchSnippets.txt','r') as d_f:
    for line in d_f:
        if line != '\n':
            docs.append(line)
    
with open('./cluster/SearchSnippets_label.txt', 'r') as l_f:
    for line in l_f:
        if line != '\n':
            labels.append(int(line))
    

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [7]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=2, no_above=0.5)

In [42]:
with open('doc_info.txt', 'w', encoding='utf-8') as f:
    for doc in docs:
        f.write(' '.join(doc) + '\n')

In [31]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [32]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 4294
Number of documents: 12295


In [36]:
# Set training parameters.
num_topics = 8
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [37]:
top_topics = model.top_topics(corpus, topn=10) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -6.1084.
[([(0.027648749, 'science'),
   (0.025220705, 'edu'),
   (0.02249564, 'university'),
   (0.020326843, 'research'),
   (0.016762562, 'school'),
   (0.015941538, 'computer'),
   (0.015501143, 'journal'),
   (0.0139552485, 'culture'),
   (0.013372158, 'art'),
   (0.0110983625, 'program')],
  -3.125355490676686),
 ([(0.06994312, 'sport'),
   (0.062722966, 'game'),
   (0.040915854, 'wikipedia'),
   (0.028579878, 'com'),
   (0.023078991, 'tennis'),
   (0.021796566, 'tournament'),
   (0.021678928, 'encyclopedia'),
   (0.020678993, 'wiki'),
   (0.020116704, 'wikipedia_wiki'),
   (0.019209446, 'wikipedia_encyclopedia')],
  -4.278761410348151),
 ([(0.021935984, 'theory'),
   (0.019099936, 'information'),
   (0.018759515, 'page'),
   (0.018291483, 'ticket'),
   (0.017691605, 'gov'),
   (0.01688735, 'home'),
   (0.014780757, 'home_page'),
   (0.013993921, 'edu'),
   (0.013960634, 'library'),
   (0.01340576, 'physic')],
  -5.389017553090616),
 ([(0.033694528, 'heal

In [38]:
for i in range(len(top_topics)):
    a = [x[1] for x in top_topics[i][0]]
    for x in a:
        print(x, end=' ')
    print()

science edu university research school computer journal culture art program 
sport game wikipedia com tennis tournament encyclopedia wiki wikipedia_wiki wikipedia_encyclopedia 
theory information page ticket gov home home_page edu library physic 
health play music information job article com kid news healthy 
research business software theoretical service data internet network military product 
match movie yahoo amazon com book directory amazon_com engine film 
player news com union market online google poker home car 
football news soccer political team rugby party world democracy league 


In [8]:
train_corpus = [gensim.models.doc2vec.TaggedDocument(tokens, [i]) for i, tokens in enumerate(docs)]

In [23]:
model2 = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=2, epochs=40)

In [24]:
model2.build_vocab(train_corpus)

In [25]:
model2.train(train_corpus, total_examples=model2.corpus_count, epochs=model2.epochs)

In [26]:
docvecs = [model2.docvecs[i] for i in range(len(docs))]

In [27]:
kmeans = KMeans(n_clusters=8)

In [28]:
kmeans.fit(docvecs)
predicted = kmeans.predict(docvecs)

In [29]:
cluster_dict = {}
for i,label in enumerate(predicted):
    if label in cluster_dict:
        cluster_dict[label].append(i)
    else:
        cluster_dict[label] = [i]

In [21]:
def top_words(cluster_dict, dictionary，word_num=10):
    all_dict = []
    topics = []
    for x in range(len(cluster_dict)):
        group0 = [dictionary.doc2idx(docs[i]) for i in cluster_dict[x]]
        topic_word_count = {}
        for doc in group0:
            for x in doc:
                if x in topic_word_count:
                    topic_word_count[x] += 1
                else:
                    topic_word_count[x] = 1
        all_dict.append(topic_word_count)
        
    for topic_word_count in all_dict:
        totoal_count = sum(topic_word_count.values())
        tf_idf = {}
        for w in topic_word_count.keys():
            dfs = 0
            for dic in all_dict:
                if w in dic:
                    dfs += 1
            try:
                tf = topic_word_count[w] / totoal_count
                idf = np.log(len(cluster_dict) / (dfs))
                tf_idf[w] = tf * idf
            except KeyError:
                continue
        a = sorted(tf_idf.items(), key=lambda x:x[1], reverse=True)[:word_num]
        topic0 = [dictionary[i[0]] for i in a]
        topics.append(topic0)
        print(' '.join(topic0))
    return topics

In [30]:
top_words(cluster_dict, dictionary)

wikipedia news information com health research movie system business political
amazon com game movie book system research computer news amazon_com
news business information research health com movie yahoo school science
research gov information health news cancer national system economic government
computer com software system product intel information web research news
research science edu theory journal computer physic course university information
news com music movie sport game football world video online
wikipedia encyclopedia wikipedia_encyclopedia wiki wikipedia_wiki system political democracy culture article


In [2]:
palmetto.get_coherence("wikipedia news information com health research movie system business political".split(),coherence_type="cv")

0.34044923952367706

In [4]:
import torch

In [5]:
torch.zeros(2,3)

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [7]:
a = np.zeros((2,3))

In [10]:
a[1][1] = 1.

In [11]:
a

array([[0., 0., 0.],
       [0., 1., 0.]])