In [4]:
import numpy as np
import gensim
import smart_open
import os
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
from sklearn import metrics
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import PCA
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
from palmettopy.palmetto import Palmetto
palmetto = Palmetto()

In [5]:
docs = []
labels = []

with open('./cluster/SearchSnippets.txt','r') as d_f:
    for line in d_f:
        if line != '\n':
            docs.append(line)
    
with open('./cluster/SearchSnippets_label.txt', 'r') as l_f:
    for line in l_f:
        if line != '\n':
            labels.append(int(line))
    

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [8]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)

In [9]:
with open('doc_info.txt', 'w', encoding='utf-8') as f:
    for doc in docs:
        f.write(' '.join(doc) + '\n')

In [10]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [11]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 3912
Number of documents: 12295


In [19]:
# Set training parameters.
num_topics = 8
chunksize = 2000
passes = 20
iterations = 100
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [20]:
top_topics = model.top_topics(corpus, topn=10) #, num_words=20)

In [21]:
topics = []
for i in range(len(top_topics)):
    a = [x[1] for x in top_topics[i][0]]
    topics.append(a)
    for x in a:
        print(x, end=' ')
    print()

research science edu journal theory page information paper university theoretical 
art culture system engine history home information fitness music page 
sport news football com match hockey team club rugby volleyball 
health business information gov news union social job service disease 
wikipedia tennis encyclopedia wiki wikipedia_wiki wikipedia_encyclopedia political basketball yahoo wimbledon 
game com tournament online school amazon book university espn ticket 
soccer player computer world system score cup software internet republic 
movie film imdb award equipment space resource gym electrical forum 


In [5]:
train_corpus = [gensim.models.doc2vec.TaggedDocument(tokens, [i]) for i, tokens in enumerate(docs)]

In [10]:
model2 = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=40)

In [7]:
model2.build_vocab(train_corpus)

In [8]:
model2.train(train_corpus, total_examples=model2.corpus_count, epochs=model2.epochs)

In [9]:
docvecs = [model2.docvecs[i] for i in range(len(docs))]

In [42]:
kmeans = KMeans(n_clusters=8)

In [43]:
kmeans.fit(docvecs)
predicted = kmeans.predict(docvecs)

In [44]:
cluster_dict = {}
for i,label in enumerate(predicted):
    if label in cluster_dict:
        cluster_dict[label].append(i)
    else:
        cluster_dict[label] = [i]

In [18]:
def top_words(cluster_dict, dictionary, word_num=10):
    all_dict = []
    topics = []
    for x in range(len(cluster_dict)):
        group0 = [dictionary.doc2idx(docs[i]) for i in cluster_dict[x]]
        topic_word_count = {}
        for doc in group0:
            for x in doc:
                if x in topic_word_count:
                    topic_word_count[x] += 1
                else:
                    topic_word_count[x] = 1
        all_dict.append(topic_word_count)
        
    for topic_word_count in all_dict:
        totoal_count = sum(topic_word_count.values())
        tf_idf = {}
        for w in topic_word_count.keys():
            dfs = 0
            for dic in all_dict:
                if w in dic:
                    dfs += 1
            try:
                tf = topic_word_count[w] / totoal_count
                idf = np.log(len(cluster_dict) / (dfs))
                tf_idf[w] = tf * idf
            except KeyError:
                continue
        a = sorted(tf_idf.items(), key=lambda x:x[1], reverse=True)[:word_num]
        topic0 = [dictionary[i[0]] for i in a]
        topics.append(topic0)
        print(' '.join(topic0))
    return topics

In [45]:
topics = top_words(cluster_dict, dictionary)

commodity tax medicare fda insurance tariff fund_budget agency union venture
medicare cba union minnesota referee senator economic_development public_health dentist agency
cisco mozilla wireless_access client_server zdnet microprocessor ibm sourceforge mspx cache
bbc allposters commodity sportsline lyric chron boxing cbs bull forbes
britannica union britannica_article descartes communism encyclopaedia_britannica westminster socialism meaning fluid
sewing bull sewing_machine chicago_bull speed_test stock_quote tiger commodity client_server ticket
stanford_edu mit lecture ocw optic einstein aristotle wolfram maa reasoning
lyric girl youtube movie_episode episode piano soundtrack olympic tiger favorite


In [80]:
cps = []
cas = []
sums = []
for i,topic in enumerate(a):
    cp = palmetto.get_coherence(topic,coherence_type="cp")
    ca = palmetto.get_coherence(topic, coherence_type="ca")
    cps.append(cp)
    cas.append(ca)
    allsum = cp+ca
    sums.append(allsum)
    topic = " ".join(topic)
    print(r"{} & {} & {:.3f} & {:.3f} & {:.3f}\\\hline".format(i+1, topic, cp, ca, allsum))
sum_cp = sum(cps)
sum_ca = sum(cas)
print(r"& & {:.3f} & {:.3f} & {:.3f}\\\hline".format(sum_cp, sum_ca, sum_cp+sum_ca))

1 & research edu science journal theory school university information computer program & 0.424 & 0.256 & 0.680\\\hline
2 & movie com music art amazon culture book film video news & 0.316 & 0.248 & 0.564\\\hline
3 & computer software web system memory programming internet com intel device & 0.249 & 0.217 & 0.466\\\hline
4 & wikipedia political encyclopedia system party wiki wikipedia_wiki democracy wikipedia_encyclopedia government & 0.095 & 0.181 & 0.276\\\hline
5 & sport news game football com soccer world match league ticket & 0.360 & 0.254 & 0.614\\\hline
6 & business market news service stock trade job information home finance & 0.294 & 0.174 & 0.468\\\hline
7 & health information gov cancer news research disease medical drug national & 0.426 & 0.227 & 0.653\\\hline
8 & car engine calorie wheel motor electrical income tax model automatic & 0.103 & 0.232 & 0.335\\\hline
& & 2.266 & 1.790 & 4.056\\\hline


In [24]:
research edu science journal theory school university information computer program
movie com music art amazon culture book film video news
computer software web system memory programming internet com intel device
wikipedia political encyclopedia system party wiki wikipedia_wiki democracy wikipedia_encyclopedia government
sport news game football com soccer world match league ticket
business market news service stock trade job information home finance
health information gov cancer news research disease medical drug national
car engine calorie wheel motor electrical income tax model automatic

SyntaxError: EOL while scanning string literal (<ipython-input-24-fdfdea0d1edf>, line 1)

In [70]:
a = []

In [78]:
a.append("car engine calorie wheel motor electrical income tax model automatic".split())

In [81]:
import torch

In [89]:
torch.zeros(3,1)

tensor([[0.],
        [0.],
        [0.]])

In [86]:
torch.zero_()

TypeError: zero_() missing 1 required positional arguments: "input"

In [84]:
a

tensor([0.0907, 0.3435])