In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import heapq, numpy as np
import random
#!pip3 install gensim
from gensim import corpora, models
import logging

# Assignment 3

In [2]:
documents = []
dir_path = "awards_2002/"
root_dir = os.fsencode(dir_path)
for directory in os.listdir(root_dir):
    sub_directory = os.fsdecode(directory)
    current_path = dir_path + sub_directory + "/"
    
    for file in os.listdir(dir_path + sub_directory):
        with open(current_path + file, "r", encoding="utf-8", errors="ignore") as f:
            documents.append(f.read())
            
random.shuffle(documents)

## Functions

In [55]:
def feature_names(vectorizer, matrix):
    features = tfidf_vectorizer.get_feature_names()
    for doc_i in range(5):
        print("\nDocument %d, top terms by TF-IDF" % doc_i)
        for term, score in sorted(list(zip(features,matrix.toarray()[doc_i])), key=lambda x:-x[1])[:5]:
            print("%.2f\t%s" % (score, term))

In [56]:
def print_clusters(matrix, clusters, n_keywords=10):
    max_cluster = 10
    for cluster in range(min(clusters), max_cluster):
        cluster_docs = [i for i, c in enumerate(clusters) if c == cluster]
        print("Cluster: %d (%d docs)" % (cluster, len(cluster_docs)))
        
        # Keep scores for top n terms
        new_matrix = np.zeros((len(cluster_docs), matrix.shape[1]))
        for cluster_i, doc_vec in enumerate(matrix[cluster_docs].toarray()):
            for idx, score in heapq.nlargest(n_keywords, enumerate(doc_vec), key=lambda x:x[1]):
                new_matrix[cluster_i][idx] = score

        # Aggregate scores for kept top terms
        keywords = heapq.nlargest(n_keywords, zip(new_matrix.sum(axis=0), features))
        print(', '.join([w for s,w in keywords]))
        print()

## 1a Experiment with KMeans and hierarchial clustering

In [59]:
tfidf_vectorizer = TfidfVectorizer(min_df=3, use_idf=True, sublinear_tf=True, max_df=0.1, max_features=100000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
matrix_sample = tfidf_matrix[:1000]
km = KMeans(n_clusters=30, random_state=42, verbose=0)
km.fit(matrix_sample)
#print_clusters(matrix_sample, km.labels_)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=30, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [60]:
tfidf_vectorizer = TfidfVectorizer(min_df=2, use_idf=True,max_df=0.1, max_features=100000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
matrix_sample = tfidf_matrix[:1000]
z = linkage(matrix_sample.todense(), metric="cosine", method="complete")
clusters = fcluster(z, t=0.99, criterion="distance")
#print_clusters(matrix_sample, clusters)

### 1a results

#### Fcluster

* With a min_df of 1 a lot of numbers started popping up and multiple clusters with the same terms
* Higher min_df doesn't do much more than potentially hide "high value" terms
* Mostly good terms with a decent setup
* Small cluster size (# of docs) - related to the t in fcluster
* Method to euclidian instead of complete didn't give much benefit

#### KMeans

* Large clusters
* More numbers in the clusters (Potentially useless, potentially good ie. genes)
* Seems dependant on the random_state
* Higher than 2 min_df just leads to clusters that are too broad

--

In my experimentation I feel like the end-result that was best was the most recent hierarchial clustering. For one, none of the clusters had numbers which I atleast saw as a larger negative.

That said it has it's pros and cons as well. The clusters are considerably smaller in size compared to the KMeans clusters, where these are about 10 or so docs in size, the KMeans clusters seem to be around 25 or so. This is both good and bad in the sense that a smaller cluster most likely means that it's more specific, but it might also mean that it just made multiple clusters that are very similar.

As such I'll go with the fcluster that I have above. It uses


linkage(metric="cosine", method="complete")

fcluster(t=0.99, criterion="distance")

Changing the method only gave very similar or sparse clusters. The t value just made the clusters even smaller, to the point where a doc was basically its own cluster. The min_df and max_df seemed to be pretty optimal at these values, as changing them too much just made clusters too broad or made them have too many "bad" terms.

### 1b label the clusters

Copypaste the cluster just in case since i shuffle the docs at the start of the notebook.

* Cluster: 1 (7 docs) - **Electrical engineering**

multimedia, compiler, smt, hmd, asic, processors, ieee, multiuser, adaptable, fpga



* Cluster: 2 (8 docs) - **Software verification**

hybrid, verification, embedded, software, qos, certification, stanford, rtl, checking, device



* Cluster: 3 (7 docs) - **Continental drifting / Seafloor geography**

continental, rift, rifting, spreading, seafloor, extension, pilcomayo, gulf, deposits, rio



* Cluster: 4 (9 docs) - **Geography statistics**

mantle, antarctic, seismic, gps, geodetic, stations, fault, puget, permanent, recoverable



* Cluster: 5 (10 docs) - **Seismic activity?**

detachment, uplift, floreana, magmatic, tectonic, cordillera, arc, strike, mafic, plateau



* Cluster: 6 (15 docs) - **Thermodynamics**

equations, ergodic, differential, probability, volterra, singularities, hyperbolic, oscillations, boundary, partial



* Cluster: 7 (4 docs) - **Linear algebra**

spaces, operators, teichmueller, functions, operator, metric, hankel, toeplitz, green, holomorphic



* Cluster: 8 (13 docs) - **Algebraic topology**

manifolds, homotopy, dm, geometric, compact, algebras, surfaces, variables, ring, operators



* Cluster: 9 (4 docs) - **Deforestation & poor countries**

migrants, semantic, tenure, real, compositionality, semantics, migration, syntactic, web, deforestation

### 1c pick out 2 good and 2 bad clusters

Clusters 7 & 8 are both good in my opinion.

7 is a little small in size, however the terms are almost all related and for example the 3 names all correspond to functions related to algebra, and obviously functions are also in the picture.

8 is also grouped in a similar way, where the terms can all be related back to topology, where for example homotpoty and manifolds are both main branches of topology.

As for bad clusters, from these 10 I'd say it would be cluster 9 and cluster 3. (5 by extension)

Cluster 9 is simply too hard to interpret. It has a mix of very different terms that are hard to group together. It could be correlated to the Amazon rainforest and the deforestation there but where do semantics come into the picture there.

Cluster 3 in turn isn't that bad, however I feel like its too similar to that of cluster 5. THey're both related to seismic activity, and it's essentially just one being the seafloor, the other being mountains.

## 1d LDA modelling

In [9]:
tfidf2_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in documents]

dictionary = corpora.Dictionary(tokenized_text)
lda_corpus = [dictionary.doc2bow(text) for text in tokenized_text]
lda_model = models.LdaModel(lda_corpus, id2word=dictionary, num_topics=10)

In [10]:
# Inspect topics
for i, topic in lda_model.show_topics(num_words=50, formatted=False):
    print("Topic", i)
    printed_terms = 0
    for term, score in topic:
        if printed_terms >= 10:
            break
        elif term in "Award Investigator research this these will that the This of OF and to for in or The is be may an a with at are on by as from can".split():
            continue
        printed_terms += 1
        print("%.4f\t%s" % (score,term))
    print()

Topic 0
0.0075	Program
0.0071	NSF
0.0065	Estimated
0.0061	Date
0.0044	2002
0.0043	University
0.0040	2003
0.0039	current
0.0037	File
0.0036	Principal

Topic 1
0.0107	Program
0.0106	2002
0.0105	NSF
0.0098	Estimated
0.0094	Date
0.0087	current
0.0087	Principal
0.0055	Title
0.0054	Expires
0.0053	Prgm

Topic 2
0.0053	NSF
0.0048	current
0.0047	Program
0.0045	Estimated
0.0045	Date
0.0042	Principal
0.0038	2002
0.0031	project
0.0027	have
0.0025	their

Topic 3
0.0075	NSF
0.0074	Program
0.0072	Date
0.0068	Estimated
0.0062	current
0.0057	2002
0.0051	Principal
0.0039	Fld
0.0039	project
0.0038	Ref

Topic 4
0.0051	NSF
0.0050	Estimated
0.0047	Date
0.0045	Program
0.0044	current
0.0036	Principal
0.0036	2002
0.0028	Ref
0.0027	data
0.0026	Instr

Topic 5
0.0055	Program
0.0053	Date
0.0051	NSF
0.0051	Estimated
0.0038	2002
0.0036	Principal
0.0033	current
0.0029	have
0.0029	Abstract
0.0029	which

Topic 6
0.0046	NSF
0.0046	Date
0.0045	Program
0.0044	Estimated
0.0039	current
0.0035	Principal
0.0032	2002
0.0032	ma

In this case a lot of the topics seem to be very similar if not almost identical, however because of how LDA is intended to work this does make some sense. Since this modelling is designed so that a document can fall under multiple topics.

After removing some stopwords and also removing some terms that occured in every listed topic, you can see that there are some differences between the topics.

## 2. Word vectors

### 2a word2vec

In [11]:
seed_words = ["mathematics", "console", "spring", "technology", "communication"]
tfidf2_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in documents]

In [12]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size, window, min_count, iter, sg, negative
vectors = models.Word2Vec(tokenized_text)
most_similar(vectors_Default, seed_words)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 18:12:36,748 : INFO : collecting all words and their counts
2020-03-12 18:12:36,752 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 18:12:37,405 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 18:12:37,406 : INFO : Loading a fresh vocabulary
2020-03-12 18:12:37,495 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 18:12:37,496 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 18:12:37,566 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 18:12:37,569 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 18:12:37,570 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 18:12:37,634 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 18:12:37,634 : INFO :

NameError: name 'most_similar' is not defined

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, si)
most_similar(vectors_Default, seed_words)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

### 2b. elmo

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [65]:
!pip install tensorflow==1.15
!pip install "tensorflow_hub>=0.6.0"
!pip3 install tensorflow_text==1.15

import tensorflow as tf
import tensorflow_hub as hub

elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3",signature="default", as_dict=True trainable=True)

Collecting tensorflow==1.15
  Using cached https://files.pythonhosted.org/packages/92/2b/e3af15221da9ff323521565fa3324b0d7c7c5b1d7a8ca66984c8d59cb0ce/tensorflow-1.15.0-cp37-cp37m-manylinux2010_x86_64.whl
Collecting gast==0.2.2 (from tensorflow==1.15)
Collecting tensorboard<1.16.0,>=1.15.0 (from tensorflow==1.15)
  Using cached https://files.pythonhosted.org/packages/1e/e9/d3d747a97f7188f48aa5eda486907f3b345cd409f0a0850468ba867db246/tensorboard-1.15.0-py3-none-any.whl
Collecting tensorflow-estimator==1.15.1 (from tensorflow==1.15)
  Using cached https://files.pythonhosted.org/packages/de/62/2ee9cd74c9fa2fa450877847ba560b260f5d0fb70ee0595203082dafcc9d/tensorflow_estimator-1.15.1-py2.py3-none-any.whl
Installing collected packages: gast, tensorboard, tensorflow-estimator, tensorflow
  Found existing installation: gast 0.3.1
    Uninstalling gast-0.3.1:
[31mERROR: Could not install packages due to an EnvironmentError: [Errno 13] Permission denied: 'INSTALLER'
Consider using the `--user` op

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [61]:
def elmo_vectors(sents):
    embeddings = elmo(sents, )["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(embeddings)

In [66]:
def elmo_vectors(sents):
    embeddings = elmo(sents, signature="default", as_dict=True)["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(embeddings)
        #sess.run(tf.tables_initializer())
        # return average of ELMo features as sentence vector
        #return sess.run(tf.reduce_mean(embeddings,1))

In [67]:
sents = """The game ended quickly .
He hunted some game for dinner .
A game of swans in the river .*
They played a game of chess .
They were in a baseball game .
She decided to eat som game .
Game can be found in forests .
Counterstrike is a popular game .
They didn't follow the game .
It was time to game .""".split('\n')

target = "game"

elmo_vecs = elmo_vectors(sents)
word_vecs = []
for i, sent in enumerate(sents):
    word_vecs.append(elmo_vecs[i][sent.split().index(target)])
    print("Sentence: ", sent)
    print("Vector for '%s:'" % target, word_vecs[-1])
    print()
    
print("Word vec size", word_vecs[0].shape)

TypeError: call() got an unexpected keyword argument 'signature'

In [68]:
from sklearn.metrics.pairwise import cosine_similarity

vec_size = word_vecs[0].shape[0]
print("Similarities between '%s' vector in sentences:" % target)
for i in range(1, len(sents)):
    print("Sent 0-%d:" % i, cosine_similarity(word_vecs[0].reshape((1,vec_size)), 
                                              word_vecs[i].reshape((1,vec_size)))[0][0])

NameError: name 'word_vecs' is not defined