In [84]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import heapq, numpy as np
import random
#!pip3 install gensim
from gensim import corpora, models

  return f(*args, **kwds)


In [45]:
documents = []
dir_path = "awards_2002/"
root_dir = os.fsencode(dir_path)
for directory in os.listdir(root_dir):
    sub_directory = os.fsdecode(directory)
    current_path = dir_path + sub_directory + "/"
    
    for file in os.listdir(dir_path + sub_directory):
        with open(current_path + file, "r", encoding="utf-8", errors="ignore") as f:
            documents.append(f.read())
            
random.shuffle(documents)

## Functions

In [15]:
def feature_names(vectorizer, matrix):
    features = tfidf_vectorizer.get_feature_names()
    for doc_i in range(5):
        print("\nDocument %d, top terms by TF-IDF" % doc_i)
        for term, score in sorted(list(zip(features,matrix.toarray()[doc_i])), key=lambda x:-x[1])[:5]:
            print("%.2f\t%s" % (score, term))

In [34]:
def print_clusters(matrix, clusters, n_keywords=10):
    max_cluster = 10
    for cluster in range(min(clusters), max_cluster):
        cluster_docs = [i for i, c in enumerate(clusters) if c == cluster]
        print("Cluster: %d (%d docs)" % (cluster, len(cluster_docs)))
        
        # Keep scores for top n terms
        new_matrix = np.zeros((len(cluster_docs), matrix.shape[1]))
        for cluster_i, doc_vec in enumerate(matrix[cluster_docs].toarray()):
            for idx, score in heapq.nlargest(n_keywords, enumerate(doc_vec), key=lambda x:x[1]):
                new_matrix[cluster_i][idx] = score

        # Aggregate scores for kept top terms
        keywords = heapq.nlargest(n_keywords, zip(new_matrix.sum(axis=0), features))
        print(', '.join([w for s,w in keywords]))
        print()

## 1a Experiment with KMeans and hierarchial clustering

In [75]:
tfidf_vectorizer = TfidfVectorizer(min_df=3, use_idf=True, sublinear_tf=True, max_df=0.1, max_features=100000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
matrix_sample = tfidf_matrix[:1000]
km = KMeans(n_clusters=30, random_state=42, verbose=0)
km.fit(matrix_sample)
print_clusters(matrix_sample, km.labels_)

Cluster: 0 (44 docs)
jack, babcock, comparative, divisors, frp, clare, bimetallic, granularities, dipolar, graduating

Cluster: 1 (65 docs)
flatau, applies, paving, barbados, alkynyl, mails, hts, plurality, adel, kinds

Cluster: 2 (35 docs)
homozygous, mts, generalizable, dinosaurs, hyperbolicity, hoof, hood, additive, buphy, honing

Cluster: 3 (22 docs)
883, bolometer, lynn, lectures, natl, beatrice, france, biocontrol, nobel, 797

Cluster: 4 (55 docs)
diffusion, accepts, jimmy, darrell, lamps, guard, marguerite, diffusionless, coprocessor, autonoma

Cluster: 5 (36 docs)
689, divisors, hadjicostis, benjamin, cropping, interbasin, divorce, coakley, euler, incubators

Cluster: 6 (17 docs)
chou, icdp, darrell, 42600, darren, middleton, 1041, 200590001, interdisciplinary, 63132

Cluster: 7 (31 docs)
gibbs, daytona, fm, indians, karen_fischer, 6276, logger, constitution, fllwshp, bivalves

Cluster: 8 (22 docs)
generosity, plantp, exploratory, mil, appalachians, 987, gaim, carlo, contradict

In [81]:
tfidf_vectorizer = TfidfVectorizer(min_df=2, use_idf=True,max_df=0.1, max_features=100000, sublinear_tf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
matrix_sample = tfidf_matrix[:1000]
z = linkage(matrix_sample.todense(), metric="cosine", method="complete")
clusters = fcluster(z, t=0.99, criterion="distance")
print_clusters(matrix_sample, clusters)
# seems a lot better

Cluster: 1 (4 docs)
blackout, portfolios, blackouts, cascading, corrective, criticality, finance, budgeting, preventive, disruptions

Cluster: 2 (18 docs)
hybrid, embedded, vlsi, compiler, stanford, scheduling, qca, secure, quantum, computation

Cluster: 3 (6 docs)
mantle, galapagos, antarctic, hotspot, floreana, seismic, broadband, tethyan, plume, indian

Cluster: 4 (4 docs)
paleointensity, deposits, pilcomayo, peopling, forams, foram, alkenone, cores, mesozoic, rio

Cluster: 5 (5 docs)
rivers, hyporheic, braided, hydrologic, anabranching, burges, bedload, flume, jointed, bedrock

Cluster: 6 (12 docs)
rifting, detachment, cordillera, uplift, gps, faulting, basin, continental, slip, paleomagnetic

Cluster: 7 (8 docs)
dm, estimators, random, saddlepoint, mutual, wehrly, tcs, wavelet, descriptive, advisors

Cluster: 8 (14 docs)
algebraic, commutative, homotopy, algebra, geometry, eisenbud, geometric, varieties, cohomology, manifolds

Cluster: 9 (18 docs)
probability, ergodic, harmonic, s

### 1a results

#### Fcluster

* With a min_df of 1 a lot of numbers started popping up and multiple clusters with the same terms
* Higher min_df doesn't do much more than potentially hide "high value" terms
* Mostly good terms with a decent setup
* Small cluster size (# of docs) - related to the t in fcluster
* Method to euclidian instead of complete didn't give much benefit

#### KMeans

* Large clusters
* More numbers in the clusters (Potentially useless, potentially good ie. genes)
* Seems dependant on the random_state
* Higher than 2 min_df just leads to clusters that are too broad

--

In my experimentation I feel like the end-result that was best was the most recent hierarchial clustering. For one, none of the clusters had numbers which I atleast saw as a larger negative.

That said it has it's pros and cons as well. The clusters are considerably smaller in size compared to the KMeans clusters, where these are about 10 or so docs in size, the KMeans clusters seem to be around 25 or so. This is both good and bad in the sense that a smaller cluster most likely means that it's more specific, but it might also mean that it just made multiple clusters that are very similar.

As such I'll go with the fcluster that I have above. It uses


linkage(metric="cosine", method="complete")

fcluster(t=0.99, criterion="distance")

Changing the method only gave very similar or sparse clusters. The t value just made the clusters even smaller, to the point where a doc was basically its own cluster. The min_df and max_df seemed to be pretty optimal at these values, as changing them too much just made clusters too broad or made them have too many "bad" terms.

### 1b label the clusters

Copypaste the cluster just in case since i shuffle the docs at the start of the notebook.

* Cluster: 1 (7 docs) - **Electrical engineering**

multimedia, compiler, smt, hmd, asic, processors, ieee, multiuser, adaptable, fpga



* Cluster: 2 (8 docs) - **Software verification**

hybrid, verification, embedded, software, qos, certification, stanford, rtl, checking, device



* Cluster: 3 (7 docs) - **Continental drifting / Seafloor geography**

continental, rift, rifting, spreading, seafloor, extension, pilcomayo, gulf, deposits, rio



* Cluster: 4 (9 docs) - **Geography statistics**

mantle, antarctic, seismic, gps, geodetic, stations, fault, puget, permanent, recoverable



* Cluster: 5 (10 docs) - **Seismic activity?**

detachment, uplift, floreana, magmatic, tectonic, cordillera, arc, strike, mafic, plateau



* Cluster: 6 (15 docs) - **Thermodynamics**

equations, ergodic, differential, probability, volterra, singularities, hyperbolic, oscillations, boundary, partial



* Cluster: 7 (4 docs) - **Linear algebra**

spaces, operators, teichmueller, functions, operator, metric, hankel, toeplitz, green, holomorphic



* Cluster: 8 (13 docs) - **Algebraic topology**

manifolds, homotopy, dm, geometric, compact, algebras, surfaces, variables, ring, operators



* Cluster: 9 (4 docs) - **Deforestation & poor countries**

migrants, semantic, tenure, real, compositionality, semantics, migration, syntactic, web, deforestation

### 1c pick out 2 good and 2 bad clusters

Clusters 7 & 8 are both good in my opinion.

7 is a little small in size, however the terms are almost all related and for example the 3 names all correspond to functions related to algebra, and obviously functions are also in the picture.

8 is also grouped in a similar way, where the terms can all be related back to topology, where for example homotpoty and manifolds are both main branches of topology.

As for bad clusters, from these 10 I'd say it would be cluster 9 and cluster 3. (5 by extension)

Cluster 9 is simply too hard to interpret. It has a mix of very different terms that are hard to group together. It could be correlated to the Amazon rainforest and the deforestation there but where do semantics come into the picture there.

Cluster 3 in turn isn't that bad, however I feel like its too similar to that of cluster 5. THey're both related to seismic activity, and it's essentially just one being the seafloor, the other being mountains.

## 1d LDA modelling

In [85]:
tfidf2_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in documents]

dictionary = corpora.Dictionary(tokenized_text)
lda_corpus = [dictionary.doc2bow(text) for text in tokenized_text]
lda_model = models.LdaModel(lda_corpus, id2word=dictionary, num_topics=10)

In [89]:
# Inspect topics
for i, topic in lda_model.show_topics(num_words=50, formatted=False):
    print("Topic", i)
    printed_terms = 0
    for term, score in topic:
        if printed_terms >= 10:
            break
        elif term in "this will that the of and to for in or The is be may an a with at are on by as from can".split():
            continue
        printed_terms += 1
        print("%.4f\t%s" % (score,term))
    print()

Topic 0
0.0085	research
0.0063	Award
0.0057	Program
0.0056	US
0.0051	Investigator
0.0048	Estimated
0.0045	NSF
0.0044	Research
0.0043	INT
0.0041	This

Topic 1
0.0092	Award
0.0073	Investigator
0.0063	Program
0.0060	NSF
0.0058	Estimated
0.0057	Date
0.0048	current
0.0046	2002
0.0045	Principal
0.0042	research

Topic 2
0.0067	Award
0.0056	AST
0.0055	Date
0.0049	Program
0.0049	NSF
0.0047	Estimated
0.0046	Investigator
0.0036	This
0.0035	2002
0.0033	current

Topic 3
0.0078	Award
0.0063	Investigator
0.0052	NSF
0.0052	Estimated
0.0051	current
0.0050	Program
0.0049	Date
0.0043	Principal
0.0037	2002
0.0031	This

Topic 4
0.0089	Investigator
0.0082	Award
0.0064	Program
0.0064	current
0.0062	NSF
0.0060	students
0.0060	Principal
0.0056	Date
0.0055	Estimated
0.0052	research

Topic 5
0.0108	Investigator
0.0101	Award
0.0092	research
0.0084	NSF
0.0083	University
0.0076	Principal
0.0073	current
0.0073	Program
0.0071	2002
0.0063	Estimated

Topic 6
0.0076	Investigator
0.0058	current
0.0058	NSF
0.0056	Award
0.

In this case a lot of the topics seem to be very similar if not almost identical, however because of how LDA is intended to work this does make some sense.