In [32]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import heapq, numpy as np
import random
#!pip3 install gensim
from gensim import corpora, models
import logging
import zipfile

# Assignment 3

In [4]:
documents = []
dir_path = "awards_2002/"
root_dir = os.fsencode(dir_path)
for directory in os.listdir(root_dir):
    sub_directory = os.fsdecode(directory)
    current_path = dir_path + sub_directory + "/"
    
    for file in os.listdir(dir_path + sub_directory):
        with open(current_path + file, "r", encoding="utf-8", errors="ignore") as f:
            documents.append(f.read())
            
random.shuffle(documents)

## Functions

In [5]:
def feature_names(vectorizer, matrix):
    features = tfidf_vectorizer.get_feature_names()
    for doc_i in range(5):
        print("\nDocument %d, top terms by TF-IDF" % doc_i)
        for term, score in sorted(list(zip(features,matrix.toarray()[doc_i])), key=lambda x:-x[1])[:5]:
            print("%.2f\t%s" % (score, term))

In [6]:
def print_clusters(matrix, clusters, n_keywords=10):
    max_cluster = 10
    for cluster in range(min(clusters), max_cluster):
        cluster_docs = [i for i, c in enumerate(clusters) if c == cluster]
        print("Cluster: %d (%d docs)" % (cluster, len(cluster_docs)))
        
        # Keep scores for top n terms
        new_matrix = np.zeros((len(cluster_docs), matrix.shape[1]))
        for cluster_i, doc_vec in enumerate(matrix[cluster_docs].toarray()):
            for idx, score in heapq.nlargest(n_keywords, enumerate(doc_vec), key=lambda x:x[1]):
                new_matrix[cluster_i][idx] = score

        # Aggregate scores for kept top terms
        keywords = heapq.nlargest(n_keywords, zip(new_matrix.sum(axis=0), features))
        print(', '.join([w for s,w in keywords]))
        print()

## 1a Experiment with KMeans and hierarchial clustering

In [7]:
tfidf_vectorizer = TfidfVectorizer(min_df=3, use_idf=True, sublinear_tf=True, max_df=0.1, max_features=100000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
matrix_sample = tfidf_matrix[:1000]
km = KMeans(n_clusters=30, random_state=42, verbose=0)
km.fit(matrix_sample)
print_clusters(matrix_sample, km.labels_)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=30, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [8]:
tfidf_vectorizer = TfidfVectorizer(min_df=2, use_idf=True,max_df=0.1, max_features=100000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
matrix_sample = tfidf_matrix[:1000]
z = linkage(matrix_sample.todense(), metric="cosine", method="complete")
clusters = fcluster(z, t=0.99, criterion="distance")
print_clusters(matrix_sample, clusters)

### 1a results

#### Fcluster

* With a min_df of 1 a lot of numbers started popping up and multiple clusters with the same terms
* Higher min_df doesn't do much more than potentially hide "high value" terms
* Mostly good terms with a decent setup
* Small cluster size (# of docs) - related to the t in fcluster
* Method to euclidian instead of complete didn't give much benefit

#### KMeans

* Large clusters
* More numbers in the clusters (Potentially useless, potentially good ie. genes)
* Seems dependant on the random_state
* Higher than 2 min_df just leads to clusters that are too broad

--

In my experimentation I feel like the end-result that was best was the most recent hierarchial clustering. For one, none of the clusters had numbers which I atleast saw as a larger negative.

That said it has it's pros and cons as well. The clusters are considerably smaller in size compared to the KMeans clusters, where these are about 10 or so docs in size, the KMeans clusters seem to be around 25 or so. This is both good and bad in the sense that a smaller cluster most likely means that it's more specific, but it might also mean that it just made multiple clusters that are very similar.

As such I'll go with the fcluster that I have above. It uses


linkage(metric="cosine", method="complete")

fcluster(t=0.99, criterion="distance")

Changing the method only gave very similar or sparse clusters. The t value just made the clusters even smaller, to the point where a doc was basically its own cluster. The min_df and max_df seemed to be pretty optimal at these values, as changing them too much just made clusters too broad or made them have too many "bad" terms.

### 1b label the clusters

Copypaste the cluster just in case since i shuffle the docs at the start of the notebook.

* Cluster: 1 (7 docs) - **Electrical engineering**

multimedia, compiler, smt, hmd, asic, processors, ieee, multiuser, adaptable, fpga



* Cluster: 2 (8 docs) - **Software verification**

hybrid, verification, embedded, software, qos, certification, stanford, rtl, checking, device



* Cluster: 3 (7 docs) - **Continental drifting / Seafloor geography**

continental, rift, rifting, spreading, seafloor, extension, pilcomayo, gulf, deposits, rio



* Cluster: 4 (9 docs) - **Geography statistics**

mantle, antarctic, seismic, gps, geodetic, stations, fault, puget, permanent, recoverable



* Cluster: 5 (10 docs) - **Seismic activity?**

detachment, uplift, floreana, magmatic, tectonic, cordillera, arc, strike, mafic, plateau



* Cluster: 6 (15 docs) - **Thermodynamics**

equations, ergodic, differential, probability, volterra, singularities, hyperbolic, oscillations, boundary, partial



* Cluster: 7 (4 docs) - **Linear algebra**

spaces, operators, teichmueller, functions, operator, metric, hankel, toeplitz, green, holomorphic



* Cluster: 8 (13 docs) - **Algebraic topology**

manifolds, homotopy, dm, geometric, compact, algebras, surfaces, variables, ring, operators



* Cluster: 9 (4 docs) - **Deforestation & poor countries**

migrants, semantic, tenure, real, compositionality, semantics, migration, syntactic, web, deforestation

### 1c pick out 2 good and 2 bad clusters

Clusters 7 & 8 are both good in my opinion.

7 is a little small in size, however the terms are almost all related and for example the 3 names all correspond to functions related to algebra, and obviously functions are also in the picture.

8 is also grouped in a similar way, where the terms can all be related back to topology, where for example homotpoty and manifolds are both main branches of topology.

As for bad clusters, from these 10 I'd say it would be cluster 9 and cluster 3. (5 by extension)

Cluster 9 is simply too hard to interpret. It has a mix of very different terms that are hard to group together. It could be correlated to the Amazon rainforest and the deforestation there but where do semantics come into the picture there.

Cluster 3 in turn isn't that bad, however I feel like its too similar to that of cluster 5. THey're both related to seismic activity, and it's essentially just one being the seafloor, the other being mountains.

## 1d LDA modelling

In [9]:
tfidf2_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in documents]

dictionary = corpora.Dictionary(tokenized_text)
lda_corpus = [dictionary.doc2bow(text) for text in tokenized_text]
lda_model = models.LdaModel(lda_corpus, id2word=dictionary, num_topics=10)

In [10]:
# Inspect topics
for i, topic in lda_model.show_topics(num_words=50, formatted=False):
    print("Topic", i)
    printed_terms = 0
    for term, score in topic:
        if printed_terms >= 10:
            break
        elif term in "Award Investigator research this these will that the This of OF and to for in or The is be may an a with at are on by as from can".split():
            continue
        printed_terms += 1
        print("%.4f\t%s" % (score,term))
    print()

Topic 0
0.0031	GLOBEC
0.0028	plume
0.0024	Estimated
0.0022	NSF
0.0021	biological
0.0021	Date
0.0020	Principal
0.0020	OCE
0.0020	Program
0.0019	current

Topic 1
0.0129	Program
0.0125	students
0.0114	NSF
0.0108	Estimated
0.0103	Date
0.0078	program
0.0075	Principal
0.0075	current
0.0068	2002
0.0066	EDUCATION

Topic 2
0.0073	current
0.0070	Principal
0.0068	NSF
0.0063	Program
0.0059	Date
0.0058	Estimated
0.0055	2002
0.0041	Co
0.0040	project
0.0031	Type

Topic 3
0.0064	Program
0.0062	Estimated
0.0057	NSF
0.0056	Date
0.0055	current
0.0053	materials
0.0052	2002
0.0050	Principal
0.0038	Research
0.0033	Expires

Topic 4
0.0021	Date
0.0018	NSF
0.0017	Estimated
0.0015	Program
0.0014	current
0.0013	2002
0.0011	phage
0.0011	Principal
0.0009	Manager
0.0009	Latest

Topic 5
0.0056	Program
0.0055	Estimated
0.0054	Date
0.0052	NSF
0.0046	current
0.0042	2002
0.0039	Principal
0.0033	systems
0.0032	Org
0.0032	data

Topic 6
0.0063	theory
0.0063	DMS
0.0048	MATHEMATICAL
0.0047	Estimated
0.0044	Program
0.0043	Dat

In this case a lot of the topics seem to be very similar if not almost identical, however because of how LDA is intended to work this does make some sense. Since this modelling is designed so that a document can fall under multiple topics.

After removing some stopwords and also removing some terms that occured in every listed topic, you can see that there are some differences between the topics.

## 2. Word vectors

### 2a word2vec

In [11]:
seed_words = ["mathematics", "console", "spring", "technology", "communication"]
tfidf2_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in documents]

### Base

Nothing unusual here, I'm not surprised to see that the base settings are decent. Interestingly technology is only 0.77 similar to technologies.

In [13]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size, window, min_count, iter, sg, negative
vectors = models.Word2Vec(tokenized_text)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 18:49:20,322 : INFO : collecting all words and their counts
2020-03-12 18:49:20,323 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 18:49:20,857 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 18:49:20,858 : INFO : Loading a fresh vocabulary
2020-03-12 18:49:21,140 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 18:49:21,141 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 18:49:21,205 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 18:49:21,208 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 18:49:21,209 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 18:49:21,274 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 18:49:21,274 : INFO :

Most similar to:  mathematics
[('discipline', 0.813413679599762), ('science', 0.8133169412612915), ('engineering', 0.8105632066726685), ('literacy', 0.750910758972168), ('majors', 0.7439529895782471), ('physics', 0.741843581199646), ('practice', 0.7403053045272827), ('sciences', 0.7306692004203796), ('concepts', 0.7280646562576294), ('careers', 0.705974817276001)]
Most similar to:  console
[('fluorometer', 0.8892923593521118), ('shake', 0.8729320168495178), ('compass', 0.8702945709228516), ('calorimeter', 0.8649951815605164), ('chirped', 0.851511538028717), ('interferometer', 0.8476803302764893), ('cabinetry', 0.8462677597999573), ('spotting', 0.8419115543365479), ('inkjet', 0.8396214842796326), ('sizing', 0.8387095928192139)]
Most similar to:  spring
[('late', 0.8034616708755493), ('Miocene', 0.7378062605857849), ('1996', 0.7290747165679932), ('fall', 0.7276666760444641), ('1995', 0.6936283707618713), ('Triassic', 0.6931883096694946), ('Sydney', 0.6879227757453918), ('Sept', 0.6851096

### Size = 5, Size = 200

A low size is a huge detriment too the quality of the model. It underfits the model and it then thinks everything is very similar to the given word.

A larger value than the default 100 doesn't necessarily change much. At least in this case. The order of words and such that are similar does change a little, but it does not seem to give any proper quantitative value. Maybe with a larger corpus 

In [14]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, size=5)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 18:50:06,006 : INFO : collecting all words and their counts
2020-03-12 18:50:06,007 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 18:50:06,535 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 18:50:06,536 : INFO : Loading a fresh vocabulary
2020-03-12 18:50:06,862 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 18:50:06,863 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 18:50:06,933 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 18:50:06,935 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 18:50:06,936 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 18:50:06,996 : INFO : estimated required memory for 27041 words and 5 dimensions: 14602140 bytes
2020-03-12 18:50:06,997 : INFO : r

Most similar to:  mathematics
[('science', 0.9933551549911499), ('industrial', 0.9932775497436523), ('enhancing', 0.9929811954498291), ('focusing', 0.9906290173530579), ('economics', 0.9903695583343506), ('promoting', 0.9901710152626038), ('interested', 0.9900872707366943), ('engineering', 0.9888055324554443), ('disciplines', 0.9886555671691895), ('awareness', 0.9861686825752258)]
Most similar to:  console
[('interfering', 0.9994806051254272), ('MSCs', 0.9993323087692261), ('insecticides', 0.9991728067398071), ('gravitationally', 0.9990473985671997), ('Pseudocalanus', 0.9988341331481934), ('disequilibria', 0.998599648475647), ('somatic', 0.9985452890396118), ('3C', 0.9980460405349731), ('dip', 0.9979593753814697), ('Chusang', 0.9974350929260254)]
Most similar to:  spring
[('meridional', 0.9877832531929016), ('Melville', 0.9772300124168396), ('GPa', 0.9688113331794739), ('cam', 0.9660539627075195), ('Luke', 0.964349627494812), ('606042218', 0.9621831178665161), ('Kimball', 0.96031993627

In [23]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, size=200)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:04:25,184 : INFO : collecting all words and their counts
2020-03-12 19:04:25,185 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:04:25,718 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:04:25,719 : INFO : Loading a fresh vocabulary
2020-03-12 19:04:25,794 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 19:04:25,795 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 19:04:25,861 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:04:25,863 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:04:25,863 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 19:04:25,917 : INFO : estimated required memory for 27041 words and 200 dimensions: 56786100 bytes
2020-03-12 19:04:25,918 : INFO :

Most similar to:  mathematics
[('science', 0.8161193132400513), ('engineering', 0.8135734796524048), ('physics', 0.776203989982605), ('literacy', 0.7544336318969727), ('discipline', 0.7528648376464844), ('sciences', 0.7514590620994568), ('nanotechnology', 0.7381583452224731), ('practice', 0.7265830636024475), ('concepts', 0.7165606021881104), ('careers', 0.7163411378860474)]
Most similar to:  console
[('API', 0.8792423009872437), ('OPL', 0.8758563995361328), ('chirped', 0.8735769391059875), ('inkjet', 0.8725205659866333), ('cabinetry', 0.8700251579284668), ('reader', 0.8603600263595581), ('ankle', 0.8571581840515137), ('Switch', 0.8512565493583679), ('compass', 0.8501532077789307), ('addressable', 0.8495756387710571)]
Most similar to:  spring
[('42', 0.7050052285194397), ('Spain', 0.6770830154418945), ('24', 0.6746290922164917), ('1991', 0.6729241013526917), ('1999', 0.6727629899978638), ('13', 0.6706552505493164), ('preceding', 0.6692010760307312), ('Classic', 0.6662352085113525), ('3

### Min_count=2,10

Going too high on min_count simply causes situations where there are no words that are similar.

For words similar to mathematics it does not make a big difference, since they seem to occur so often in this scope of documents. For words similar to spring and console however, you quite quickly lose a lot of words from the list, and it just fills them with new ones. For whatever reason this does increase the similarity compared to base though.

With a high count that is too high, in this case it just doesn't know some words. In my case console doesn't exist with a mincount that is 7 or higher. At this point console and similarities simply feels random.

In [15]:
logging.basicConfig()
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, min_count=2)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 18:54:36,744 : INFO : collecting all words and their counts
2020-03-12 18:54:36,745 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 18:54:37,264 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 18:54:37,265 : INFO : Loading a fresh vocabulary
2020-03-12 18:54:37,619 : INFO : effective_min_count=2 retains 53696 unique words (47% of original 113911, drops 60215)
2020-03-12 18:54:37,620 : INFO : effective_min_count=2 leaves 3621435 word corpus (98% of original 3681650, drops 60215)
2020-03-12 18:54:37,755 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 18:54:37,757 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 18:54:37,758 : INFO : downsampling leaves estimated 2993497 word corpus (82.7% of prior 3621435)
2020-03-12 18:54:37,884 : INFO : estimated required memory for 53696 words and 100 dimensions: 69804800 bytes
2020-03-12 18:54:37,885 : INFO : 

Most similar to:  mathematics
[('science', 0.8420685529708862), ('engineering', 0.8404231071472168), ('sciences', 0.7676929235458374), ('discipline', 0.763458788394928), ('physics', 0.758654773235321), ('practice', 0.7490870356559753), ('concepts', 0.7400397062301636), ('professionals', 0.7394565939903259), ('education', 0.7319020628929138), ('nanoscience', 0.7282558679580688)]
Most similar to:  console
[('peroxide', 0.9277517199516296), ('Immuno', 0.9020063877105713), ('Gallium', 0.9012449979782104), ('bipolar', 0.8988353610038757), ('masonry', 0.896985650062561), ('attenuated', 0.8966995477676392), ('inhibitor', 0.895756185054779), ('CCVD', 0.8930081129074097), ('RNPR2', 0.8893278241157532), ('somites', 0.8876462578773499)]
Most similar to:  spring
[('winter', 0.7589244842529297), ('late', 0.7575393915176392), ('Miocene', 0.7560657262802124), ('Tertiary', 0.7387002110481262), ('northern', 0.7340983152389526), ('north', 0.7309749126434326), ('Holocene', 0.7283297777175903), ('fall', 0

In [28]:
logging.basicConfig()
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, min_count=6)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:38:14,801 : INFO : collecting all words and their counts
2020-03-12 19:38:14,803 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:38:15,322 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:38:15,323 : INFO : Loading a fresh vocabulary
2020-03-12 19:38:15,392 : INFO : effective_min_count=6 retains 24178 unique words (21% of original 113911, drops 89733)
2020-03-12 19:38:15,393 : INFO : effective_min_count=6 leaves 3538274 word corpus (96% of original 3681650, drops 143376)
2020-03-12 19:38:15,447 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:38:15,449 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:38:15,450 : INFO : downsampling leaves estimated 2900621 word corpus (82.0% of prior 3538274)
2020-03-12 19:38:15,493 : INFO : estimated required memory for 24178 words and 100 dimensions: 31431400 bytes
2020-03-12 19:38:15,494 : INFO :

Most similar to:  mathematics
[('science', 0.823967695236206), ('discipline', 0.8136472702026367), ('engineering', 0.7935560941696167), ('physics', 0.7585951089859009), ('sciences', 0.7480071187019348), ('majors', 0.740909993648529), ('literacy', 0.7239643335342407), ('humanities', 0.7178128957748413), ('practice', 0.7135639786720276), ('elementary', 0.7031224966049194)]
Most similar to:  console
[('Least', 0.9102983474731445), ('Squares', 0.9024231433868408), ('inhibitor', 0.9012001752853394), ('cryo', 0.9001132249832153), ('Telluride', 0.8987306356430054), ('osmium', 0.895600438117981), ('pumped', 0.8945826888084412), ('ellipticals', 0.8942266702651978), ('polystyrene', 0.8937935829162598), ('GPR', 0.892507791519165)]
Most similar to:  spring
[('Miocene', 0.7379252910614014), ('late', 0.7371035814285278), ('Tertiary', 0.7284600138664246), ('north', 0.7229325175285339), ('south', 0.7056753635406494), ('winter', 0.7026774287223816), ('Andes', 0.6979003548622131), ('northern', 0.6944183

### Iter = 10

Going higher on the iterations seem to make things more accurate, especially looking at mathematics and technology. The same word in different forms gets higher in similarity compared to the base, when looking at the order instead of the value. So essentially this tells us that the base values were underfitting our data. With too many iterations however you would potentially be looking at iterations, this didn't feel like it started occuring yet.

In [17]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, iter=(10))
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 18:57:31,284 : INFO : collecting all words and their counts
2020-03-12 18:57:31,284 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 18:57:31,798 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 18:57:31,799 : INFO : Loading a fresh vocabulary
2020-03-12 18:57:31,871 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 18:57:31,872 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 18:57:31,935 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 18:57:31,937 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 18:57:31,937 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 18:57:31,991 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 18:57:31,992 : INFO :

Most similar to:  mathematics
[('discipline', 0.6966089010238647), ('mathematical', 0.684969961643219), ('physics', 0.6776703596115112), ('statistics', 0.6553087830543518), ('concepts', 0.6546766757965088), ('majors', 0.6530417203903198), ('engineering', 0.6441001296043396), ('teachers', 0.6436055898666382), ('science', 0.6399111747741699), ('teaching', 0.6306414604187012)]
Most similar to:  console
[('collector', 0.7510401606559753), ('Berger', 0.7244255542755127), ('880', 0.7084409594535828), ('orchestrated', 0.6967594623565674), ('12CO2', 0.6941226720809937), ('chirped', 0.6934460997581482), ('Multibeam', 0.6846871376037598), ('Intercontinental', 0.6845357418060303), ('SOFDI', 0.6831621527671814), ('photodetector', 0.6816697120666504)]
Most similar to:  spring
[('late', 0.672702431678772), ('winter', 0.6662265658378601), ('Miocene', 0.6605401039123535), ('meridional', 0.6553417444229126), ('fall', 0.641124963760376), ('north', 0.6266238689422607), ('Rio', 0.6243113279342651), ('Tert

In [29]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, iter=(15))
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:43:19,409 : INFO : collecting all words and their counts
2020-03-12 19:43:19,412 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:43:19,914 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:43:19,915 : INFO : Loading a fresh vocabulary
2020-03-12 19:43:19,984 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 19:43:19,984 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 19:43:20,047 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:43:20,049 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:43:20,050 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 19:43:20,104 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 19:43:20,104 : INFO :

2020-03-12 19:43:52,248 : INFO : EPOCH 12 - PROGRESS: at 40.09% examples, 1168144 words/s, in_qsize 5, out_qsize 0
2020-03-12 19:43:53,258 : INFO : EPOCH 12 - PROGRESS: at 80.83% examples, 1171156 words/s, in_qsize 5, out_qsize 0
2020-03-12 19:43:53,714 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-12 19:43:53,717 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-03-12 19:43:53,726 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-03-12 19:43:53,727 : INFO : EPOCH - 12 : training on 3681650 raw words (2916083 effective words) took 2.5s, 1175605 effective words/s
2020-03-12 19:43:54,739 : INFO : EPOCH 13 - PROGRESS: at 56.60% examples, 1637437 words/s, in_qsize 4, out_qsize 1
2020-03-12 19:43:55,752 : INFO : EPOCH 13 - PROGRESS: at 97.34% examples, 1404680 words/s, in_qsize 6, out_qsize 0
2020-03-12 19:43:55,800 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-12 19:43:55,807 : INFO : work

Most similar to:  mathematics
[('science', 0.668274998664856), ('mathematical', 0.6588805913925171), ('discipline', 0.6571842432022095), ('majors', 0.6442996263504028), ('teachers', 0.6276435256004333), ('engineering', 0.6237860918045044), ('algebra', 0.6195244193077087), ('teaching', 0.6185547709465027), ('college', 0.6162368059158325), ('literacy', 0.6118821501731873)]
Most similar to:  console
[('airplane', 0.7388802766799927), ('NSA', 0.6568073034286499), ('imager', 0.6563106179237366), ('SSV1', 0.6425600051879883), ('SOFDI', 0.6360782384872437), ('interferometer', 0.6358640193939209), ('FE', 0.635016918182373), ('Intercontinental', 0.6320165395736694), ('ICPMS', 0.6242125034332275), ('Ratio', 0.6241852045059204)]
Most similar to:  spring
[('austral', 0.6717689037322998), ('late', 0.6683459281921387), ('1996', 0.6580739617347717), ('winter', 0.6544943451881409), ('fall', 0.6388822793960571), ('Miocene', 0.6140884160995483), ('meridional', 0.6122885942459106), ('continent', 0.606081

### sg = 1

Changing to skip-gram definitely feels like it helped with regards to 3 of the words here, and their similar words. Console and spring seem to just be an extremely bad fit for this dataset, however for the other 3 words it was quick to decide on very similar words.

In [18]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, sg=1)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:00:20,308 : INFO : collecting all words and their counts
2020-03-12 19:00:20,309 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:00:20,823 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:00:20,824 : INFO : Loading a fresh vocabulary
2020-03-12 19:00:20,899 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 19:00:20,900 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 19:00:20,965 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:00:20,967 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:00:20,968 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 19:00:21,023 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 19:00:21,024 : INFO :

Most similar to:  mathematics
[('algebra', 0.7799115777015686), ('science', 0.7662696838378906), ('majoring', 0.7653417587280273), ('pedagogy', 0.7538089752197266), ('engineering', 0.7537627816200256), ('discipline', 0.752501904964447), ('preservice', 0.7514277696609497), ('humanities', 0.7514068484306335), ('fluency', 0.7431763410568237), ('introductory', 0.7357624173164368)]
Most similar to:  console
[('SOAs', 0.9342267513275146), ('susceptometer', 0.9162594676017761), ('containers', 0.914884626865387), ('reproducibly', 0.913713812828064), ('thicknesses', 0.912643313407898), ('emitter', 0.9124078154563904), ('micrographs', 0.9103856086730957), ('fluorine', 0.909922182559967), ('sends', 0.9097558259963989), ('chromatographs', 0.9091013073921204)]
Most similar to:  spring
[('austral', 0.8643998503684998), ('winter', 0.8076503872871399), ('interglacial', 0.7942143082618713), ('season', 0.7941508889198303), ('brief', 0.788597583770752), ('aboard', 0.7859319448471069), ('driest', 0.784740

### negative = 5, 20

In [19]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, negative=5)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:01:09,689 : INFO : collecting all words and their counts
2020-03-12 19:01:09,690 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:01:10,251 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:01:10,252 : INFO : Loading a fresh vocabulary
2020-03-12 19:01:10,325 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 19:01:10,326 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 19:01:10,387 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:01:10,389 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:01:10,390 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 19:01:10,439 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 19:01:10,440 : INFO :

Most similar to:  mathematics
[('science', 0.8211383819580078), ('engineering', 0.7983667850494385), ('discipline', 0.7597153186798096), ('sciences', 0.7477452754974365), ('humanities', 0.7420483827590942), ('profession', 0.7386701107025146), ('physics', 0.7385774254798889), ('majors', 0.7269306182861328), ('practice', 0.7155213356018066), ('mathematical', 0.7086160182952881)]
Most similar to:  console
[('chirped', 0.8900636434555054), ('spotting', 0.8849547505378723), ('inkjet', 0.8816371560096741), ('copolymerization', 0.8773975968360901), ('cabinetry', 0.8771728277206421), ('778404024', 0.8760333061218262), ('12CO2', 0.873612642288208), ('Oil', 0.8718723058700562), ('ACTION', 0.8714569211006165), ('carcinogen', 0.8702090978622437)]
Most similar to:  spring
[('north', 0.7399226427078247), ('1996', 0.7375333309173584), ('northern', 0.7340042591094971), ('late', 0.7328453660011292), ('Ma', 0.7225845456123352), ('Peru', 0.7179943323135376), ('13', 0.7164722681045532), ('west', 0.7158764

In [24]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, negative=20)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:12:17,238 : INFO : collecting all words and their counts
2020-03-12 19:12:17,242 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:12:17,744 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:12:17,746 : INFO : Loading a fresh vocabulary
2020-03-12 19:12:17,822 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 19:12:17,823 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 19:12:17,889 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:12:17,891 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:12:17,892 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 19:12:17,943 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 19:12:17,944 : INFO :

Most similar to:  mathematics
[('discipline', 0.749215841293335), ('engineering', 0.7461268305778503), ('majors', 0.7420068383216858), ('physics', 0.7044917941093445), ('literacy', 0.6914047002792358), ('concepts', 0.6779178380966187), ('elementary', 0.6689209938049316), ('science', 0.6668782234191895), ('careers', 0.6648756265640259), ('teachers', 0.65972900390625)]
Most similar to:  console
[('chirped', 0.9434792399406433), ('LHP', 0.9293609857559204), ('Talking', 0.92826247215271), ('upwind', 0.9241377115249634), ('compass', 0.9227144122123718), ('SEC', 0.9227045774459839), ('wax', 0.9225384593009949), ('travels', 0.9219479560852051), ('compose', 0.9209423661231995), ('BICEP', 0.9177446365356445)]
Most similar to:  spring
[('late', 0.7833123207092285), ('winter', 0.781053900718689), ('Miocene', 0.7645460963249207), ('Tertiary', 0.7595193386077881), ('downwelling', 0.7564253807067871), ('northwest', 0.7498323321342468), ('rainy', 0.7495518922805786), ('monsoon', 0.7434316873550415), 

### Window = 3, 10

On its own, windows doesn't seem more interesting than the others. 

In [30]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, window=3)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:59:38,814 : INFO : collecting all words and their counts
2020-03-12 19:59:38,816 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:59:39,325 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:59:39,326 : INFO : Loading a fresh vocabulary
2020-03-12 19:59:39,754 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 19:59:39,755 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 19:59:39,819 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:59:39,822 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:59:39,822 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 19:59:39,872 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 19:59:39,873 : INFO :

Most similar to:  mathematics
[('engineering', 0.8175453543663025), ('discipline', 0.8121922016143799), ('science', 0.7867183089256287), ('statistics', 0.7689366340637207), ('literacy', 0.748267650604248), ('physics', 0.745501697063446), ('nanotechnology', 0.744453489780426), ('practice', 0.7390854954719543), ('concepts', 0.7390586137771606), ('bioinformatics', 0.7379647493362427)]
Most similar to:  console
[('Tribolium', 0.9003188014030457), ('ns1', 0.8933048844337463), ('isomerase', 0.8924181461334229), ('stepping', 0.8923943042755127), ('tethering', 0.8914327621459961), ('tendencies', 0.8896902799606323), ('neon', 0.8886416554450989), ('dynein', 0.888285756111145), ('CF', 0.8881625533103943), ('Junctions', 0.8881528973579407)]
Most similar to:  spring
[('Spain', 0.8236149549484253), ('late', 0.7788383960723877), ('Seville', 0.7751665711402893), ('winter', 0.7727768421173096), ('1995', 0.7695266008377075), ('austral', 0.7557742595672607), ('Classic', 0.7542219161987305), ('civilizati

In [31]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, window=10)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 19:59:54,873 : INFO : collecting all words and their counts
2020-03-12 19:59:54,874 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 19:59:55,401 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 19:59:55,402 : INFO : Loading a fresh vocabulary
2020-03-12 19:59:55,476 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 19:59:55,476 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 19:59:55,538 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 19:59:55,541 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 19:59:55,541 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 19:59:55,591 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 19:59:55,591 : INFO :

Most similar to:  mathematics
[('science', 0.8216568231582642), ('engineering', 0.7666447162628174), ('majors', 0.7465415596961975), ('physics', 0.7336078882217407), ('elementary', 0.7253873348236084), ('discipline', 0.7172554731369019), ('sciences', 0.7157300114631653), ('mathematical', 0.709143877029419), ('teachers', 0.7050789594650269), ('mathematicians', 0.7021793127059937)]
Most similar to:  console
[('optimizes', 0.8795520067214966), ('chirped', 0.8729230761528015), ('Patten', 0.8725773692131042), ('Andreev', 0.8694102168083191), ('02140', 0.8669188022613525), ('Reforming', 0.8665715456008911), ('ns1', 0.8617780208587646), ('bombardment', 0.8598340749740601), ('295', 0.859593391418457), ('Double', 0.8564720153808594)]
Most similar to:  spring
[('winter', 0.7506990432739258), ('late', 0.7324632406234741), ('1996', 0.710605263710022), ('season', 0.6982637047767639), ('period', 0.6915218830108643), ('Southeast', 0.690123438835144), ('cruises', 0.6875318884849548), ('southwestern', 

# Results for 2a.

First off; spring was apparently a terrible word to choose since there is nothing that seems to be close to it, winter does pop up occasionally but for all settings it feels kind of random what it picks to be similar.

For me the most interesting variables is the skipgram. Incresing iterations in this scenario also felt valuable. 

### Comparing 2002 vs full abstracts

In [None]:
documents = []
dir_path = "awards_2002/"
root_dir = os.fsencode(dir_path)
for directory in os.listdir(root_dir):
    sub_directory = os.fsdecode(directory)
    current_path = dir_path + sub_directory + "/"
    
    for file in os.listdir(dir_path + sub_directory):
        with open(current_path + file, "r", encoding="utf-8", errors="ignore") as f:
            documents.append(f.read())
            
random.shuffle(documents)

In [36]:
def extract_abstracts()
with zipfile.ZipFile("abstracts.zip", "r") as file:
    file.extractall()
    print("done")

done


### 2b. elmo

In [65]:
#!pip install tensorflow==1.15
#!pip install "tensorflow_hub>=0.6.0"
#!pip3 install tensorflow_text==1.15

import tensorflow as tf
import tensorflow_hub as hub

elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3",signature="default", as_dict=True trainable=True)

Collecting tensorflow==1.15
  Using cached https://files.pythonhosted.org/packages/92/2b/e3af15221da9ff323521565fa3324b0d7c7c5b1d7a8ca66984c8d59cb0ce/tensorflow-1.15.0-cp37-cp37m-manylinux2010_x86_64.whl
Collecting gast==0.2.2 (from tensorflow==1.15)
Collecting tensorboard<1.16.0,>=1.15.0 (from tensorflow==1.15)
  Using cached https://files.pythonhosted.org/packages/1e/e9/d3d747a97f7188f48aa5eda486907f3b345cd409f0a0850468ba867db246/tensorboard-1.15.0-py3-none-any.whl
Collecting tensorflow-estimator==1.15.1 (from tensorflow==1.15)
  Using cached https://files.pythonhosted.org/packages/de/62/2ee9cd74c9fa2fa450877847ba560b260f5d0fb70ee0595203082dafcc9d/tensorflow_estimator-1.15.1-py2.py3-none-any.whl
Installing collected packages: gast, tensorboard, tensorflow-estimator, tensorflow
  Found existing installation: gast 0.3.1
    Uninstalling gast-0.3.1:
[31mERROR: Could not install packages due to an EnvironmentError: [Errno 13] Permission denied: 'INSTALLER'
Consider using the `--user` op

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [61]:
def elmo_vectors(sents):
    embeddings = elmo(sents, )["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(embeddings)

In [66]:
def elmo_vectors(sents):
    embeddings = elmo(sents, signature="default", as_dict=True)["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(embeddings)
        #sess.run(tf.tables_initializer())
        # return average of ELMo features as sentence vector
        #return sess.run(tf.reduce_mean(embeddings,1))

In [67]:
sents = """The game ended quickly .
He hunted some game for dinner .
A game of swans in the river .*
They played a game of chess .
They were in a baseball game .
She decided to eat som game .
Game can be found in forests .
Counterstrike is a popular game .
They didn't follow the game .
It was time to game .""".split('\n')

target = "game"

elmo_vecs = elmo_vectors(sents)
word_vecs = []
for i, sent in enumerate(sents):
    word_vecs.append(elmo_vecs[i][sent.split().index(target)])
    print("Sentence: ", sent)
    print("Vector for '%s:'" % target, word_vecs[-1])
    print()
    
print("Word vec size", word_vecs[0].shape)

TypeError: call() got an unexpected keyword argument 'signature'

In [68]:
from sklearn.metrics.pairwise import cosine_similarity

vec_size = word_vecs[0].shape[0]
print("Similarities between '%s' vector in sentences:" % target)
for i in range(1, len(sents)):
    print("Sent 0-%d:" % i, cosine_similarity(word_vecs[0].reshape((1,vec_size)), 
                                              word_vecs[i].reshape((1,vec_size)))[0][0])

NameError: name 'word_vecs' is not defined