In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import heapq, numpy as np
import random
#!pip3 install gensim
from gensim import corpora, models
import logging
import zipfile

# Assignment 3

In [2]:
documents = []
dir_path = "awards_2002/"
root_dir = os.fsencode(dir_path)
for directory in os.listdir(root_dir):
    sub_directory = os.fsdecode(directory)
    current_path = dir_path + sub_directory + "/"
    
    for file in os.listdir(dir_path + sub_directory):
        with open(current_path + file, "r", encoding="utf-8", errors="ignore") as f:
            documents.append(f.read())
            
random.shuffle(documents)

## Functions

In [3]:
def feature_names(vectorizer, matrix):
    features = tfidf_vectorizer.get_feature_names()
    for doc_i in range(5):
        print("\nDocument %d, top terms by TF-IDF" % doc_i)
        for term, score in sorted(list(zip(features,matrix.toarray()[doc_i])), key=lambda x:-x[1])[:5]:
            print("%.2f\t%s" % (score, term))

In [4]:
def print_clusters(matrix, clusters, n_keywords=10):
    max_cluster = 10
    for cluster in range(min(clusters), max_cluster):
        cluster_docs = [i for i, c in enumerate(clusters) if c == cluster]
        print("Cluster: %d (%d docs)" % (cluster, len(cluster_docs)))
        
        # Keep scores for top n terms
        new_matrix = np.zeros((len(cluster_docs), matrix.shape[1]))
        for cluster_i, doc_vec in enumerate(matrix[cluster_docs].toarray()):
            for idx, score in heapq.nlargest(n_keywords, enumerate(doc_vec), key=lambda x:x[1]):
                new_matrix[cluster_i][idx] = score

        # Aggregate scores for kept top terms
        keywords = heapq.nlargest(n_keywords, zip(new_matrix.sum(axis=0), features))
        print(', '.join([w for s,w in keywords]))
        print()

## 1a Experiment with KMeans and hierarchial clustering

In [5]:
tfidf_vectorizer = TfidfVectorizer(min_df=3, use_idf=True, sublinear_tf=True, max_df=0.1, max_features=100000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
features = tfidf_vectorizer.get_feature_names()
matrix_sample = tfidf_matrix[:1000]
km = KMeans(n_clusters=30, random_state=42, verbose=0)
km.fit(matrix_sample)
print_clusters(matrix_sample, km.labels_)

Cluster: 0 (61 docs)
genes, sexual, drosophila, reproductive, arabidopsis, signaling, gene, brain, pathways, stress

Cluster: 1 (36 docs)
cts, thermal, micro, plasma, liquid, tip, discharge, sapphire, transport, nanoscale

Cluster: 2 (50 docs)
arctic, co2, nitrogen, ice, nutrient, sea, ecosystem, variability, lakes, holocene

Cluster: 3 (26 docs)
scholars, 1536, scholarship, women, csems, csem, mathematics, scholarships, coe, sbc

Cluster: 4 (32 docs)
manufacturing, product, machining, video, task, movement, miniature, aluminum, powder, maintenance

Cluster: 5 (15 docs)
topology, manifolds, algebraic, symplectic, proposer, topological, manifold, quantization, denton, bpa

Cluster: 6 (60 docs)
teachers, school, teacher, esi, technicians, curriculum, rec, middle, districts, eec

Cluster: 7 (42 docs)
eia, tree, michigan, seeds, digital, webber, mammals, phylogeny, ecological, seed

Cluster: 8 (48 docs)
protein, cell, powder, ii, display, coating, porphyrin, charge, ray, circuits

Cluster:

In [6]:
tfidf_vectorizer = TfidfVectorizer(min_df=2, use_idf=True,max_df=0.1, max_features=100000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
features = tfidf_vectorizer.get_feature_names()
matrix_sample = tfidf_matrix[:1000]
z = linkage(matrix_sample.todense(), metric="cosine", method="complete")
clusters = fcluster(z, t=0.99, criterion="distance")
print_clusters(matrix_sample, clusters)

Cluster: 1 (5 docs)
meeting, vigre, steroids, sensory, abs, travel, nervous, integrative, archbold, comparative

Cluster: 2 (5 docs)
symposium, pachavis, shall, self, cardiovascular, nanoengineered, boston, heart, 20th, mrs

Cluster: 3 (5 docs)
ozone, oh, atmospheric, hynes, ho2, o3, buoys, mesosphere, tropospheric, springtime

Cluster: 4 (4 docs)
nitrogen, meliloti, isotope, herbivores, symbiosis, cycling, frozen, herbivore, grasslands, ucsb

Cluster: 5 (6 docs)
terrorist, attacks, americans, attack, wtc, terrorism, events, agenda, identity, outcomes

Cluster: 6 (7 docs)
co2, respiration, ecosystem, land, tundra, africa, desert, latitudinal, soil, elevated

Cluster: 7 (6 docs)
birds, mammals, variation, coat, plumage, genetically, endocrine, avian, craniodental, evolutionary

Cluster: 8 (4 docs)
wisconsin, madison, complexity, inference, conditional, worst, case, hardness, moment, vertebrate

Cluster: 9 (6 docs)
arctic, ice, nitrogen, bering, pack, iasc, basin, amerasian, opp, yanling

### 1a results

#### Fcluster

* With a min_df of 1 a lot of numbers started popping up and multiple clusters with the same terms
* Higher min_df doesn't do much more than potentially hide "high value" terms
* Mostly good terms with a decent setup
* Small cluster size (# of docs) - related to the t in fcluster
* Method to euclidian instead of complete didn't give much benefit

#### KMeans

* Large clusters
* More numbers in the clusters (Potentially useless, potentially good ie. genes)
* Seems dependant on the random_state
* Higher than 2 min_df just leads to clusters that are too broad

--

In my experimentation I feel like the end-result that was best was the most recent hierarchial clustering. For one, none of the clusters had numbers which I atleast saw as a larger negative.

That said it has it's pros and cons as well. The clusters are considerably smaller in size compared to the KMeans clusters, where these are about 10 or so docs in size, the KMeans clusters seem to be around 25 or so. This is both good and bad in the sense that a smaller cluster most likely means that it's more specific, but it might also mean that it just made multiple clusters that are very similar.

As such I'll go with the fcluster that I have above. It uses


linkage(metric="cosine", method="complete")

fcluster(t=0.99, criterion="distance")

Changing the method only gave very similar or sparse clusters. The t value just made the clusters even smaller, to the point where a doc was basically its own cluster. The min_df and max_df seemed to be pretty optimal at these values, as changing them too much just made clusters too broad or made them have too many "bad" terms.

### 1b label the clusters

Copypaste the cluster just in case since i shuffle the docs at the start of the notebook.

* Cluster: 1 (7 docs) - **Electrical engineering**

multimedia, compiler, smt, hmd, asic, processors, ieee, multiuser, adaptable, fpga



* Cluster: 2 (8 docs) - **Software verification**

hybrid, verification, embedded, software, qos, certification, stanford, rtl, checking, device



* Cluster: 3 (7 docs) - **Continental drifting / Seafloor geography**

continental, rift, rifting, spreading, seafloor, extension, pilcomayo, gulf, deposits, rio



* Cluster: 4 (9 docs) - **Geography statistics**

mantle, antarctic, seismic, gps, geodetic, stations, fault, puget, permanent, recoverable



* Cluster: 5 (10 docs) - **Seismic activity?**

detachment, uplift, floreana, magmatic, tectonic, cordillera, arc, strike, mafic, plateau



* Cluster: 6 (15 docs) - **Thermodynamics**

equations, ergodic, differential, probability, volterra, singularities, hyperbolic, oscillations, boundary, partial



* Cluster: 7 (4 docs) - **Linear algebra**

spaces, operators, teichmueller, functions, operator, metric, hankel, toeplitz, green, holomorphic



* Cluster: 8 (13 docs) - **Algebraic topology**

manifolds, homotopy, dm, geometric, compact, algebras, surfaces, variables, ring, operators



* Cluster: 9 (4 docs) - **Deforestation & poor countries**

migrants, semantic, tenure, real, compositionality, semantics, migration, syntactic, web, deforestation

### 1c pick out 2 good and 2 bad clusters

Clusters 7 & 8 are both good in my opinion.

7 is a little small in size, however the terms are almost all related and for example the 3 names all correspond to functions related to algebra, and obviously functions are also in the picture.

8 is also grouped in a similar way, where the terms can all be related back to topology, where for example homotpoty and manifolds are both main branches of topology.

As for bad clusters, from these 10 I'd say it would be cluster 9 and cluster 3. (5 by extension)

Cluster 9 is simply too hard to interpret. It has a mix of very different terms that are hard to group together. It could be correlated to the Amazon rainforest and the deforestation there but where do semantics come into the picture there.

Cluster 3 in turn isn't that bad, however I feel like its too similar to that of cluster 5. THey're both related to seismic activity, and it's essentially just one being the seafloor, the other being mountains.

## 1d LDA modelling

In [7]:
tfidf2_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in documents]

dictionary = corpora.Dictionary(tokenized_text)
lda_corpus = [dictionary.doc2bow(text) for text in tokenized_text]
lda_model = models.LdaModel(lda_corpus, id2word=dictionary, num_topics=10)

In [8]:
# Inspect topics
for i, topic in lda_model.show_topics(num_words=50, formatted=False):
    print("Topic", i)
    printed_terms = 0
    for term, score in topic:
        if printed_terms >= 10:
            break
        elif term in "Award Investigator research this these will that the This of OF and to for in or The is be may an a with at are on by as from can".split():
            continue
        printed_terms += 1
        print("%.4f\t%s" % (score,term))
    print()

Topic 0
0.0081	CHE
0.0061	Chemistry
0.0057	Program
0.0054	University
0.0052	Date
0.0052	NSF
0.0049	Estimated
0.0042	current
0.0038	Principal
0.0037	CHEMISTRY

Topic 1
0.0073	Date
0.0065	Program
0.0065	NSF
0.0065	Estimated
0.0062	Principal
0.0060	current
0.0056	2002
0.0039	data
0.0036	Title
0.0036	Applictn

Topic 2
0.0094	NSF
0.0085	current
0.0085	Program
0.0079	Principal
0.0075	Date
0.0074	Estimated
0.0071	students
0.0069	2002
0.0059	University
0.0042	Prgm

Topic 3
0.0047	NSF
0.0046	current
0.0045	Date
0.0044	Estimated
0.0041	Program
0.0039	Principal
0.0033	project
0.0033	2002
0.0025	science
0.0025	have

Topic 4
0.0058	NSF
0.0057	Program
0.0053	Estimated
0.0052	Date
0.0043	current
0.0039	2002
0.0038	Principal
0.0030	Sponsor
0.0029	File
0.0028	Ref

Topic 5
0.0082	CSEMS
0.0074	students
0.0063	1536
0.0053	DUE
0.0044	scholarships
0.0043	CS
0.0041	Computer
0.0039	program
0.0037	MATH
0.0037	scholarship

Topic 6
0.0070	NSF
0.0069	Program
0.0065	Estimated
0.0063	Date
0.0058	2002
0.0053	current

In this case a lot of the topics seem to be very similar if not almost identical, however because of how LDA is intended to work this does make some sense. Since this modelling is designed so that a document can fall under multiple topics.

After removing some stopwords and also removing some terms that occured in every listed topic, you can see that there are some differences between the topics.

## 2. Word vectors

### 2a word2vec

In [9]:
seed_words = ["mathematics", "console", "spring", "technology", "communication"]
tfidf2_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in documents]

### Base

Nothing unusual here, I'm not surprised to see that the base settings are decent. Interestingly technology is only 0.77 similar to technologies.

In [10]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size, window, min_count, iter, sg, negative
vectors = models.Word2Vec(tokenized_text)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:44:47,691 : INFO : collecting all words and their counts
2020-03-12 21:44:47,694 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:44:48,305 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:44:48,306 : INFO : Loading a fresh vocabulary
2020-03-12 21:44:48,395 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:44:48,396 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:44:48,470 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:44:48,474 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:44:48,475 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:44:48,537 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:44:48,538 : INFO :

Most similar to:  mathematics
[('engineering', 0.813104510307312), ('science', 0.8105961084365845), ('discipline', 0.7897181510925293), ('physics', 0.7478721141815186), ('sciences', 0.7468525767326355), ('practice', 0.7271543145179749), ('disciplines', 0.7233316898345947), ('majors', 0.7194168567657471), ('physicists', 0.7163714170455933), ('concepts', 0.7058789134025574)]
Most similar to:  console
[('upwind', 0.9170166850090027), ('seat', 0.8907194137573242), ('chirped', 0.8875600099563599), ('ROV', 0.8860045671463013), ('alternately', 0.8854444622993469), ('reproducibly', 0.8848638534545898), ('gabbro', 0.8848577737808228), ('HCl', 0.8817570805549622), ('rectangular', 0.8816057443618774), ('GDP', 0.88057541847229)]
Most similar to:  spring
[('Pretoria', 0.7438228130340576), ('meridional', 0.7216531038284302), ('Sept', 0.7069242000579834), ('Toronto', 0.6955479383468628), ('03', 0.6943494081497192), ('Oct', 0.6931256055831909), ('23', 0.6827120780944824), ('late', 0.6820248365402222),

### Size = 5, Size = 200

A low size is a huge detriment too the quality of the model. It underfits the model and it then thinks everything is very similar to the given word.

A larger value than the default 100 doesn't necessarily change much. At least in this case. The order of words and such that are similar does change a little, but it does not seem to give any proper quantitative value. Maybe with a larger corpus 

In [11]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, size=5)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:45:03,715 : INFO : collecting all words and their counts
2020-03-12 21:45:03,718 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:45:04,255 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:45:04,256 : INFO : Loading a fresh vocabulary
2020-03-12 21:45:04,575 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:45:04,576 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:45:04,650 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:45:04,653 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:45:04,654 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:45:04,712 : INFO : estimated required memory for 27041 words and 5 dimensions: 14602140 bytes
2020-03-12 21:45:04,713 : INFO : r

Most similar to:  mathematics
[('degree', 0.9968880414962769), ('awareness', 0.9961378574371338), ('science', 0.9955247640609741), ('disciplines', 0.9952576160430908), ('emphasis', 0.9933594465255737), ('public', 0.9914232492446899), ('industrial', 0.9895184636116028), ('literacy', 0.9889904260635376), ('recruitment', 0.9889792203903198), ('expose', 0.9889682531356812)]
Most similar to:  console
[('inflexible', 0.9982613325119019), ('EBSD', 0.9980884790420532), ('reflectors', 0.9975018501281738), ('tional', 0.9973971247673035), ('HCl', 0.9973499178886414), ('duplex', 0.9966325759887695), ('dbi', 0.9961663484573364), ('SAR', 0.9958171248435974), ('aptamers', 0.995576798915863), ('1664', 0.9949343204498291)]
Most similar to:  spring
[('yr', 0.9788516163825989), ('40', 0.9675351977348328), ('percent', 0.964647650718689), ('Koenker', 0.962959885597229), ('hostplant', 0.9621850848197937), ('Cretaceous', 0.9587557315826416), ('late', 0.9558874368667603), ('Miocene', 0.9533337950706482), ('Su

In [12]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, size=200)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:45:17,423 : INFO : collecting all words and their counts
2020-03-12 21:45:17,424 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:45:17,966 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:45:17,967 : INFO : Loading a fresh vocabulary
2020-03-12 21:45:18,051 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:45:18,051 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:45:18,117 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:45:18,120 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:45:18,120 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:45:18,178 : INFO : estimated required memory for 27041 words and 200 dimensions: 56786100 bytes
2020-03-12 21:45:18,179 : INFO :

Most similar to:  mathematics
[('engineering', 0.8135855793952942), ('science', 0.7961062788963318), ('discipline', 0.7698987722396851), ('physics', 0.7622895240783691), ('sciences', 0.7398695945739746), ('majors', 0.7313045263290405), ('practice', 0.7287845611572266), ('mathematicians', 0.7127530574798584), ('concepts', 0.7096660137176514), ('literacy', 0.7071012258529663)]
Most similar to:  console
[('phot1', 0.8887844085693359), ('metaphase', 0.8727130889892578), ('Moon', 0.8691428899765015), ('effluent', 0.8685858845710754), ('quenched', 0.8675808906555176), ('sizing', 0.8673232793807983), ('scFISH', 0.8638952970504761), ('NbTi', 0.8633050918579102), ('label', 0.8616794943809509), ('leech', 0.8604432940483093)]
Most similar to:  spring
[('late', 0.8091793060302734), ('winter', 0.7742407321929932), ('north', 0.7732067108154297), ('Miocene', 0.7693005800247192), ('Tertiary', 0.7668789625167847), ('Peru', 0.750515341758728), ('Located', 0.7456796765327454), ('meridional', 0.7448593378

### Min_count=2,10

Going too high on min_count simply causes situations where there are no words that are similar.

For words similar to mathematics it does not make a big difference, since they seem to occur so often in this scope of documents. For words similar to spring and console however, you quite quickly lose a lot of words from the list, and it just fills them with new ones. For whatever reason this does increase the similarity compared to base though.

With a high count that is too high, in this case it just doesn't know some words. In my case console doesn't exist with a mincount that is 7 or higher. At this point console and similarities simply feels random.

In [13]:
logging.basicConfig()
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, min_count=2)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:45:38,691 : INFO : collecting all words and their counts
2020-03-12 21:45:38,693 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:45:39,234 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:45:39,235 : INFO : Loading a fresh vocabulary
2020-03-12 21:45:39,582 : INFO : effective_min_count=2 retains 53696 unique words (47% of original 113911, drops 60215)
2020-03-12 21:45:39,583 : INFO : effective_min_count=2 leaves 3621435 word corpus (98% of original 3681650, drops 60215)
2020-03-12 21:45:39,714 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:45:39,716 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:45:39,717 : INFO : downsampling leaves estimated 2993497 word corpus (82.7% of prior 3621435)
2020-03-12 21:45:39,837 : INFO : estimated required memory for 53696 words and 100 dimensions: 69804800 bytes
2020-03-12 21:45:39,838 : INFO : 

Most similar to:  mathematics
[('science', 0.8264923691749573), ('engineering', 0.8224689960479736), ('discipline', 0.7762817144393921), ('sciences', 0.7714868783950806), ('physics', 0.7629547119140625), ('practice', 0.7457550764083862), ('astronomy', 0.7379782199859619), ('majors', 0.72240149974823), ('literacy', 0.7207099795341492), ('careers', 0.7131826877593994)]
Most similar to:  console
[('disjunction', 0.8920546770095825), ('MIKE', 0.8914703130722046), ('slowing', 0.89117830991745), ('cabinetry', 0.8909517526626587), ('chirped', 0.8897838592529297), ('cloacal', 0.88832026720047), ('remnant', 0.886172354221344), ('Willy', 0.8849791288375854), ('bivalve', 0.8842459917068481), ('telechelic', 0.8838270306587219)]
Most similar to:  spring
[('late', 0.7426997423171997), ('Lahore', 0.73345947265625), ('Miocene', 0.7269890308380127), ('fall', 0.7182335257530212), ('winter', 0.7125062942504883), ('north', 0.7117009162902832), ('1996', 0.7115656733512878), ('Peru', 0.7019992470741272), ('

In [14]:
logging.basicConfig()
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, min_count=6)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:46:00,849 : INFO : collecting all words and their counts
2020-03-12 21:46:00,850 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:46:01,410 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:46:01,411 : INFO : Loading a fresh vocabulary
2020-03-12 21:46:01,729 : INFO : effective_min_count=6 retains 24178 unique words (21% of original 113911, drops 89733)
2020-03-12 21:46:01,729 : INFO : effective_min_count=6 leaves 3538274 word corpus (96% of original 3681650, drops 143376)
2020-03-12 21:46:01,797 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:46:01,800 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:46:01,801 : INFO : downsampling leaves estimated 2900621 word corpus (82.0% of prior 3538274)
2020-03-12 21:46:01,858 : INFO : estimated required memory for 24178 words and 100 dimensions: 31431400 bytes
2020-03-12 21:46:01,859 : INFO :

Most similar to:  mathematics
[('engineering', 0.7995164394378662), ('science', 0.7989711761474609), ('discipline', 0.758630633354187), ('practice', 0.7436290383338928), ('sciences', 0.7373279929161072), ('physics', 0.7358354330062866), ('majors', 0.7308528423309326), ('concepts', 0.7071372866630554), ('bioengineering', 0.7014557123184204), ('nanotechnology', 0.699437141418457)]
Most similar to:  console
[('apartment', 0.9072535037994385), ('fluorometer', 0.8948256969451904), ('Lynntech', 0.888268232345581), ('microanalysis', 0.8870493173599243), ('Soboyejo', 0.8870485424995422), ('calibrating', 0.8837348222732544), ('USER', 0.8793085813522339), ('translating', 0.8791844844818115), ('DISSEMINATION', 0.8790374994277954), ('Multibeam', 0.8785663843154907)]
Most similar to:  spring
[('late', 0.7465955018997192), ('coast', 0.6969091892242432), ('Spain', 0.6968727707862854), ('Peru', 0.6955786347389221), ('summers', 0.6930745840072632), ('season', 0.689520537853241), ('24', 0.68950229883193

### Iter = 10

Going higher on the iterations seem to make things more accurate, especially looking at mathematics and technology. The same word in different forms gets higher in similarity compared to the base, when looking at the order instead of the value. So essentially this tells us that the base values were underfitting our data. With too many iterations however you would potentially be looking at iterations, this didn't feel like it started occuring yet.

In [15]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, iter=(10))
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:46:17,851 : INFO : collecting all words and their counts
2020-03-12 21:46:17,852 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:46:18,426 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:46:18,428 : INFO : Loading a fresh vocabulary
2020-03-12 21:46:18,737 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:46:18,738 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:46:18,804 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:46:18,807 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:46:18,807 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:46:18,866 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:46:18,866 : INFO :

Most similar to:  mathematics
[('discipline', 0.6984312534332275), ('majors', 0.6910269856452942), ('physics', 0.675421953201294), ('mathematical', 0.6634280681610107), ('science', 0.6609295010566711), ('engineering', 0.6540749669075012), ('statistics', 0.639667809009552), ('college', 0.633525013923645), ('teachers', 0.6284337043762207), ('algebra', 0.6130471229553223)]
Most similar to:  console
[('chirped', 0.7431447505950928), ('melter', 0.7244747877120972), ('alternately', 0.7210508584976196), ('McCorriston', 0.7093610167503357), ('compass', 0.7071682214736938), ('carcinogen', 0.7053254842758179), ('compose', 0.7018460035324097), ('electrospray', 0.7004756927490234), ('collector', 0.6969977617263794), ('interferometer', 0.696153998374939)]
Most similar to:  spring
[('late', 0.7416752576828003), ('Miocene', 0.7361701726913452), ('winter', 0.6955665946006775), ('rainy', 0.6827518939971924), ('outbreak', 0.6766948103904724), ('Holocene', 0.6724553108215332), ('1996', 0.6696754693984985

In [16]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, iter=(15))
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:46:47,459 : INFO : collecting all words and their counts
2020-03-12 21:46:47,460 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:46:47,988 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:46:47,988 : INFO : Loading a fresh vocabulary
2020-03-12 21:46:48,066 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:46:48,066 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:46:48,140 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:46:48,143 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:46:48,143 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:46:48,200 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:46:48,200 : INFO :

2020-03-12 21:47:19,682 : INFO : EPOCH 12 - PROGRESS: at 44.17% examples, 1272334 words/s, in_qsize 5, out_qsize 0
2020-03-12 21:47:20,692 : INFO : EPOCH 12 - PROGRESS: at 85.47% examples, 1230910 words/s, in_qsize 5, out_qsize 0
2020-03-12 21:47:21,030 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-12 21:47:21,038 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-03-12 21:47:21,043 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-03-12 21:47:21,044 : INFO : EPOCH - 12 : training on 3681650 raw words (2916749 effective words) took 2.4s, 1227907 effective words/s
2020-03-12 21:47:22,052 : INFO : EPOCH 13 - PROGRESS: at 39.88% examples, 1163038 words/s, in_qsize 5, out_qsize 0
2020-03-12 21:47:23,069 : INFO : EPOCH 13 - PROGRESS: at 79.15% examples, 1142020 words/s, in_qsize 5, out_qsize 0
2020-03-12 21:47:23,589 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-12 21:47:23,597 : INFO : work

Most similar to:  mathematics
[('discipline', 0.6427435874938965), ('mathematical', 0.6408028602600098), ('majors', 0.6399889588356018), ('teaching', 0.6306318044662476), ('engineering', 0.6289055347442627), ('physics', 0.6157816648483276), ('science', 0.6121276617050171), ('teachers', 0.6106493473052979), ('algebra', 0.5951567888259888), ('college', 0.5874304175376892)]
Most similar to:  console
[('airplane', 0.6556798815727234), ('collector', 0.6512964963912964), ('NSA', 0.6271592378616333), ('LC', 0.6028128862380981), ('inhibitor', 0.6010703444480896), ('elliptical', 0.5919283628463745), ('880', 0.5847868323326111), ('chirped', 0.5824361443519592), ('saltwater', 0.5803298950195312), ('kN', 0.5766463279724121)]
Most similar to:  spring
[('winter', 0.6461237072944641), ('dives', 0.64360511302948), ('austral', 0.6333377361297607), ('aboard', 0.6298508048057556), ('Located', 0.6257637739181519), ('Peninsula', 0.6229163408279419), ('fall', 0.6150477528572083), ('Sonoran', 0.6133734583854

### sg = 1

Changing to skip-gram definitely feels like it helped with regards to 3 of the words here, and their similar words. Console and spring seem to just be an extremely bad fit for this dataset, however for the other 3 words it was quick to decide on very similar words.

In [17]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, sg=1)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:47:28,006 : INFO : collecting all words and their counts
2020-03-12 21:47:28,007 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:47:28,535 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:47:28,536 : INFO : Loading a fresh vocabulary
2020-03-12 21:47:28,831 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:47:28,832 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:47:28,910 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:47:28,912 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:47:28,913 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:47:28,969 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:47:28,970 : INFO :

Most similar to:  mathematics
[('literacy', 0.7666188478469849), ('algebra', 0.7601535320281982), ('science', 0.7597874402999878), ('majoring', 0.7580490708351135), ('pedagogy', 0.7315536737442017), ('humanities', 0.7310315370559692), ('discipline', 0.7301062941551208), ('profession', 0.7285811901092529), ('introductory', 0.7223597764968872), ('collegiate', 0.7176905274391174)]
Most similar to:  console
[('attachments', 0.9332625269889832), ('interferometers', 0.9270851612091064), ('EBSD', 0.9261435270309448), ('melter', 0.9259473085403442), ('FFS', 0.9237853288650513), ('installing', 0.9235818982124329), ('ankle', 0.9231774210929871), ('PSA', 0.9216408729553223), ('CL', 0.9180670976638794), ('flex', 0.9177895784378052)]
Most similar to:  spring
[('austral', 0.8871985673904419), ('winter', 0.8050752878189087), ('dives', 0.7881208658218384), ('interglacial', 0.7875773906707764), ('season', 0.7783920764923096), ('BATS', 0.7748808264732361), ('downwelling', 0.7726856470108032), ('brief', 

### negative = 5, 20

In [18]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, negative=5)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:48:18,141 : INFO : collecting all words and their counts
2020-03-12 21:48:18,142 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:48:18,681 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:48:18,682 : INFO : Loading a fresh vocabulary
2020-03-12 21:48:18,765 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:48:18,766 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:48:18,835 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:48:18,838 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:48:18,839 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:48:18,902 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:48:18,903 : INFO :

Most similar to:  mathematics
[('science', 0.8271150588989258), ('engineering', 0.7937756776809692), ('physics', 0.7783791422843933), ('discipline', 0.7678360939025879), ('sciences', 0.7533905506134033), ('practice', 0.7275657057762146), ('majors', 0.7274656295776367), ('nanotechnology', 0.7078384160995483), ('concepts', 0.7015020251274109), ('teachers', 0.6945022344589233)]
Most similar to:  console
[('enlarged', 0.8307855129241943), ('airplane', 0.8245298266410828), ('peroxide', 0.8241102695465088), ('adversary', 0.810947597026825), ('acrylate', 0.8101605176925659), ('intermediary', 0.8079272508621216), ('PEB', 0.8065509796142578), ('en', 0.8046492338180542), ('GDP', 0.800044596195221), ('chirped', 0.7992343902587891)]
Most similar to:  spring
[('meridional', 0.7816545963287354), ('ha', 0.7276901006698608), ('Peru', 0.7264009118080139), ('late', 0.7252197265625), ('42', 0.718497097492218), ('north', 0.7100715041160583), ('Tertiary', 0.7027775049209595), ('southern', 0.702316343784332

In [19]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, negative=20)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:48:35,064 : INFO : collecting all words and their counts
2020-03-12 21:48:35,064 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:48:35,594 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:48:35,595 : INFO : Loading a fresh vocabulary
2020-03-12 21:48:35,919 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:48:35,920 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:48:35,986 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:48:35,989 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:48:35,989 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:48:36,045 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:48:36,046 : INFO :

Most similar to:  mathematics
[('discipline', 0.7327420711517334), ('engineering', 0.7312050461769104), ('majors', 0.7304700613021851), ('concepts', 0.6940078139305115), ('physics', 0.6881793737411499), ('science', 0.6670185923576355), ('literacy', 0.6650533676147461), ('careers', 0.6524648666381836), ('instruction', 0.649006187915802), ('teachers', 0.6455676555633545)]
Most similar to:  console
[('ankle', 0.9394474029541016), ('airfoil', 0.9373698830604553), ('chirped', 0.9349576830863953), ('NbTi', 0.9348865747451782), ('Achilles', 0.9297915101051331), ('attenuated', 0.9260783791542053), ('modulator', 0.9256703853607178), ('undocumented', 0.9254209995269775), ('sorbent', 0.925312340259552), ('SEC', 0.9253010153770447)]
Most similar to:  spring
[('austral', 0.7583223581314087), ('creeping', 0.7536946535110474), ('winter', 0.7472171783447266), ('late', 0.7462772130966187), ('downwelling', 0.7435829639434814), ('inception', 0.7423288226127625), ('BC', 0.7378535866737366), ('Tertiary', 0

### Window = 3, 10

On its own, windows doesn't seem more interesting than the others. 

In [20]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, window=3)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:49:07,308 : INFO : collecting all words and their counts
2020-03-12 21:49:07,309 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:49:07,847 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:49:07,848 : INFO : Loading a fresh vocabulary
2020-03-12 21:49:07,929 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:49:07,929 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:49:07,992 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:49:07,994 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:49:07,995 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:49:08,051 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:49:08,052 : INFO :

Most similar to:  mathematics
[('engineering', 0.7921909093856812), ('science', 0.7879029512405396), ('discipline', 0.7671724557876587), ('nanotechnology', 0.7544541358947754), ('literacy', 0.7493030428886414), ('practice', 0.7444681525230408), ('sciences', 0.7409480810165405), ('statistics', 0.739732563495636), ('physics', 0.7288577556610107), ('bioinformatics', 0.7244324088096619)]
Most similar to:  console
[('interferometer', 0.8811743259429932), ('inhibitor', 0.8769178986549377), ('epithelium', 0.8633572459220886), ('fibrosis', 0.8618849515914917), ('gabbro', 0.8556831479072571), ('Aspect', 0.8556816577911377), ('peroxide', 0.8534140586853027), ('photodetector', 0.8525289297103882), ('sonoluminescence', 0.852136492729187), ('SSV', 0.8492079973220825)]
Most similar to:  spring
[('late', 0.7845708131790161), ('winter', 0.7706813216209412), ('interglacial', 0.7648404240608215), ('Tertiary', 0.7565097808837891), ('20th', 0.7413555383682251), ('24', 0.7385187149047852), ('shrublands', 0

In [21]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size (int), window(int), min_count(int), iter(int), sg, negative
vectors = models.Word2Vec(tokenized_text, window=10)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:49:23,334 : INFO : collecting all words and their counts
2020-03-12 21:49:23,334 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:49:23,887 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:49:23,888 : INFO : Loading a fresh vocabulary
2020-03-12 21:49:24,189 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:49:24,190 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:49:24,261 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:49:24,264 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:49:24,265 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:49:24,321 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:49:24,322 : INFO :

Most similar to:  mathematics
[('science', 0.7861557602882385), ('discipline', 0.765587329864502), ('elementary', 0.7522879838943481), ('engineering', 0.7503076791763306), ('physics', 0.73476642370224), ('majors', 0.7052773237228394), ('sciences', 0.7045080661773682), ('mathematical', 0.6979876756668091), ('physicists', 0.6924373507499695), ('subjects', 0.6894881725311279)]
Most similar to:  console
[('fluorometer', 0.855572521686554), ('AES', 0.8468203544616699), ('12CO2', 0.8442041873931885), ('cabinetry', 0.8427667021751404), ('apartment', 0.8338295221328735), ('quadrapole', 0.8321656584739685), ('couplers', 0.8253879547119141), ('collector', 0.824892520904541), ('Actuator', 0.8238655924797058), ('GaMnN', 0.8224331736564636)]
Most similar to:  spring
[('meridional', 0.7854409217834473), ('unmapped', 0.7782730460166931), ('cruises', 0.7611277103424072), ('winter', 0.759924054145813), ('late', 0.7491304278373718), ('1999', 0.7395031452178955), ('monsoon', 0.7293879985809326), ('Peru',

# Results for 2a.

First off; spring was apparently a terrible word to choose since there is nothing that seems to be close to it, winter does pop up occasionally but for all settings it feels kind of random what it picks to be similar.

For me the most interesting variables is the skipgram. Incresing iterations in this scenario also felt valuable. 

### Comparing 2002 vs full abstracts

In [22]:
def extract_abstracts():  
    with zipfile.ZipFile("abstracts.zip", "r") as file:
        file.extractall()
        print("done")

In [23]:
def find_all_files(dir):
    
    paths =  []
    
    for dirpath, dirnames, filenames in os.walk(dir):
        for filename in [f for f in filenames if f.endswith(".txt")]:
            
            path = os.path.join(dirpath, filename)
            paths.append(path)
            
    return paths

In [24]:
paths = find_all_files("abstracts/")

In [25]:
full_docu = []

for path in paths:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        full_docu.append(f.read())
        
random.shuffle(full_docu)

In [26]:
seed_words = ["mathematics", "console", "spring", "technology", "communication"]
tfidf_vectorizer = TfidfVectorizer()
word_tokenizer = tfidf2_vectorizer.build_tokenizer()
tokenized_text_base = [word_tokenizer(doc) for doc in documents]
tokenized_text_full = [word_tokenizer(doc) for doc in full_docu]

In [27]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size, window, min_count, iter, sg, negative
vectors = models.Word2Vec(tokenized_text_base)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:51:42,833 : INFO : collecting all words and their counts
2020-03-12 21:51:42,837 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:51:43,905 : INFO : collected 113911 word types from a corpus of 3681650 raw words and 9923 sentences
2020-03-12 21:51:43,906 : INFO : Loading a fresh vocabulary
2020-03-12 21:51:43,998 : INFO : effective_min_count=5 retains 27041 unique words (23% of original 113911, drops 86870)
2020-03-12 21:51:43,998 : INFO : effective_min_count=5 leaves 3552589 word corpus (96% of original 3681650, drops 129061)
2020-03-12 21:51:44,062 : INFO : deleting the raw counts dictionary of 113911 items
2020-03-12 21:51:44,065 : INFO : sample=0.001 downsamples 54 most-common words
2020-03-12 21:51:44,066 : INFO : downsampling leaves estimated 2916612 word corpus (82.1% of prior 3552589)
2020-03-12 21:51:44,121 : INFO : estimated required memory for 27041 words and 100 dimensions: 35153300 bytes
2020-03-12 21:51:44,121 : INFO :

Most similar to:  mathematics
[('science', 0.8101097345352173), ('engineering', 0.8010798692703247), ('discipline', 0.7948856353759766), ('physics', 0.747256875038147), ('practice', 0.7466083765029907), ('sciences', 0.7356523871421814), ('majors', 0.7277352809906006), ('literacy', 0.7165915966033936), ('concepts', 0.703219473361969), ('mathematical', 0.6995234489440918)]
Most similar to:  console
[('chirped', 0.9077774286270142), ('0017', 0.8973997831344604), ('AES', 0.8972253799438477), ('steerable', 0.8968645334243774), ('kN', 0.8956530690193176), ('NbTi', 0.8947874307632446), ('Eastmark', 0.8940984606742859), ('microtomography', 0.8940542936325073), ('01063', 0.8935415148735046), ('PB', 0.8926355838775635)]
Most similar to:  spring
[('late', 0.7497243881225586), ('interglacial', 0.744472861289978), ('austral', 0.7415764331817627), ('winter', 0.7167822122573853), ('inception', 0.7161902189254761), ('meridional', 0.7069535851478577), ('Miocene', 0.7060791850090027), ('23', 0.705158233

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# test with size, window, min_count, iter, sg, negative
vectors = models.Word2Vec(tokenized_text_full)
print("Most similar to: ", seed_words[0])
print(vectors.wv.most_similar(seed_words[0]))
print("Most similar to: ", seed_words[1])
print(vectors.wv.most_similar(seed_words[1]))
print("Most similar to: ", seed_words[2])
print(vectors.wv.most_similar(seed_words[2]))
print("Most similar to: ", seed_words[3])
print(vectors.wv.most_similar(seed_words[3]))
print("Most similar to: ", seed_words[4])
print(vectors.wv.most_similar(seed_words[4]))

2020-03-12 21:52:01,585 : INFO : collecting all words and their counts
2020-03-12 21:52:01,586 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-12 21:52:02,420 : INFO : PROGRESS: at sentence #10000, processed 3096593 words, keeping 110456 word types
2020-03-12 21:52:03,277 : INFO : PROGRESS: at sentence #20000, processed 6188649 words, keeping 172944 word types
2020-03-12 21:52:04,263 : INFO : PROGRESS: at sentence #30000, processed 9295825 words, keeping 226132 word types
2020-03-12 21:52:05,450 : INFO : PROGRESS: at sentence #40000, processed 12380992 words, keeping 274592 word types
2020-03-12 21:52:06,497 : INFO : PROGRESS: at sentence #50000, processed 15482098 words, keeping 319960 word types
2020-03-12 21:52:07,875 : INFO : PROGRESS: at sentence #60000, processed 18571217 words, keeping 362506 word types
2020-03-12 21:52:09,155 : INFO : PROGRESS: at sentence #70000, processed 21671727 words, keeping 403470 word types
2020-03-12 21:52:10,382 : IN

2020-03-12 21:54:30,558 : INFO : EPOCH 1 - PROGRESS: at 76.66% examples, 500376 words/s, in_qsize 6, out_qsize 2
2020-03-12 21:54:31,571 : INFO : EPOCH 1 - PROGRESS: at 77.93% examples, 498738 words/s, in_qsize 3, out_qsize 2
2020-03-12 21:54:32,619 : INFO : EPOCH 1 - PROGRESS: at 79.28% examples, 497396 words/s, in_qsize 3, out_qsize 2
2020-03-12 21:54:33,635 : INFO : EPOCH 1 - PROGRESS: at 80.52% examples, 495658 words/s, in_qsize 3, out_qsize 1
2020-03-12 21:54:34,635 : INFO : EPOCH 1 - PROGRESS: at 81.70% examples, 493701 words/s, in_qsize 5, out_qsize 2
2020-03-12 21:54:35,680 : INFO : EPOCH 1 - PROGRESS: at 82.87% examples, 491431 words/s, in_qsize 5, out_qsize 3
2020-03-12 21:54:36,684 : INFO : EPOCH 1 - PROGRESS: at 84.05% examples, 489586 words/s, in_qsize 5, out_qsize 0
2020-03-12 21:54:37,690 : INFO : EPOCH 1 - PROGRESS: at 85.12% examples, 487248 words/s, in_qsize 5, out_qsize 0
2020-03-12 21:54:38,725 : INFO : EPOCH 1 - PROGRESS: at 86.31% examples, 485284 words/s, in_qsiz

### Results of 2002 vs full comparison:

The model has clearly learnt a lot more with that much more data. It's hard to quantify exactly at what amount of documents it did learn this much, however in for example the spring example, it clearly recognizes that it is a season and that the other seasons are similar to it.

Similarly the other similarities also got, what I feel, a lot more accurate.

### 2b. elmo

In [None]:
#!pip install tensorflow==1.15
#!pip install "tensorflow_hub>=0.6.0"
#!pip3 install tensorflow_text==1.15

import tensorflow as tf
import tensorflow_hub as hub

elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3",signature="default", as_dict=True trainable=True)

In [None]:
def elmo_vectors(sents):
    embeddings = elmo(sents, )["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(embeddings)

In [None]:
def elmo_vectors(sents):
    embeddings = elmo(sents, signature="default", as_dict=True)["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(embeddings)
        #sess.run(tf.tables_initializer())
        # return average of ELMo features as sentence vector
        #return sess.run(tf.reduce_mean(embeddings,1))

In [None]:
sents = """The game ended quickly .
He hunted some game for dinner .
A game of swans in the river .*
They played a game of chess .
They were in a baseball game .
She decided to eat som game .
Game can be found in forests .
Counterstrike is a popular game .
They didn't follow the game .
It was time to game .""".split('\n')

target = "game"

elmo_vecs = elmo_vectors(sents)
word_vecs = []
for i, sent in enumerate(sents):
    word_vecs.append(elmo_vecs[i][sent.split().index(target)])
    print("Sentence: ", sent)
    print("Vector for '%s:'" % target, word_vecs[-1])
    print()
    
print("Word vec size", word_vecs[0].shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

vec_size = word_vecs[0].shape[0]
print("Similarities between '%s' vector in sentences:" % target)
for i in range(1, len(sents)):
    print("Sent 0-%d:" % i, cosine_similarity(word_vecs[0].reshape((1,vec_size)), 
                                              word_vecs[i].reshape((1,vec_size)))[0][0])