In [None]:
%%html
<style>
/* Prose cells: serif text on a pale yellow background. */
.h1_cell, .just_text {
    box-sizing: border-box;
    font-family: "Times New Roman", Georgia, Serif;
    font-size: 125%;
    line-height: 22px; /* 5px +12px + 5px */
    text-indent: 25px;
    background-color: #fbfbea;
    padding: 10px; /* applies to all four sides */
}
/* Code samples embedded in prose cells. */
.code_block {
    box-sizing: border-box;
    font-size: 75%;
    line-height: 22px; /* 5px +12px + 5px */
    /* Disabled on purpose (CSS has no '#' comment syntax):
    text-indent: 25px;
    background-color: #fbfbea;
    */
    padding: 5px; /* applies to all four sides */
}

hr { 
    display: block;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: auto;
    margin-right: auto;
    border-style: inset;
    border-width: 2px;
}
</style>

<h2>
<center>
Can we Parallelize KNN?
</center>
</h2>

In [1]:
import glob
import os
import subprocess
import sys

In [2]:
# Point this kernel at the local Spark installation, expected at $HOME/spark.
spark_home = os.path.join(os.environ['HOME'], 'spark')
os.environ['SPARK_HOME'] = spark_home

# Put Spark's launcher scripts on PATH, but only once, so that re-running
# this cell does not keep growing PATH.
spark_bin = os.path.join(spark_home, 'bin')
if spark_bin not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + spark_bin

# Make pyspark and the py4j bridge bundled with Spark importable. The py4j
# archive name carries a version number that differs between Spark releases,
# so look it up instead of hard-coding it; fall back to the historical name
# when nothing is found.
py4j_zips = sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')))
if not py4j_zips:
    py4j_zips = [os.path.join(spark_home, 'python', 'lib', 'py4j-0.10.6-src.zip')]
for entry in [os.path.join(spark_home, 'python')] + py4j_zips:
    if entry not in sys.path:
        sys.path.append(entry)

In [3]:
# Run Spark's start-all.sh with the environment configured above.
# The value displayed by this cell is the script's exit code.
start_all_script = os.environ['SPARK_HOME'] + "/sbin/start-all.sh"
subprocess.call(start_all_script, env=os.environ)

0

In [4]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Connect to the standalone master and give each executor one core and
# 2 GB of memory.
conf = (
    SparkConf()
    .setMaster('spark://instance-5:7077')
    .set('spark.executor.cores', 1)
    .set('spark.executor.memory', '2g')
)

# getOrCreate() reuses a session that is already running in this kernel, so
# re-running the cell does not fail with
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

<div class=h1_cell>
<p>
Sometimes it's better to think about these problems from the bottom up. Is there a row-wise operation that can be run on the dataset in parallel?
<p>
Yes. Given a row, we can calculate the distance from every other row in the distributed dataset.
<p>
Let's import our dataset using pandas and use our NLP code from pyspark_lsa.ipynb to process the text.
</div>

In [None]:
# Shell escape: this runs whichever `pip` is first on PATH.
# NOTE(review): confirm that it installs into the same environment as this kernel.
!pip install pandas  # if not already installed

In [5]:
import pandas as pd

# Let pandas render up to 500 columns; the matrices built later are very wide.
pd.set_option('display.max_columns', 500)

# Download the sentence table (columns shown in the output: id, text, author).
gothic_csv_url = 'https://bit.ly/2HVSx3X'
gothic_table = pd.read_csv(gothic_csv_url, encoding='utf-8')
gothic_table.head(5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


<div class=h1_cell>
<p>
Generating the co-occurrence matrix for the first 10 sentences produces a matrix with 180 columns. The first 100 sentences produce 1149 columns. Including all sentences would produce an incredibly large and sparse matrix, which takes too long and too much space to compute here, so I'm simply using the first 100 sentences.
<p>    
If you want the entire matrix, I suggest you simply use the commented-out code below to read in the already-computed csv. The csv file is just over 1 GB, it requires about 11 GB of memory to load into python, and the resulting Python object (cm) is just under 5 GB. This means you'll probably have to create a new head node with more memory; see below. (Or just use the first 100 sentences.)
<p>
If you want or need to compute it, I suggest you pull this repo on your local computer and compute it there. If you want to do it in the cloud, destroy this node and spin up a new disk with at least 20GB of space. Then, install spark and rewire your cluster configuration to connect to the new node as the master. The code below needs about 10-15 GB of memory at maximum to compute the matrix. The other more complex route is to add another disk and merge it into the main partition, see https://cloud.google.com/compute/docs/disks/add-persistent-disk
</div>

In [None]:
# Shell escape: this runs whichever `pip` is first on PATH.
# NOTE(review): confirm that it installs into the same environment as this kernel.
!pip install nltk  # if not already installed

In [6]:
"""
If you want the whole thing:

cm = pd.read_csv("three_authors.csv", index_col=0)
cm.head()
"""


from nlp import get_bag_and_tokenize
from lnalg import comatrix

# Only the first 100 sentences are used, to keep the matrix small.
sample_sentences = gothic_table.head(100)
bag, sentences = get_bag_and_tokenize(sample_sentences, 'text')

# NOTE(review): `window=3` is presumably the co-occurrence window size in
# tokens; confirm against lnalg.comatrix.
cm = comatrix(bag, sentences, window=3)
cm.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jakeu123/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,groundwork,caused,four,ceased,habiliments,hanging,follow,children,increase,referring,former,sputter,safety,hideous,far,horror,robes,suggesting,every,minded,fall,awful,vast,recollections,sunk,clothed,conceive,herbert,singularly,compasses,nigh,breakers,surrounding,rate,street,design,pass,air,even,whar,solemn,deeply,poison,pulpit,new,learned,ever,exclamations,full,never,wrapping,hours,met,tracks,protection,maouth,respect,eliciting,contraction,speeches,ahead,changes,fantastic,followed,secure,suddenly,atomies,visibly,brought,beheld,glance,epicurus,readily,abhorrent,eye,would,sheehan,two,call,taken,survive,albertus,tell,door,substances,brings,wars,aware,crimson,particular,known,embroidered,must,town,word,room,pursue,work,dew,peals,slovenly,mr,precociously,fancying,marble,visage,give,terrible,states,orion,fanciful,numbers,sense,phrase,times,information,end,inhabitants,machine,glowing,occasionally,boats,fever,beauty,sordidness,plane,lay,coming,gasp,profited,bugs,deserted,greek,dread,shade,rheumatism,aloud,shrank,unsteady,replied,interpretation,gauges,incessantly,soon,held,signs,gallopin,nets,inquiries,crew,late,feel,detected,condition,paracelsus,might,overcome,good,return,weakness,thee,largest,potentates,foot,sixteen,events,merrival,victory,abaout,found,went,friendship,side,heavy,afforded,everyone,needless,series,energy,idea,loops,grandfather,sledges,really,large,velvet,since,reverend,acting,sustain,hill,paradise,got,impertinence,carcass,roofs,revenge,pleasant,difficulty,ancient,whereupon,struggle,murmur,companionless,teach,beginning,generate,bushes,days,cheering,conducted,circuit,kingdom,guesswork,render,grade,attractions,hoop,feet,another,passes,introduction,twenty,top,obedience,least,needed,wonderful,passed,inconvenient,scheme,nooses,storm,geneva,behind,villages,lavished,took,immediate,wrapped,part,western,somewhat,peculiar,kind,grew,exultation,enabled,grey,youth,...,definite,colours,outside,complexion,horrible,progress,neither,drest,across,glen,notice,various
,importance,underduk,court,courtyard,countenance,come,ribands,discernible,instant,wilbur,many,region,equal,conquered,expression,journey,grounds,among,fatality,wall,satisfaction,table,bizarre,comprehend,boat,better,tremendous,west,window,cared,dangers,hardly,perplexed,negligently,twinge,constantinople,deep,rudimentary,bull,spirit,present,case,abandoned,value,cast,canoe,almost,brow,thus,ellison,surface,piteously,pursy,evening,sits,ship,perhaps,unite,parts,clammy,document,exhaling,week,noble,extended,raymond,disadvantages,upon,persons,running,compared,delicate,delirium,innsmouth,changing,idris,totally,steals,destruction,reflection,elipse,well,without,english,unfitted,drawing,draconian,usual,cried,less,moments,money,republicanism,cloaks,rest,rooms,aspect,protecting,roomy,death,thinking,provision,shrines,sky,except,overthrow,engulfed,struck,characteristically,resolved,around,ample,dark,know,facial,desk,world,blankets,vague,dare,furniture,fortune,like,stranger,perpetual,superiority,either,tower,glare,underwent,astronomer,snuffy,interment,stagecoach,provided,dimensions,lead,cavity,avoid,shone,cosmogony,bursts,speak,panes,felt,pew,power,lieutenant,murdered,leadership,prohibited,hectic,refuge,gained,stone,oh,island,violence,discussed,whence,fell,nothing,greatly,act,tongue,persuaded,road,fallacy,tribe,equally,rendered,letters,wilderness,instruments,spreading,chronic,mere,breath,noxious,assumed,spread,idee,powdered,decisive,forrad,building,regard,prophecies,plague,cottage,remote,ear,minutely,made,glory,richly,unrecognised,excitement,placed,limit,tones,infants,grass,affection,profusion,gone,certain,moved,watches,general,imagination,carved,fumbling,politics,selfish,beyond,administered,wheaton,lately,old,details,sick,conclusion,poor,star,monday,symbol,drift,gifts,chance,peak,fresh,islanders,brass,draperies,pestilence,rural,maturity
groundwork,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
caused,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
four,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ceased,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
habiliments,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
import sys

# The value displayed is the size, in bytes, that `cm` reports for the
# 100-sentence matrix. The "4.8 GB max" note is the figure quoted in the text
# for the full matrix, not for this sample.
sys.getsizeof(cm)  # 4.8 GB max

10702020

<div class=h1_cell>
<p>
Let's convert our pandas dataframe to a Spark dataframe. We can pass in a python data structure or the pandas dataframe itself. However, we will ultimately need a single column with all the data in it, so let's create that.
</div>

In [8]:
# Wrap every matrix row in a one-element list so that the whole row becomes
# the value of a single array-typed "features" column.
feature_rows = [[row] for row in cm.values.tolist()]
df = spark.createDataFrame(feature_rows, schema=["features"])
df.printSchema()
df.show(5)

root
 |-- features: array (nullable = true)
 |    |-- element: long (containsNull = true)

+--------------------+
|            features|
+--------------------+
|[0, 0, 0, 0, 0, 0...|
|[0, 0, 0, 0, 0, 0...|
|[0, 0, 0, 0, 0, 0...|
|[0, 0, 0, 0, 0, 0...|
|[0, 0, 0, 0, 0, 0...|
+--------------------+
only showing top 5 rows



<div class=h1_cell>
<p>
This is good, but we can make it better. Spark can use 'SparseVectors' to represent arrays with mostly zeros. This cuts down on the space needed to store our features vector. We'll pass in the pandas dataframe, create the features vector using SparseVector, and drop the rest of the columns.
</div>

In [9]:
# Copy the index (each row's word) into an ordinary column so that it is
# available as the 'idx' column of the Spark dataframe.
cm['idx'] = cm.index
column_names = list(cm.columns)
cmdf = spark.createDataFrame(cm, schema=column_names)  # pass the pandas dataframe straight in

# Preview only a handful of columns; there are many, many columns.
preview_columns = ['idx', 'gold', 'groundwork', 'desk', 'fantastic', 'years']
cmdf.select(*preview_columns).show(5)

+-----------+----+----------+----+---------+-----+
|        idx|gold|groundwork|desk|fantastic|years|
+-----------+----+----------+----+---------+-----+
| groundwork|   0|         0|   0|        0|    0|
|     caused|   0|         0|   0|        0|    0|
|       four|   0|         0|   0|        0|    0|
|     ceased|   0|         0|   0|        0|    0|
|habiliments|   0|         0|   0|        0|    0|
+-----------+----+----------+----+---------+-----+
only showing top 5 rows



In [10]:
del cm  # Worth it: drop the notebook's reference to the pandas matrix now that cmdf has been built from it

In [11]:
from pyspark.ml.feature import VectorAssembler

# Every column except the word label 'idx' is a feature.
cols = list(cmdf.columns)
cols.remove('idx')

# Combine the feature columns into one "vectors" column, then drop the
# individual feature columns, leaving only 'idx' and 'vectors'.
assembler = VectorAssembler(inputCols=cols, outputCol="vectors")
vdf = assembler.transform(cmdf)
vdf = vdf.drop(*cols)
vdf.show(5)

+-----------+--------------------+
|        idx|             vectors|
+-----------+--------------------+
| groundwork|(1149,[156,441,68...|
|     caused|(1149,[11,500,531...|
|       four|(1149,[51,118,222...|
|     ceased|(1149,[24,65,234,...|
|habiliments|(1149,[105,420,59...|
+-----------+--------------------+
only showing top 5 rows



In [12]:
# Inspect which vector class the "vectors" column holds for the first row.
type(vdf.head().vectors)

pyspark.ml.linalg.SparseVector

In [13]:
# Inspect the type of the object that head() returns for the first row.
type(vdf.head())

pyspark.sql.types.Row

<div class=h1_cell>
<p>
Let's create a function that calculates the distance from the first word in the dataset. We can pass python functions to Spark, and Spark will apply them to each row in the distributed dataset.
</div>

In [17]:
import numpy as np

# Reference vector: the feature vector of the first row in the dataset.
head = vdf.head().vectors


def distance(x):
    """Return the Euclidean distance between ``x`` and the reference vector ``head``.

    Parameters
    ----------
    x : pyspark.ml.linalg.Vector or array-like
        Feature vector of one row.

    Returns
    -------
    float
        A plain Python float rather than a numpy scalar.
    """
    if hasattr(x, 'squared_distance'):
        # Spark vectors compute the squared distance on their own
        # representation, so neither vector has to be expanded into a dense
        # array on every call.
        return float(np.sqrt(x.squared_distance(head)))
    # Fallback for plain arrays: same computation as before.
    return np.linalg.norm(x - head.toArray()).item()


vdf.head()

Row(idx=u'groundwork', vectors=SparseVector(1149, {156: 1.0, 441: 1.0, 682: 1.0, 747: 1.0, 846: 1.0, 875: 1.0}))

In [18]:
distance(vdf.head().vectors)  # should be zero: the reference row compared with itself

0.0

<div class=h1_cell>
<p>
Let's spark-ify that distance function.
</div>

In [19]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Wrap the plain-Python distance() as a Spark user-defined function whose
# result column has type FloatType.
dist = udf(distance, returnType=FloatType())

In [20]:
# Add a 'distance' column by applying the UDF to each row's 'vectors' value.
vdf = vdf.withColumn('distance', dist(vdf['vectors']))

In [22]:
# Show the ten rows with the smallest distance to the reference word.
vdf.orderBy('distance').show(10)

+----------+--------------------+---------+
|       idx|             vectors| distance|
+----------+--------------------+---------+
|groundwork|(1149,[156,441,68...|      0.0|
| character|(1149,[0,156,489,...|      2.0|
|   refined|(1149,[0,441,682,...|      2.0|
|    cannot|(1149,[0,156,441,...|2.4494898|
| fosterage|(1149,[0,427,441,...|2.4494898|
|   intense|(1149,[156,441,45...|2.4494898|
|  feminine|(1149,[0,427,682,...| 2.828427|
|  overcome|(1149,[0,441,489,...| 2.828427|
|    talked|(1149,[38,556],[1...| 2.828427|
|     tombs|(1149,[38,586],[1...| 2.828427|
+----------+--------------------+---------+
only showing top 10 rows



<div class=h1_cell>
<p>
We're only using the first 100 sentences. While the above words may not be similar or related to the first word, using more data (sentences) would probably improve the model's results.
</div>