In [1]:
cd fastText_multilingual-master

/Users/christanasescu/fastText_multilingual-master


In [2]:
import numpy as np
#import FastVector

In [3]:
import scipy

In [4]:
import pybind11

In [5]:
class FastVector:
    """
    Minimal wrapper for fastvector embeddings.
    ```
    Usage:
        $ model = FastVector(vector_file='/path/to/wiki.en.vec')
        $ 'apple' in model
        > TRUE
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None):
        """Read in word vectors in fasttext format"""
        self.word2id = {}

        # Captures word order, for export() and translate methods
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r') as f:
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                self.embed[i] = elems[1:self.n_dim+1]
                self.id2word.append(elems[0])
        
        # Used in translate_inverted_softmax()
        self.softmax_denominators = None
        
        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Apply the given transformation to the vector space
        Right-multiplies given transform with embeddings E:
            E = E * transform
        Transform can either be a string with a filename to a
        text file containing a ndarray (compat. with np.loadtxt)
        or a numpy ndarray.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive. 
        This method lets you write the transformed matrix back to a file for future use
        :param The path to the output file to be written 
        """
        fout = open(outpath, "w")

        # Header takes the guesswork out of loading by recording how many lines, vector dims
        fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
        for token in self.id2word:
            vector_components = ["%.6f" % number for number in self[token]]
            vector_as_string = " ".join(vector_components)

            out_line = token + " " + vector_as_string + "\n"
            fout.write(out_line)

        fout.close()

    def translate_nearest_neighbour(self, source_vector):
        """Obtain translation of source_vector using nearest neighbour retrieval"""
        similarity_vector = np.matmul(FastVector.normalised(self.embed), source_vector)
        target_id = np.argmax(similarity_vector)
        return self.id2word[target_id]

    def translate_inverted_softmax(self, source_vector, source_space, nsamples,
                                   beta=10., batch_size=100, recalculate=True):
        """
        Obtain translation of source_vector using sampled inverted softmax retrieval
        with inverse temperature beta.
        nsamples vectors are drawn from source_space in batches of batch_size
        to calculate the inverted softmax denominators.
        Denominators from previous call are reused if recalculate=False. This saves
        time if multiple words are translated from the same source language.
        """
        embed_normalised = FastVector.normalised(self.embed)
        # calculate contributions to softmax denominators in batches
        # to save memory
        if self.softmax_denominators is None or recalculate is True:
            self.softmax_denominators = np.zeros(self.embed.shape[0])
            while nsamples > 0:
                # get batch of randomly sampled vectors from source space
                sample_vectors = source_space.get_samples(min(nsamples, batch_size))
                # calculate cosine similarities between sampled vectors and
                # all vectors in the target space
                sample_similarities = \
                    np.matmul(embed_normalised,
                              FastVector.normalised(sample_vectors).transpose())
                # accumulate contribution to denominators
                self.softmax_denominators \
                    += np.sum(np.exp(beta * sample_similarities), axis=1)
                nsamples -= batch_size
        # cosine similarities between source_vector and all target vectors
        similarity_vector = np.matmul(embed_normalised,
                                      source_vector/np.linalg.norm(source_vector))
        # exponentiate and normalise with denominators to obtain inverted softmax
        softmax_scores = np.exp(beta * similarity_vector) / \
                         self.softmax_denominators
        # pick highest score as translation
        target_id = np.argmax(softmax_scores)
        return self.id2word[target_id]

    def get_samples(self, nsamples):
        """Return a matrix of nsamples randomly sampled vectors from embed"""
        sample_ids = np.random.choice(self.embed.shape[0], nsamples, replace=False)
        return self.embed[sample_ids]

    @classmethod
    def normalised(cls, mat, axis=-1, order=2):
        """Utility function to normalise the rows of a numpy array."""
        norm = np.linalg.norm(
            mat, axis=axis, ord=order, keepdims=True)
        norm[norm == 0] = 1
        return mat / norm
    
    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b"""
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    def __contains__(self, key):
        return key in self.word2id

    def __getitem__(self, key):
        return self.embed[self.word2id[key]]

In [6]:
#import numpy as np
from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2==0] = 1
    return a / np.expand_dims(l2, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        if source in source_dictionary and target in target_dictionary:
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])

    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

In [7]:
# fr_dictionary = FastVector(vector_file='wiki.fr.vec')
en_dictionary = FastVector(vector_file='wiki.en.vec')

reading word vectors from wiki.en.vec


In [8]:
# fr_words = set(fr_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
# overlap = list(en_words & fr_words)
# bilingual_dictionary = [(entry, entry) for entry in overlap]

In [9]:
en_words = list(en_words)

In [9]:
# form the training matrices
# source_matrix, target_matrix = make_training_matrices(
    # fr_dictionary, en_dictionary, bilingual_dictionary)

In [10]:
# learn and apply the transformation
# transform = learn_transformation(source_matrix, target_matrix)
# fr_dictionary.apply_transform(transform)

In [11]:
# As we can see above 'chien' and 'canis' are now much more similar than before. 

In [10]:

import re
import numpy as np
import pandas as pd
from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
import os
#The OS module in Python provides a way of using operating system dependent functionality. 
#The functions that the OS module provides allows you to interface with the underlying operating system 
#that Python is running on – be that Windows, Mac or Linux.

from os import listdir
from os.path import isfile, join

# Gensim
import gensim
import gensim.corpora as corpora
from gensim import models, corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [11]:
import nltk as nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christanasescu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
#stopwords = nltk.corpus.stopwords.words("stopwords_Latin.txt")

In [12]:
def tokenize(text):
    tokens = word_tokenize(text)
    tokens = _pre_clean(tokens)
    tokens = [token for token in tokens if len(token) > 0]
    tokens = [token for token in tokens if token not in stopwords]
    #tokens = [get_lemma(token) for token in tokens]
    return tokens

In [13]:
def _pre_clean(list_of_text):
        '''
        preliminary cleaning of the text
        - remove new line character i.e. \n or \r
        - remove tabs i.e. \t
        - remove extra spaces
        '''
        cleaned_list = []
        for text in list_of_text:
            # print("original:", text)
            text = text.replace('\\n', ' ')
            text = text.replace('\\r', ' ')
            text = text.replace('\\t', ' ')
            pattern = re.compile(r'\s+')
            text = re.sub(pattern, ' ', text)
            text = text.strip()
            text = text.lower()
            # check for empty strings
            if text != '' and text is not None:
                cleaned_list.append(text)

        return cleaned_list

In [17]:
#filelabels = {}

#import re as re 

#def get_documents(path):
    #os.chdir(path)
    #files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    #texts = []
    #count = -1
    #for f in files:
        #with open(f, "r", encoding='utf-8', errors = 'ignore') as openf:
            #count = count + 1
            #filelabels[count] = os.path.basename(openf.name)
            #splitted_lines = openf.read().splitlines()
            #splitted_lines = _pre_clean(splitted_lines)
            #texts.append(splitted_lines)
    #print(filelabels)
    #return texts

#TEXTS_DIR = HOME + "/UK_EN/"

#documents = get_documents(TEXTS_DIR)

In [14]:

stopwords = nltk.corpus.stopwords.words('stop_words_poetry.txt')

stopwords.append('...')
stopwords.append("'d")
stopwords.append('...')
stopwords.append("&")
stopwords.append("upon")
stopwords.append("also")
stopwords.append("hath")
stopwords.append("must")
stopwords.append("therefore")
stopwords.append("doth")
stopwords.append("could")
stopwords.append("would")
#stopwords.append("another")
stopwords.append("much")
#stopwords.append("give")
stopwords.append("like")
stopwords.append("since")
#stopwords.append("many")
#stopwords.append("without")
#stopwords.append("first")
stopwords.append("though")
#stopwords.append("well")
#stopwords.append("often")
#stopwords.append("great")
stopwords.append("either")
#stopwords.append("even")
stopwords.append("shall")
#stopwords.append("they")
stopwords.append("what")
stopwords.append("their")
#stopwords.append("more")
#stopwords.append("there")
#stopwords.append("your")
#stopwords.append("them")
stopwords.append("’")
stopwords.append("“")
stopwords.append("2")
stopwords.append("3")
stopwords.append("”")

In [15]:
stopwords.extend(['a', 'like', 'you', 'they', 'he', 'be', 'it', 'your', 'her', 'of', 'more', 'there', 'no', 'not', '’', 'what', 'my', 'his', 'she', 'to', 'our', 'me', 'we', 'in', 'can', 'us', 'an', 'if', 'do', 'this', '”', 'because', 'who', 'hand', 'but', 'him'])

In [16]:
HOME = os.getcwd()

TEXTS_DIR = HOME + "/vector_prosody_experiment/"

#TEXTS_DIR = HOME

filelabels_en = {}

texts_data = []

files = [f for f in os.listdir(TEXTS_DIR) if os.path.isfile(os.path.join(TEXTS_DIR, f))]

import string
from string import punctuation

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

tokens_total = []

count = -1
 
os.chdir(TEXTS_DIR)
    
for f in files:
    #os.chdir(TEXTS_DIR)
    with open(f, "r", encoding='utf-8', errors = 'ignore') as openf:
        tokens = []
        count = count + 1
        filelabels_en[count] = os.path.basename(openf.name)
        for line in openf:
            sent_text = nltk.sent_tokenize(line)
            for sentence in sent_text:
                tokens1 = tokenize(sentence)
                tokens1 = [item.translate(remove_punct_map)
                      for item in tokens1]
                #filter_object = filter(lambda x: x != "", tokens1)
                tokens1 = [x for x in tokens1 if x!= ""]
                for token in tokens1:
                    tokens.append(token)
                    tokens_total.append(token)
                #if random.random() > .99:
                #print(tokens)
    #print(tokens_total)
    texts_data.append(tokens)

print(filelabels_en)

{0: 'charles_wright_apologia_pro_vita_sua.txt', 1: 'elke_de_rijcke_deffectiveness.txt', 2: 'martinus_nijhoff_trans_james_s_holmes_awater_excerpt.txt', 3: 'place_andreea_scridon_bucharest_cartography_romania.txt', 4: 'place_babylonians-trans-jared-pearce_high-priest-prayer.txt', 5: 'place_david-baker_weed_granville-ohio-&-ontario-&-illinois-&-eerie-&-carolina-&-asia.txt', 6: 'place_liliane_wouters_trans_margento_a_pascal_bill_brabant_flanders_belgium.txt', 7: 'place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt', 8: 'sappho_31_trans_julia_dubnoff_trans_chris_childers_trans_anne_carson_walt_whitman_woman_waits_for_me.txt', 9: 'sharon_olds_the_knowing.txt', 10: 'ted_hughes_harvest_moon.txt', 11: 'verheggen_portrait_trans_taylor.txt', 12: 'veronique_bergen_trans_margento_wolves.txt', 13: 'w_h_auden_brussels_in_winter.txt'}


In [17]:
filelabels1 = list(filelabels_en)

In [25]:

#print(filelabels_en)

{0: 'charles_wright_apologia_pro_vita_sua.txt', 1: 'elke_de_rijcke_deffectiveness.txt', 2: 'place_babylonians-trans-jared-pearce_high-priest-prayer.txt', 3: 'place_david-baker_weed_granville-ohio-&-ontario-&-illinois-&-eerie-&-carolina-&-asia.txt', 4: 'place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt', 5: 'sappho_31_trans_julia_dubnoff_trans_chris_childers_trans_anne_carson_walt_whitman_woman_waits_for_me.txt', 6: 'sharon_olds_the_knowing.txt', 7: 'ted_hughes_harvest_moon.txt', 8: 'veronique_bergen_trans_margento_wolves.txt', 9: 'w_h_auden_brussels_in_winter.txt'}


playlist:

charles wright, "apologia pro vita sua" (vect_en[0])
rachel blau duplessis, "draft 56: that" ([4]) 
"old babylonian contracts. loan of barley, from a temple," trans. jared pierce ([2])
w.h. auden, "brussels in winter" ([9]) 
david baker, "weed" ([3])
veronique bergen, "wolfpack" ([8])
elke de rijcke, trans. margento, "my defectiveness facilitated your ground. findings of an emancipated woman. thank you (in the kitchen)" ([1])
sappho (feat. walt whitman), multiple trans. ([5])
sharon olds, "the knowing" ([6])
ted hughes, "harvest moon" ([7])
jean-pierre verheggen, trans. john taylor, "self-portrait"
liliane wouters, trans. margento, "the pascal bill"
martinus nijhoff, trans. james s. holmes, "awater" (excerpt)
andreea iulia scridon, "bucharest cartography (sketch)"

In [18]:
for i in range(len(filelabels1)):
    texts_data[i] = [x for x in texts_data[i] if x not in stopwords]

In [19]:
def l2_norm(x):
   return np.sqrt(np.sum(x**2))

def div_norm(x):
   norm_value = l2_norm(x)
   if norm_value > 0:
       return x * ( 1.0 / norm_value)
   else:
       return x

In [20]:
vect_en = []

for i in range(len(filelabels1)):
        vect1 = []
        for j in range(len(texts_data[i])):
            if texts_data[i][j] in en_words:
                vect1.append(div_norm(en_dictionary[texts_data[i][j]]))
            else:
                print(i, texts_data[i][j])
                continue
        vect0 = sum(vect1) / len(texts_data[i])
        vect_en.append(vect0)

0 partcharred
0 worldweight
0 sapcrippled
0 winterweathered
0 jackwedge
0 boutonnieres
0 disfecemi
0 blossomstarred
0 starcrystals
0 perchio
0 ballatetta
0 thingless
0 thingless
0 engenderer
0 onlybegetter
0 halfclerk
0 greenleafed
1 intumescences
1 ​​progression
2 fanblade
2 soldieress
2 loosetied
2 enswathed
3 autumned
3 greataunttanti
3 30s
3 valentinecard
3 doublestrapped
3 slickedback
3 wroughtiron
3 70s
3 driedup
3 paintedblack
3 altamahaha
3 1988
3 onionbulbed
3 hexagonal…
3 14sectored
3 dirtgrey
3 dirtblue
3 eighthour
3 sixteen…leave
4 damuribam
4 urdatum
5 ashgreen
5 graygreen
5 threebranching
5 footlongat
5 2002
5 75
5 4
6 allsnowy
7 65
7 no—possessive
7 halfturned
7 newlydead
7 alterest
7 sdstated
8 sallower
8 wellpossess
8 undissuadable
8 pentup
8 bestbeloved
8 lovespendings
9 comaed
9 bluegreygreen
9 selfregard
9 nonnomadic
10 flamered
11 faultedhouse
11 hercullion
11 renaud—in
11 tittylon
11 1st
11 1942
11 20th
12 hyperconsciousness
12 ethylism
12 transgrammatical
12 necr

In [21]:

len(vect_en)

14

In [22]:
def vector(x):
    vect1 = []
    for j in range(len(x)):
            if x[j] in stopwords:
                continue
            else:
                if x[j] in en_words:
                    vect1.append(div_norm(en_dictionary[x[j]]))
                else:
                    print(x[j])
                    continue
    vect0 = sum(vect1) / len(x)
    return vect0

In [23]:

from numpy import dot
from numpy.linalg import norm

In [24]:

def cos_sim(x,y):
    return dot(x, y)/(norm(x)*norm(y))

In [None]:

# BOW (bag of words) poetry; not only reading and writing as [not like] an algorithm (Drucker 2021), 
# but also writing for an algorithm reader. 

In [None]:

# line0 = ['road', 'end', 'cobble', 'drunkenness', 'nowhere', 'go', 'keep', 'on', 'eudore', 'pirmez', 'like', 'past', 'clouds', 'tight', 'passage', 'above', 'street', 'stones', 'seen', 'world', 'starting', 'tilt', 'nowhere', 'where', 'were', 'going', 'visible', 'thing', 'thingless', 'we', 'came', 'world', 'thingless', 'we', 'leave', 'still', 'waves', 'bankrupt', 'pubs', 'suicides', 'eye','attracted', 'downward', 'down', 'avenue', 'malou', 'soon', 'killed', 'assimilated', ]

In [23]:
line0 = ['road', 'end', 'cobble', 'drunkenness', 'nowhere', 'go', 'keep', 'on', 'eudore', 'pirmez', 'like', 'past', 'clouds', 'tight', 'passage', 'above', 'street', 'stones', 'seen', 'world', 'starting', 'tilt', 'now', 'here', 'nowhere', 'where', 'were', 'going', 'visible', 'thing', 'thingless', 'we', 'came', 'world', 'thingless', 'we', 'leave', 'still', 'waves', 'bankrupt', 'pubs', 'suicides', 'eye', 'attracted', 'downward', 'down', 'avenue', 'malou', 'soon', 'killed', 'assimilated', 'lift','eyes', 'swallowed', 'saint', 'stone', 'st', 'antoine', 'anti', 'one', 'coine', 'coin', 'antconc', 'passersby', 'mind', 'touch', 'tram', 'ramming', 'heartbeat', 'wreath', 'breath', 'exhaust', 'host', 'hostage', 'traffic', 'clutter', 'gutter', 'utter', 'guttural', 'chocolate', 'store', 'flemish', 'gables', 'turkish', 'pizza', 'place', 'ace', 'room', 'doomed', 'con', 'connect', 'necked', 'echt', 'stuck', 'jam', 'headed', 'all', 'directions']

In [71]:
# 0.7932854356288415line0 = ['brussels', 'rain', 'lost', 'in', 'the', 'minutes','hours', 'thrumming', 'on', 'the', 'brains','trams', 'on', 'tervuren', 'street', 'the', 'gate', 'of', 'cinquantenaire', 'slowly', 'looming', 'through', 'the', 'chestnut', 'trees']

# 0.7630211685597917 line0 = ['brussels', 'rain', 'lost', 'in', 'the', 'minutes','hours', 'thrumming', 'on', 'your', 'brains']

# 0.776370338753413 line0 = ['brussels', 'rain', 'lost', 'in', 'the', 'long', 'minutes', 'long','hours', 'thrumming', 'on', 'your', 'brains']

# 0.8028070124351534 line0 = ['you', 'got', 'lost', 'in', 'the','brussels', 'rain', 'the', 'long', 'minutes', 'long','hours', 'thrumming', 'on', 'your', 'brains']

# 0.807942003059286 line0 = ['you', 'got', 'lost', 'in', 'the','brussels', 'rain', 'the', 'long', 'minutes', 'and', 'hours', 'thrumming', 'on', 'your', 'brains']

# 0.8213554789779391 line0 = ['you', 'got', 'lost', 'in', 'the','brussels', 'rain', 'the', 'long', 'minutes', 'and', 'hours', 'of', 'drops', 'thrumming', 'on', 'your', 'brains']

# 0.8114431534834665 line0 = ['you', 'got', 'lost', 'in', 'the','brussels', 'rain', 'eating', 'away', 'at', 'your', 'nerves']

# 0.7309086216450169 line0 = ['you', 'got', 'lost', 'in', 'the','brussels', 'rain']

In [32]:

#vector(line0)

thingless
thingless
antconc


array([-2.00547545e-02, -1.31303619e-02, -2.83823082e-02,  4.40135590e-02,
       -3.46589276e-02,  6.06369008e-03,  1.74161521e-03, -2.48864032e-02,
        6.92531857e-03,  2.97833488e-02,  1.51574625e-02,  7.69335571e-03,
       -1.72233520e-02, -1.19005449e-02,  2.01595012e-02, -3.33612848e-02,
       -2.11208877e-02, -6.01240780e-03,  7.71446720e-03,  4.85893336e-02,
       -3.22472025e-02,  2.97685730e-02, -4.16689236e-02, -3.64222935e-02,
       -2.11171391e-02, -1.11978547e-02, -4.47659741e-03, -6.33765830e-03,
        4.03494891e-03,  1.92838304e-02, -2.61829020e-02,  5.35845970e-02,
       -2.98444350e-02,  2.35333223e-02,  1.46818314e-02, -2.12395182e-02,
        3.56956414e-03, -2.87006031e-02,  1.81089758e-03, -1.05279524e-02,
        1.83347159e-02,  8.99918481e-03, -9.27684829e-03,  4.68453597e-03,
        1.36936375e-02,  2.58961238e-02,  1.07507422e-02,  4.52908378e-03,
        5.55842817e-03, -9.13369141e-03,  9.56861299e-03, -3.39959497e-02,
        3.31530431e-03, -

In [33]:
print (cos_sim(vector(line0), vect_en[0]))

thingless
thingless
antconc
0.9474654303367803


In [36]:

line0 = ['road', 'end', 'cobble', 'drunkenness', 'nowhere', 'go', 'keep', 'on', 'eudore', 'pirmez', 'like', 'past', 'clouds', 'tight', 'passage', 'above', 'street', 'stones', 'seen', 'world', 'starting', 'tilt', 'now', 'here', 'nowhere', 'where', 'were', 'going', 'visible', 'thing', 'thingless', 'we', 'came', 'world', 'thingless', 'we', 'leave', 'still', 'waves', 'bankrupt', 'pubs', 'suicides', 'eye', 'attracted', 'downward', 'down', 'avenue', 'malou', 'soon', 'killed', 'assimilated', 'lift','eyes', 'swallowed', 'saint', 'stone', 'st', 'antoine', 'anti', 'one', 'le','coin', 'my','coins', 'antconc', 'passersby', 'mind', 'touch', 'tram', 'ramming', 'heartbeat', 'wreath', 'breath', 'exhaust', 'host', 'hostage', 'traffic', 'clutter', 'gutter', 'utter', 'guttural', 'chocolate', 'store', 'flemish', 'gables', 'turkish', 'pizza', 'place', 'ace', 'room', 'doomed', 'con', 'connect', 'necked', 'echt', 'stuck', 'jam', 'headed', 'all', 'directions']

In [35]:

print (cos_sim(vector(line0), vect_en[0]))

thingless
thingless
antconc
0.949095145932522


In [24]:

lineX = ['your', 'org', 'organ', 'organon', 'no', 'no', 'skin', 'mem', 'brain', 'membrane', 'rains', 'spasm', 'spastic', 'screams', 'on', 'iphone', 'screen', 'spawning', 'awning', 'dawn', 'chris', '$', 'trains', 'rains', 'cage', 'cagey', 'christ', 'st', 'mone', 'money', 'y', 'suis', 'en', 'train', 'wall', 'ache', 'wallachia', 'for', 'm', 'bandied', 'we', 'unda', 'munda', 'mundaneum', 'mon', 'mons', 'moans', 'soon', 'monsoon', 'con', 'fence', 'conference', 'in', 'our', 'reference', 'pneuma', 'cult', 'ur', 'culture', 'shared', 'me', 'mode', 'de', 'res', 'mores', 'memories', 's', 'i', 'cent', 're', 'centre', 'centuries', 'on', 'pa', 'paper', 'her', 'sphere', 'ph', 'prow', 'cutting', 's', 'spires', 'pyres', 'am', 'empires', 'out', 'of', 'the', 'pic', 'moderate', 'modern', 'e', 'very', 'thin', 'g', 'everything', 'war', 'all', 'warhol', 'blood', 'wave', 'wag', 'ave', 'vero', 'ro', 'roaring', 'go', 'out', 'gout', 'outlet', 'otlet', 'any', 'body', 'anybody', 'neu', 'eu', 'let', 'dying', 'knew', 'touch', 'touching', 'she', '$','he', 'had', 'put', 'in', 'be', 'bell', 'for', '€', 'fore', 'before', 'p', 'ass', 'passing', 'anybody', 'i', 'know', 'i', 'will', 'die', 'if', 'i', 'd', 'dont', 'ont', 'k', 'now', 'know', 'never', 'will', 'pont', 'inside', 'y', 'our', 'your', 'vague', 'ue', 'vagina', 'hear', 't', 'heart', 'be', 'at', 'beat'] 

In [None]:
# {0: 'charles_wright_apologia_pro_vita_sua.txt', 1: 'elke_de_rijcke_deffectiveness.txt', 2: 'place_babylonians-trans-jared-pearce_high-priest-prayer.txt', 3: 'place_david-baker_weed_granville-ohio-&-ontario-&-illinois-&-eerie-&-carolina-&-asia.txt', 
# 4: 'place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt', 5: 'sappho_31_trans_julia_dubnoff_trans_chris_childers_trans_anne_carson_walt_whitman_woman_waits_for_me.txt', 
# 6: 'sharon_olds_the_knowing.txt', 7: 'ted_hughes_harvest_moon.txt', 8: 'veronique_bergen_trans_margento_wolves.txt', 9: 'w_h_auden_brussels_in_winter.txt'}

In [25]:

print (cos_sim(vector(lineX), vect_en[0]))

0.9516638825863778


In [25]:

print (cos_sim(vector(lineX), vect_en[1]))

0.9439541587545881


In [26]:

print (cos_sim(vector(lineX), vect_en[2]))

0.8931870013728541


In [27]:

print (cos_sim(vector(lineX), vect_en[3]))

0.896654653885662


In [28]:

print (cos_sim(vector(lineX), vect_en[4]))

0.9364661073953374


In [30]:

print (cos_sim(vector(lineX), vect_en[5]))

0.934262393351409


In [31]:

print (cos_sim(vector(lineX), vect_en[6]))

0.9332125354735691


In [32]:

print (cos_sim(vector(lineX), vect_en[7]))

0.8873837204024809


In [33]:

print (cos_sim(vector(lineX), vect_en[8]))

0.9379738899189959


In [29]:

print (cos_sim(vector(lineX), vect_en[9]))

0.9152198726911038


In [37]:

print (cos_sim(vector(line0), vect_en[1]))

thingless
thingless
antconc
0.9170378341806269


In [26]:

lineY = ['road', 'end', 'drunkenness','cobble', 'stone', 'co', 'bible', 'nowhere', 'concept', 'go', 'keep', 'on', 'eudore', 'pirmez', 'eu', 'do', 're', '©', 'door', 'pir', 'mez','like', 'past', 'clouds', 'loud', 'pt', 'ah', 'ptah', 'tight', 'passage', 'co', 'nc', 'err', 't', 'concert', 'ierta', 'vœrtex','above', 'street', 'stones', 'seen', 'world', 'starting', 'tilt', 'now', 'here', 'nowhere', 'where', 'were', 'going', 'visible', 'thing', 'thingless', 'we', 'came', 'world', 'thingless', 'we', 'leave', 'still', 'waves', 'bankrupt', 'pubs', 'bank', 'e', 'rupt','suicides', 'eye', 'attracted', 'downward', 'down', 'avenue', 'malou', 'soon', 'killed', 'assimilated', 'lift','eyes', 'swallowed', 'ore', 'well', 'orwell', 'sui', 'dna', 'saint', 'stone', 'st', 'antoine', 'anti', 'one', 'tone', 'le','coin', 'gni', 'ht', 'sel', 'sell', 'my','coins', 'antconc', 'passerby', 'mind', 'touch', 'tram', 'ramming', 'heartbeat', 'wreath', 'breath', 'exhaust', 'host', 'hostage', 'traffic', 'stage', 'clutter', 'gutter', 'utter', 'guttural', 'chocolate', 'store', 'flemish', 'gables', 'turkish', 'pizza', 'place', 'ace', 'room', 'doomed', 'con', 'connect', 'necked', 'echt', 'stuck', 'jam', 'headed', 'all', 'directions']
         
         

In [32]:

print(cos_sim(vector(lineY), vect_en[0]))

ierta
vœrtex
thingless
thingless
antconc
0.9526536123777063


In [33]:
print(cos_sim(vector(lineY), vect_en[1]))

ierta
vœrtex
thingless
thingless
antconc
0.9175443484963732


In [34]:

print(cos_sim(vector(lineY), vect_en[2]))

ierta
vœrtex
thingless
thingless
antconc
0.8717047642779274


In [39]:
print(cos_sim(vector(lineY), vect_en[3]))

ierta
vœrtex
thingless
thingless
antconc
0.9098063939329298


In [36]:
print(cos_sim(vector(lineY), vect_en[4]))

ierta
vœrtex
thingless
thingless
antconc
0.9331610618811681


In [37]:
print(cos_sim(vector(lineY), vect_en[5]))
print(cos_sim(vector(lineY), vect_en[6]))
print(cos_sim(vector(lineY), vect_en[7]))
print(cos_sim(vector(lineY), vect_en[8]))
print(cos_sim(vector(lineY), vect_en[9]))

ierta
vœrtex
thingless
thingless
antconc
0.8930866309552881
ierta
vœrtex
thingless
thingless
antconc
0.9096865229728374
ierta
vœrtex
thingless
thingless
antconc
0.9082715666657221
ierta
vœrtex
thingless
thingless
antconc
0.9352446146748088
ierta
vœrtex
thingless
thingless
antconc
0.9367645010963497


In [38]:
# For now X = 1 and Y = 0

In [24]:

lineZ = ['ever', 'you', 'every', 'e', 'ver', 'x', 'ile', 'exile', 'ou', 'gur', 'u', 'guru', 'r', 'age', 'rage', 'gnit', 'y', 'degnity', 'dignity', 'p', 'lace', 'place', 'f', 'at', 'fat', 'her', 'father', 'sc', 'y', 'the', 'scythe', 'scatter', 'fami', 'famine', 'family', 'at', 'pat', 'patience', 'temp', 'science', 'attempt', 's', 'id', 'e', 'side', 'ion', 'ignition', 'tube', 'youtube', 'walk', 'sidewalk', 'tm', 'extr', 'ex', 'extr', 'extra', 'bar', 'bare', 'barely', 'c', 'limb', 'climb', 'climax', 'ch', 'err', 'ub', 'ubu', 'cherub', 'err','terra', 'gura', 'esp', 'ritz', 'esprit', 'spritzer', 'espritzer', 'thé', 'uterus', 'uber', 'bus', 'limax', 'wom', 'woman', 'womb', 'limn', 'limes', 'sexless', 'bless', '©', 'crowd', 'rowdy']

In [25]:
print(cos_sim(vector(lineZ), vect_en[0]))
print(cos_sim(vector(lineZ), vect_en[1]))
print(cos_sim(vector(lineZ), vect_en[2]))
print(cos_sim(vector(lineZ), vect_en[3]))
print(cos_sim(vector(lineZ), vect_en[4]))
print(cos_sim(vector(lineZ), vect_en[5]))
print(cos_sim(vector(lineZ), vect_en[6]))
print(cos_sim(vector(lineZ), vect_en[7]))
print(cos_sim(vector(lineZ), vect_en[8]))
print(cos_sim(vector(lineZ), vect_en[9]))

degnity
espritzer
0.8958725143382601
degnity
espritzer
0.8684293637069912
degnity
espritzer
0.847580217998099
degnity
espritzer
0.8637697915046306
degnity
espritzer
0.8674895809499273
degnity
espritzer
0.856659886315774
degnity
espritzer
0.8548618540780187
degnity
espritzer
0.8613722166540756
degnity
espritzer
0.8970713571524133
degnity
espritzer
0.8830451515792974


In [24]:

lineZ1 = ['ever', 'you', 'every', 'e', 'ver', 'x', 'ile', 'exile', 'ou', 'gur', 'u', 'guru', '@', 'gura', 'r', 'age', 'rage', 'gnit', 'y', 'degnity', 'dignity', 'p', 'lace', 'place', 'f', 'at', 'fat', 'her', 'father', 'sc', 'y', 'the', 'scythe', 'scatter', 'fami', 'famine', 'family', 'at', 'pat', 'patience', 'temp', 'science', 'attempt', 's', 'id', 'e', 'side', 'ion', 'ignition', 'tube', 'youtube', 'walk', 'sidewalk', 'tm', 'extr', 'ex', 'extr', 'extra', 'bar', '@', 'bare', 'barely', 'euro', 'barley', '@', 'c', 'limb', 'climb', 'climax', 'ch', 'err', 'ub', 'ubu', 'cherub', 'err','terra', 'gura', 'esp', 'ritz', 'esprit', 'spritzer', 'espritzer', 'thé', 'uterus', 'uber', 'bus', 'limax', 'wom', 'woman', 'womb', 'limn', 'limes', 'sexless', 'bless', 'copyright', 'crowd', 'rowdy', 'en', 'co', 'un', 'ter', 'term', 'therm', 'terra', 'ra', 'is', 'on', 'unison', 'at', 'temp', 't', 'attempt', 'on', 'toll', 'ontology', 'st', 'ore', 'store','temple', 'euro', 'dollar', 'pound', 'pro', 'leprosy', 'touch', 'less', 'touchless', 'ga', 'gang', 'ng', 'b', 'bang', 'gangbang', 'angst', 'st', 'store', 'orgy', 'euro', 'pound', 'sy', 'gy', 'syzygy', 'Z', 'integer', 'number','set', 'dollar', 'Zy', 'G', 'G-number', 'Grahams-number', 'gaga', 'st', 'stumble', 'humble', 'h', 'registered-mark', 'neck', 'O', 'logy', 'gagalogy', 'gynecology', 'of', 'traf', 'raf', 'registered-mark', 'fic', 'fict', 'beatitude', 'beat', 'eat', 'it', 'traffic', 'finitude', 'copyright', 'registered-mark', 'registered-mark', 'euro', 'trade-mark', 'copyright', 'dollar', 'euro','beatitudes', 'finitudes', 'ud', 'ude', 'stare', 'a', 're', 'stud', 'angels', 'else', 'els', 'euro', 'elsewhere', 'we', 'are', 'where', 'h', 'a', 'o', 'or']

In [25]:

print(0, cos_sim(vector(lineZ1), vect_en[0]))
print(1, cos_sim(vector(lineZ1), vect_en[1]))
print(2, cos_sim(vector(lineZ1), vect_en[2]))
print(3, cos_sim(vector(lineZ1), vect_en[3]))
print(4, cos_sim(vector(lineZ1), vect_en[4]))
print(5, cos_sim(vector(lineZ1), vect_en[5]))
print(6, cos_sim(vector(lineZ1), vect_en[6]))
print(7, cos_sim(vector(lineZ1), vect_en[7]))
print(8, cos_sim(vector(lineZ1), vect_en[8]))
print(9, cos_sim(vector(lineZ1), vect_en[9]))

degnity
espritzer
Z
Zy
G
G-number
Grahams-number
registered-mark
O
gagalogy
registered-mark
registered-mark
registered-mark
trade-mark
finitudes
0.8975539035830501
degnity
espritzer
Z
Zy
G
G-number
Grahams-number
registered-mark
O
gagalogy
registered-mark
registered-mark
registered-mark
trade-mark
finitudes
0.8646927981626514
degnity
espritzer
Z
Zy
G
G-number
Grahams-number
registered-mark
O
gagalogy
registered-mark
registered-mark
registered-mark
trade-mark
finitudes
0.843999896789601
degnity
espritzer
Z
Zy
G
G-number
Grahams-number
registered-mark
O
gagalogy
registered-mark
registered-mark
registered-mark
trade-mark
finitudes
0.8604117336602712
degnity
espritzer
Z
Zy
G
G-number
Grahams-number
registered-mark
O
gagalogy
registered-mark
registered-mark
registered-mark
trade-mark
finitudes
0.8679394761445945
degnity
espritzer
Z
Zy
G
G-number
Grahams-number
registered-mark
O
gagalogy
registered-mark
registered-mark
registered-mark
trade-mark
finitudes
0.8489275305176048
degnity
espritzer

In [27]:
LineZ1 = ['ever', 'you', 'every', 'e', 'ver', 'x', 'ile', 'exile', 'ou', 'gur', 'u', 'guru', '@', 'gura', 'r', 'age', 'rage', 'gnit', 'y', 'degnity', 'dignity', 'p', 'lace', 'place', 'f', 'at', 'fat', 'her', 'father', 'sc', 'y', 'the', 'scythe', 'scatter', 'fami', 'famine', 'family', 'at', 'pat', 'patience', 'temp', 'science', 'attempt', 's', 'id', 'e', 'side', 'ion', 'ignition', 'tube', 'youtube', 'walk', 'sidewalk', 'tm', 'extr', 'ex', 'extr', 'extra', 'bar', '@', 'bare', 'barely', 'euro', 'barley', '@', 'c', 'limb', 'climb', 'climax', 'ch', 'err', 'ub', 'ubu', 'cherub', 'err','terra', 'gura', 'esp', 'ritz', 'esprit', 'spritzer', 'espritzer', 'thé', 'uterus', 'uber', 'bus', 'limax', 'wom', 'woman', 'womb', 'limn', 'limes', 'sexless', 'bless', 'copyright', 'crowd', 'rowdy', 'en', 'co', 'un', 'ter', 'term', 'therm', 'terra', 'ra', 'is', 'on', 'unison', 'at', 'temp', 't', 'attempt', 'on', 'toll', 'ontology', 'st', 'ore', 'store','temple', 'euro', 'dollar', 'pound', 'pro', 'leprosy', 'touch', 'less', 'touchless', 'ga', 'gang', 'ng', 'b', 'bang', 'gangbang', 'angst', 'st', 'store', 'orgy', 'euro', 'pound', 'sy', 'gy', 'syzygy', 'Z', 'integer', 'number','set', 'dollar', 'Zy', 'G', 'G-number', 'Grahams-number', 'Graham', 'number', 'gaga', 'st', 'stumble', 'humble', 'h', '®', 'neck', 'O', 'logy', 'gagalogy', 'gynecology', 'of', 'traf', 'raf', '®', 'fic', 'fict', 'beatitude', 'beat', 'eat', 'it', 'traffic', 'finitude', 'copyright', '®', '®', 'euro', 'tm', 'copyright', 'dollar', 'euro','beatitudes', 'finitudes', 'ud', 'ude', 'stare', 'a', 're', 'stud', 'angels', 'else', 'els', 'euro', 'elsewhere', 'we', 'are', 'where', 'h', 'a', 'o', 'or']

In [29]:
lineZ1 = ['ever', 'you', 'every', 'e', 'ver', 'x', 'ile', 'exile', 'ou', 'gur', 'u', 'guru', '@', 'gura', 'r', 'age', 'rage', 'gnit', 'y', 'degnity', 'dignity', 'p', 'lace', 'place', 'f', 'at', 'fat', 'her', 'father', 'sc', 'y', 'the', 'scythe', 'scatter', 'fami', 'famine', 'family', 'at', 'pat', 'patience', 'temp', 'science', 'attempt', 's', 'id', 'e', 'side', 'ion', 'ignition', 'tube', 'youtube', 'walk', 'sidewalk', 'tm', 'extr', 'ex', 'extr', 'extra', 'bar', '@', 'bare', 'barely', 'euro', 'barley', '@', 'c', 'limb', 'climb', 'climax', 'ch', 'err', 'ub', 'ubu', 'cherub', 'err','terra', 'gura', 'esp', 'ritz', 'esprit', 'spritzer', 'espritzer', 'thé', 'uterus', 'uber', 'bus', 'limax', 'wom', 'woman', 'womb', 'limn', 'limes', 'sexless', 'bless', 'copyright', 'crowd', 'rowdy', 'en', 'co', 'un', 'ter', 'term', 'therm', 'terra', 'ra', 'is', 'on', 'unison', 'at', 'temp', 't', 'attempt', 'on', 'toll', 'ontology', 'st', 'ore', 'store','temple', 'euro', 'dollar', 'pound', 'pro', 'leprosy', 'touch', 'less', 'touchless', 'ga', 'gang', 'ng', 'b', 'bang', 'gangbang', 'angst', 'st', 'store', 'orgy', 'euro', 'pound', 'sy', 'gy', 'syzygy', 'z', 'integer', 'number','set', 'dollar', 'zy', 'g', 'g-number', 'grahams-number', 'graham', 'number', 'gaga', 'st', 'stumble', 'humble', 'h', '®', 'neck', 'o', 'logy', 'gagalogy', 'gynecology', 'of', 'traf', 'raf', '®', 'fic', 'fict', 'beatitude', 'beat', 'eat', 'it', 'traffic', 'finitude', 'copyright', '®', '®', 'euro', 'tm', 'copyright', 'dollar', 'euro','beatitudes', 'finitudes', 'ud', 'ude', 'stare', 'a', 're', 'stud', 'angels', 'else', 'els', 'euro', 'elsewhere', 'we', 'are', 'where', 'h', 'a', 'o', 'or']

In [30]:
print(0, cos_sim(vector(lineZ1), vect_en[0]))
print(1, cos_sim(vector(lineZ1), vect_en[1]))
print(2, cos_sim(vector(lineZ1), vect_en[2]))
print(3, cos_sim(vector(lineZ1), vect_en[3]))
print(4, cos_sim(vector(lineZ1), vect_en[4]))
print(5, cos_sim(vector(lineZ1), vect_en[5]))
print(6, cos_sim(vector(lineZ1), vect_en[6]))
print(7, cos_sim(vector(lineZ1), vect_en[7]))
print(8, cos_sim(vector(lineZ1), vect_en[8]))
print(9, cos_sim(vector(lineZ1), vect_en[9]))

degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
0 0.8959187603026576
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
1 0.8646043837913708
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
2 0.8439660383840966
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
3 0.8573924497713336
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
4 0.8676085016956236
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
5 0.8487293464381855
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
6 0.846468839649542
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
7 0.8575238298258451
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
8 0.8934175394892132
degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
9 0.877797386084055


In [42]:

lineZ2 = ['ever', 'you', 'every', 'e', 'ver', 'x', 'ile', 'exile', 'ou', 'gur', 'u', 'guru', '@', 'gura', 'r', 'age', 'rage', 'gnit', 'y', 'degnity', 'dignity', 'wave', 'wavy', 'wavre', 'p', 'lace', 'place', 'f', 'at', 'fat', 'her', 'father', 'sc', 'y', 'the', 'scythe', 'scatter', 'fami', 'famine', 'family', 'at', 'pat', 'patience', 'temp', 'science', 'attempt', 's', 'id', 'e', 'side', 'ion', 'ignition', 'tube', 'youtube', 'walk', 'sidewalk', 'tm', 'extr', 'ex', 'extr', 'extra', 'bar', '@', 'bare', 'barely', 'euro', 'barley', '@', 'c', 'limb', 'climb', 'climax', 'ch', 'err', 'ub', 'ubu', 'cherub', 'err','terra', 'gura', 'esp', 'ritz', 'esprit', 'spritzer', 'espritzer', 'thé', 'uterus', 'uber', 'bus', 'limax', 'wom', 'woman', 'womb', 'limn', 'limes', 'sexless', 'bless', 'copyright', 'crowd', 'rowdy', 'en', 'co', 'un', 'ter', 'term', 'therm', 'terra', 'ra', 'is', 'on', 'unison', 'at', 'temp', 't', 'attempt', 'on', 'toll', 'ontology', 'st', 'ore', 'store','temple', 'euro', 'dollar', 'pound', 'pro', 'leprosy', 'touch', 'less', 'touchless', 'ga', 'gang', 'ng', 'b', 'bang', 'gangbang', 'angst', 'st', 'store', 'orgy', 'euro', 'pound', 'sy', 'gy', 'syzygy', 'z', 'integer', 'number','set', 'dollar', 'zy', 'g', 'g-number', 'grahams-number', 'graham', 'number', 'gaga', 'st', 'stumble', 'humble', 'h', '®', 'neck', 'o', 'logy', 'gagalogy', 'gynecology', 'of', 'traf', 'raf', '®', 'fic', 'fict', 'beatitude', 'beat', 'eat', 'it', 'traffic', 'finitude', 'copyright', '®', '®', 'euro', 'tm', 'copyright', 'dollar', 'euro','beatitudes', 'finitudes', 'ud', 'ude', 'stare', 'a', 're', 'stud', 'angels', 'else', 'els', 'euro', 'elsewhere', 'we', 'are', 'where', 'h', 'a', 'o', 'or', 'food', 'look', 'work']

In [43]:

print(2, cos_sim(vector(lineZ2), vect_en[2]))

degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
2 0.8483009326847178


In [50]:


lineZ3 = ['ever', 'you', 'every', 'e', 'ver', 'x', 'ile', 'exile', 'ou', 'gur', 'u', 'guru', '@', 'gura', 'r', 'age', 'rage', 'gnit', 'y', 'degnity', 'dignity', 'wave', 'wavy', 'wavre', 'p', 'lace', 'place', 'f', 'at', 'fat', 'her', 'father', 'sc', 'y', 'the', 'scythe', 'scatter', 'fami', 'famine', 'family', 'at', 'pat', 'patience', 'temp', 'science', 'attempt', 's', 'id', 'e', 'side', 'ion', 'ignition', 'tube', 'youtube', 'walk', 'sidewalk', 'tm', 'extr', 'ex', 'extr', 'extra', 'bar', '@', 'bare', 'barely', 'euro', 'barley', '@', 'c', 'limb', 'climb', 'climax', 'ch', 'err', 'ub', 'ubu', 'cherub', 'err','terra', 'gura', 'esp', 'ritz', 'esprit', 'spritzer', 'espritzer', 'thé', 'uterus', 'uber', 'bus', 'limax', 'wom', 'woman', 'womb', 'limn', 'limes', 'sexless', 'bless', 'copyright', 'crowd', 'rowdy', 'en', 'co', 'un', 'ter', 'term', 'therm', 'terra', 'ra', 'is', 'on', 'unison', 'at', 'temp', 't', 'attempt', 'on', 'toll', 'ontology', 'st', 'ore', 'store','temple', 'euro', 'dollar', 'pound', 'pro', 'leprosy', 'touch', 'less', 'touchless', 'ga', 'gang', 'ng', 'b', 'bang', 'gangbang', 'angst', 'st', 'store', 'orgy', 'euro', 'pound', 'sy', 'gy', 'syzygy', 'z', 'integer', 'number','set', 'dollar', 'zy', 'g', 'g-number', 'grahams-number', 'graham', 'number', 'gaga', 'st', 'stumble', 'humble', 'h', '®', 'neck', 'o', 'logy', 'gagalogy', 'gynecology', 'of', 'traf', 'raf', '®', 'fic', 'fict', 'beatitude', 'beat', 'eat', 'it', 'traffic', 'finitude', 'copyright', '®', '®', 'euro', 'tm', 'copyright', 'dollar', 'euro','beatitudes', 'finitudes', 'ud', 'ude', 'stare', 'a', 're', 'stud', 'angels', 'else', 'els', 'euro', 'elsewhere', 'we', 'are', 'where', 'h', 'a', 'o', 'or', 'food', 'look', 'work', 'mother', 'sisters', 'other', 'h', 'sys', 'system']

In [51]:

print(2, cos_sim(vector(lineZ3), vect_en[2]))

degnity
espritzer
g-number
grahams-number
gagalogy
finitudes
2 0.852218869490196
