In [None]:
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm
from gensim.models import KeyedVectors
import re
from itertools import permutations
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim

In [None]:
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line when each epoch starts and ends."""

    def __init__(self):
        # Number of the epoch currently being reported; starts at zero.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print the start message for the current epoch (`model` is not used)."""
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        """Print the end message for the current epoch, then advance the counter."""
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        self.epoch = self.epoch + 1
        
# Module-level logger instance. Its epoch counter is never reset, so passing the
# same object to several training runs keeps counting from where it stopped.
epoch_logger = EpochLogger()

### Assignment 4

**Submission deadlines**:

* get at least 4 points by Tuesday, 12.05.2022
* remaining points: last lab session before or on Tuesday, 19.05.2022

**Points:** Aim to get 12 out of 15+ possible points

All needed data files are on Drive: <https://drive.google.com/drive/folders/1HaMbhzaBxxNa_z_QJXSDCbv5VddmhVVZ?usp=sharing> (or will be soon :) )

# Task 1 (5 points)

Implement simplified word2vec with negative sampling from scratch (using pure numpy). Assume that in the training data objects and contexts are given explicitly, one pair per line, and objects are on the left. The result of the training should be object vectors. Please write them to a file using the *natural* text format, i.e.

<pre>
K N
word1 x1_1 x1_2 ... x1_N 
word2 x2_1 x2_2 ... x2_N
...
wordK xK_1 xK_2 ... xK_N
</pre>

Use the loss from Slide 3 in Lecture NLP.2, compute the gradient manually. You can use some gradient clipping, or regularisation. 

**Remark**: the data is specially prepared to make the learning process easier. 
Present vectors using the code below. In this task we define success as 'obtaining a result which looks definitely not random'


Usuwać pozytywny indeks z indeksów w negative samplingu. lr=0.1 bez zmniejszania lr. Sprawdzać loss. neg=3, u^3/4, dim=20. Można porównać gradienty z pytorchem.

In [None]:
def create_words_dicts(object_context_filename="task1_objects_contexts_polish.txt"):
    """Read (object, context) pairs from a text file and index both vocabularies.

    Every non-blank line must hold exactly two whitespace-separated tokens: the
    object (target) on the left and its context on the right. Ids are assigned
    in order of first appearance, starting from 0, separately for targets and
    for contexts.

    Args:
        object_context_filename: path of the UTF-8 pair file.

    Returns:
        A 5-tuple ``(target_to_id, context_to_id, context_counts,
        total_contex_counts, target_contex_pairs)`` where ``context_counts``
        maps each context word to its number of occurrences,
        ``total_contex_counts`` is the number of pairs read, and
        ``target_contex_pairs`` is the list of ``(target_id, context_id)``
        tuples in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly two tokens.
    """
    target_to_id = dict()
    context_to_id = dict()
    context_counts = defaultdict(int)
    target_contex_pairs = []
    with open(object_context_filename, "r", encoding="utf8") as object_context_file:
        for line_number, line in enumerate(tqdm(object_context_file), start=1):
            tokens = line.split()
            if not tokens:
                # Blank lines carry no pair; skipping them avoids an unpack error.
                continue
            if len(tokens) != 2:
                raise ValueError(
                    f"{object_context_filename}:{line_number}: expected 'object context', "
                    f"got {len(tokens)} tokens"
                )
            target, context = tokens

            # `len(...)` is evaluated before insertion, so a new word gets the next free id.
            target_id = target_to_id.setdefault(target, len(target_to_id))
            context_id = context_to_id.setdefault(context, len(context_to_id))

            context_counts[context] += 1
            target_contex_pairs.append((target_id, context_id))
    # One context occurrence is counted per pair read.
    total_contex_counts = len(target_contex_pairs)
    return target_to_id, context_to_id, context_counts, total_contex_counts, target_contex_pairs

def create_unigram_distribution(context_counts, total_contex_counts, context_to_id, power=0.75):
    """Turn context counts into a smoothed sampling distribution over context ids.

    Each context's relative frequency is raised to `power` and the results are
    renormalised so that they sum to one.

    Returns:
        Two numpy arrays of equal length: the context ids and, at the same
        positions, their probabilities.
    """
    # Unnormalised weight per context word.
    weights = {
        word: (count / total_contex_counts) ** power
        for word, count in context_counts.items()
    }
    normaliser = sum(weights.values())
    # Re-key by context id while normalising.
    index_to_prob = {
        context_to_id[word]: weight / normaliser
        for word, weight in weights.items()
    }
    indices = np.array(list(index_to_prob.keys()))
    probabilities = np.array(list(index_to_prob.values()))
    return indices, probabilities

In [None]:
# Index the Task 1 pair file: vocabularies, context counts and (target, context) id pairs.
target_to_id, context_to_id, context_counts, total_contex_counts, target_contex_pairs = create_words_dicts(
    object_context_filename="task1_objects_contexts_polish.txt")
# Negative-sampling distribution: relative context frequencies raised to 0.75 and renormalised.
unigram_indices, unigram_probs = create_unigram_distribution(
    context_counts, total_contex_counts, context_to_id, power=0.75)

0it [00:00, ?it/s]

In [None]:
class Word2Vec():
    """Simplified skip-gram word2vec with negative sampling, written in numpy.

    The model is trained on explicit (target id, context id) pairs. `V` holds
    one vector per target and `U` one vector per context; both are created by
    `initialize_vectors`, which must be called before `train` or
    `save_weights_to_files`.
    """

    def __init__(self, target_to_id, context_to_id, unigram_distribution, target_contex_pairs,
                 object_context_filename="task1_objects_contexts_polish.txt"):
        """Store the vocabularies, the negative-sampling distribution and the training pairs.

        Args:
            target_to_id: mapping target word -> row of `V`.
            context_to_id: mapping context word -> row of `U`.
            unigram_distribution: pair ``(indices, probabilities)`` of equal-length
                sequences describing the distribution negatives are drawn from.
            target_contex_pairs: list of ``(target_id, context_id)`` tuples.
            object_context_filename: stored on the instance; not read by any
                method of this class.
        """
        self.target_to_id = target_to_id
        self.context_to_id = context_to_id
        self.n_targets = len(target_to_id)
        self.n_context = len(context_to_id)
        self.unigram_indices = unigram_distribution[0]
        self.unigram_probs = unigram_distribution[1]
        # Running sum of the probabilities, used by `choice` for inverse-CDF sampling.
        self.cumulative = np.cumsum(self.unigram_probs)
        self.no_pvalues = len(self.unigram_probs)
        self.target_contex_pairs = target_contex_pairs
        self.object_context_filename = object_context_filename

    def initialize_vectors(self, dim=312):
        """Create `V` and `U` with entries drawn uniformly from [-0.5, 0.5) and divided by `dim`."""
        self.dim = dim
        self.V = np.random.uniform(-0.5, 0.5, (self.n_targets, dim)) / dim
        self.U = np.random.uniform(-0.5, 0.5, (self.n_context, dim)) / dim

    def read_weights_from_files(self, target_filename, context_filename):
        """Placeholder: loading weights is not implemented, the call does nothing."""
        pass

    @staticmethod
    def _write_vectors(filename, word_to_id, vectors):
        """Write one "K N" header line, then one "word x_1 ... x_N" line per word."""
        with open(filename, "w", encoding="utf8") as vector_file:
            vector_file.write(f"{vectors.shape[0]} {vectors.shape[1]}\n")
            for word, word_id in word_to_id.items():
                values = " ".join(str(value) for value in vectors[word_id])
                vector_file.write(f"{word} {values}\n")

    def save_weights_to_files(self, target_filename, contex_filename):
        """Save target vectors and context vectors to two text files.

        Each file starts with "<number of words> <dimension>" followed by one
        line per word: the word and its vector components separated by spaces.
        """
        self._write_vectors(target_filename, self.target_to_id, self.V)
        self._write_vectors(contex_filename, self.context_to_id, self.U)

    def sigmoid(self, v):
        """Logistic function 1 / (1 + exp(-v)), applied element-wise."""
        return 1/(1+np.exp(-v))

    def choice(self):
        """Draw one entry of `unigram_indices` according to `unigram_probs`.

        Uses inverse-CDF sampling: a uniform draw is located in the cumulative
        sums by binary search. The found position is clamped to the last entry
        so that a draw above the final cumulative value (possible when the sum
        falls short of 1 through rounding) still returns instead of looping
        forever. Requires a non-empty distribution.
        """
        x = np.random.rand()
        # First position whose cumulative probability is >= x.
        position = int(np.searchsorted(self.cumulative, x, side="left"))
        position = min(position, self.no_pvalues - 1)
        return self.unigram_indices[position]

    def train(self, neg_samples=10, lr=0.003, epochs=6, const_lr=True):
        """Run stochastic gradient descent on the negative-sampling loss.

        For every (target, context) pair the positive context is pulled towards
        the target vector and `neg_samples` contexts drawn from the unigram
        distribution are pushed away. Pairs are visited in a fresh random order
        each epoch.

        Args:
            neg_samples: number of negatives drawn per pair (draws equal to the
                positive context are discarded, so fewer may be used).
            lr: initial learning rate.
            epochs: number of passes over the pairs.
            const_lr: if False, `lr` decreases linearly to zero over all steps.
        """
        n_of_pairs = len(self.target_contex_pairs)
        total_steps = n_of_pairs * epochs
        # Guard against dividing by zero when there is nothing to train on.
        lr_decrease = lr / total_steps if (not const_lr and total_steps) else 0
        # Negatives are drawn in batches because one large np.random.choice call
        # is much cheaper than one call per pair.
        sampling_frequency = 10000
        for i_epoch in tqdm(range(epochs)):
            order = np.arange(n_of_pairs)
            np.random.shuffle(order)
            for i, pair_idx in enumerate(tqdm(order, leave=False)):
                target_id, context_id = self.target_contex_pairs[pair_idx]

                if i % sampling_frequency == 0:
                    many_negative_ids = np.random.choice(self.unigram_indices, size=(sampling_frequency, neg_samples),
                                                         p=self.unigram_probs)
                negative_ids = many_negative_ids[i % sampling_frequency]
                # The positive context must not also be treated as a negative example.
                negative_ids = negative_ids[negative_ids != context_id]

                v = self.V[target_id]
                pos_u = self.U[context_id]
                neg_u = self.U[negative_ids]

                # d(loss)/d(score) for the positive pair and for each negative pair.
                pos_err = self.sigmoid(pos_u @ v) - 1
                neg_err = self.sigmoid(neg_u @ v)

                # All gradients are computed before any parameter is modified,
                # because `v` and `pos_u` are views into V and U.
                grad_v = pos_err * pos_u + neg_err @ neg_u
                grad_pos_u = pos_err * v
                grad_neg_u = np.outer(neg_err, v)

                self.V[target_id] -= lr * grad_v
                self.U[context_id] -= lr * grad_pos_u
                # `U[ids] -= ...` applies only one update per repeated id;
                # subtract.at accumulates the update of every occurrence.
                np.subtract.at(self.U, negative_ids, lr * grad_neg_u)
                lr -= lr_decrease
        

In [None]:
def multidimensional_shifting(num_samples, sample_size, elements, probabilities):
    """Pick `sample_size` distinct column positions for each of `num_samples` rows.

    Every row gets random numbers normalised to sum to one; the probabilities
    are subtracted from them and the positions of the `sample_size` smallest
    differences are returned. `elements` is accepted but not used, so the
    result contains positions into `probabilities`, not elements.

    Returns:
        Integer array of shape (num_samples, sample_size).
    """
    # One copy of the probability vector per requested sample.
    tiled_probs = np.tile(probabilities, (num_samples, 1))
    # Random shifts, scaled so that each row sums to one.
    noise = np.random.random(tiled_probs.shape)
    noise = noise / noise.sum(axis=1, keepdims=True)
    # A large probability gives a small (more negative) shifted value.
    shifted = noise - tiled_probs
    partitioned = np.argpartition(shifted, sample_size, axis=1)
    return partitioned[:, :sample_size]

In [None]:
%timeit np.random.choice(unigram_indices, size=(100000, 5), p=unigram_probs)

88.2 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Sanity check: draw a 10x5 batch of negative ids and display its first row.
np.random.choice(unigram_indices, size=(10, 5), p=unigram_probs)[0]

array([24301, 42289, 62610, 69562, 98935])

In [None]:
# Uses the numpy Word2Vec class defined in this notebook. A later cell imports
# gensim's Word2Vec under the same name, so this cell must run before that import.
w2v = Word2Vec(target_to_id, context_to_id, (unigram_indices, unigram_probs), target_contex_pairs)

In [None]:
# Experiment: 128-dimensional vectors, 3 negatives per pair, constant lr=0.003, 6 epochs.
w2v.initialize_vectors(dim=128)
w2v.train(neg_samples=3, lr=0.003, epochs=6)

In [None]:
# Experiment: fresh model, 128 dimensions, 5 negatives per pair, constant lr=0.003, 6 epochs.
w2v = Word2Vec(target_to_id, context_to_id, (unigram_indices, unigram_probs), target_contex_pairs)
w2v.initialize_vectors(dim=128)
w2v.train(neg_samples=5, lr=0.003, epochs=6)

In [None]:
# Same settings as the previous cell (dim=128, neg_samples=5, lr=0.003, epochs=6);
# only the random initialisation and sampling differ between the two runs.
w2v = Word2Vec(target_to_id, context_to_id, (unigram_indices, unigram_probs), target_contex_pairs)
w2v.initialize_vectors(dim=128)
w2v.train(neg_samples=5, lr=0.003, epochs=6)

In [None]:
# Experiment: fresh model, 128 dimensions, 20 negatives per pair, constant lr=0.003, 6 epochs.
w2v = Word2Vec(target_to_id, context_to_id, (unigram_indices, unigram_probs), target_contex_pairs)
w2v.initialize_vectors(dim=128)
w2v.train(neg_samples=20, lr=0.003, epochs=6)

In [None]:
# Experiment: fresh model, 20 dimensions, 3 negatives per pair, constant lr=0.1, 6 epochs.
# NOTE(review): this cell does not save the trained vectors.
w2v = Word2Vec(target_to_id, context_to_id, (unigram_indices, unigram_probs), target_contex_pairs)
w2v.initialize_vectors(dim=20)
w2v.train(neg_samples=3, lr=0.1, epochs=6)

In [None]:
# Experiment: fresh model, 20 dimensions, 5 negatives per pair, constant lr=0.1, 6 epochs.
w2v = Word2Vec(target_to_id, context_to_id, (unigram_indices, unigram_probs), target_contex_pairs)
w2v.initialize_vectors(dim=20)
w2v.train(neg_samples=5, lr=0.1, epochs=6, const_lr=True)

In [None]:
# Persist whichever model `w2v` currently refers to: target vectors first, context vectors second.
w2v.save_weights_to_files('task1_w2v_vectors_neg5_dim20_lr01.txt',
                          'task1_w2v_contexts_neg5_dim20_lr01.txt')

In [None]:
# NOTE(review): no cell visible in this notebook writes this file (the save cell
# writes 'task1_w2v_vectors_neg5_dim20_lr01.txt'); confirm it exists from an
# earlier run, otherwise this cell fails on a fresh kernel.
task1_wv = KeyedVectors.load_word2vec_format('task1_w2v_vectors_neg3_dim20_constlr01.txt', binary=False)
example_english_words = ['dog', 'dragon', 'love', 'bicycle', 'marathon', 'logic', 'butterfly']  # replace, or add your own examples
example_polish_words = ['pies', 'smok', 'miłość', 'rower', 'maraton', 'logika', 'motyl']

example_words = example_polish_words

# Print the ten nearest neighbours of every example word with their similarity.
for w0 in example_words:
    print ('WORD:', w0)
    for w, v in task1_wv.most_similar(w0, topn=10):
        print ('   ', w, v)
    print ()

WORD: pies
    krowa 0.9014444947242737
    facet 0.8701364398002625
    liliputka 0.8605725765228271
    osesek 0.8509775400161743
    kot 0.848791241645813
    nordyk 0.8469008207321167
    panna 0.8351026773452759
    kura 0.826441764831543
    predator 0.8199484944343567
    wąż 0.8166723251342773

WORD: smok
    bogini 0.8793821930885315
    topielica 0.8707863092422485
    mrówkojad 0.8640605211257935
    krasnolud 0.8577755689620972
    straszydło 0.8460540771484375
    stwór 0.8394477963447571
    niedźwiedzica 0.8370437026023865
    afrodyta 0.836599588394165
    pitekantrop 0.8360117673873901
    toreador 0.8353720903396606

WORD: miłość
    twórczość 0.8362767100334167
    wiara 0.8238243460655212
    człowieczeństwo 0.8071744441986084
    uczucie 0.7987077236175537
    tęsknota 0.7985913753509521
    miłosierdzie 0.7920595407485962
    synostwo 0.7905794978141785
    uskrzydlenie 0.7892605066299438
    sen 0.7874357104301453
    konanie 0.7870968580245972

WORD: rower
    s

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim
# Baseline with gensim. `Word2Vec` here is gensim's class imported just above;
# that import rebinds the name of the numpy Word2Vec class defined earlier.
model = Word2Vec(corpus_file="task1_objects_contexts_polish.txt", vector_size=100, window=5, min_count=1,
                 workers=4, callbacks=[EpochLogger()])
model.save("task1_gensim.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Reload the gensim baseline saved by the previous cell (requires gensim's Word2Vec to be bound).
model = Word2Vec.load("task1_gensim.model")

In [None]:
# task1_wv = KeyedVectors.load_word2vec_format('task1_w2v_vectors_neg20_dim128_numpy_choice.txt', binary=False)
example_english_words = ['dog', 'dragon', 'love', 'bicycle', 'marathon', 'logic', 'butterfly']  # replace, or add your own examples
example_polish_words = ['pies', 'smok', 'miłość', 'rower', 'maraton', 'logika', 'motyl']

example_words = example_polish_words

# Same report as for the numpy model, but using the gensim model's vectors (`model.wv`).
for w0 in example_words:
    print ('WORD:', w0)
    for w, v in model.wv.most_similar(w0, topn=10):
        print ('   ', w, v)
    print ()

WORD: pies
    koń 0.9845697283744812
    dziewczyna 0.9662677645683289
    zwierzę 0.9631377458572388
    chłopiec 0.9611811637878418
    kot 0.9588338732719421
    ptak 0.9534725546836853
    chłopak 0.9449874758720398
    facet 0.944753885269165
    dziewczynka 0.9417502880096436
    chłopek 0.9391786456108093

WORD: smok
    baba 0.9924331307411194
    krowa 0.989021360874176
    szczur 0.988284707069397
    mucha 0.9871571063995361
    pszczoła 0.9853836894035339
    kota 0.9851759672164917
    kura 0.9841569662094116
    kamyk 0.9832960963249207
    słoń 0.9817646741867065
    niedźwiedź 0.981716513633728

WORD: miłość
    wiara 0.9597357511520386
    wyobraźnia 0.9333335757255554
    duch 0.932645857334137
    nadzieja 0.932121753692627
    radość 0.9288252592086792
    marzenie 0.9262442588806152
    umysł 0.9238408207893372
    szczęście 0.9171186685562134
    nienawiść 0.9145898818969727
    dusza 0.9100762605667114

WORD: rower
    telewizor 0.9882622957229614
    wózek 0.98

# Task 2 (4 points)

Your task is to train the embeddings for Simple Wikipedia titles, using gensim library. As the example below shows, training is really simple:

```python
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")
```
*sentences* can be a list of lists of tokens; you can also use *gensim.models.word2vec.LineSentence(source)* to create a restartable iterator from a file. At first, use [this file], which contains pairs of titles such that one article links to the other.

We say that two titles are *related* if they both contain a word (or a word bigram) which is not very popular (it occurs in only a few titles). Make this definition more precise, and create a corpus which contains pairs of related titles. Make a mixture of the original corpus and the new one, then train the title vectors again.

Compare these two approaches using similar code to the code from Task 1.

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim
# sentences = gensim.models.word2vec.LineSentence("simple.wiki.links.txt")
# model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
# Train title embeddings on the link corpus only and save the model.
model = Word2Vec(corpus_file="simple.wiki.links.txt", vector_size=100, window=5, min_count=1, workers=4) 
model.save("task2_links_only.model")

In [None]:
# Despite the `_wv` suffix this is the full model; its vectors are accessed as `task2_wv.wv`.
task2_wv = Word2Vec.load("task2_links_only.model")

In [None]:
# Titles used to inspect the links-only embeddings.
example_titles = ['statistics', 'harry_potter_and_the_order_of_the_phoenix', 'western_philosophy', 'crops', 'acceleration']

example_words = example_titles

# Print the ten nearest titles of every example title with their similarity.
for w0 in example_words:
    print ('WORD:', w0)
    for w, v in task2_wv.wv.most_similar(w0, topn=10):
        print ('   ', w, v)
    print ()

WORD: statistics
    machine 0.9958315491676331
    idea 0.9952155947685242
    vaccine 0.9942779541015625
    health 0.9935333728790283
    flight 0.993130087852478
    clock 0.9928908348083496
    communication 0.9923535585403442
    gospel 0.9921176433563232
    theory 0.991838812828064
    meteor 0.9916765093803406

WORD: harry_potter_and_the_order_of_the_phoenix
    louisiana_blues 0.9936421513557434
    parent 0.9931883811950684
    category:british_military_people 0.9930567741394043
    shirt 0.9930347800254822
    albus_dumbledore 0.9928532242774963
    low 0.9928193092346191
    a_minor 0.9927289485931396
    nine 0.9926779866218567
    matrix_function 0.9926573634147644
    interpretation 0.9926302433013916

WORD: western_philosophy
    unification_church 0.9958949685096741
    history_of_indonesia 0.995868980884552
    310_bc 0.9956936240196228
    khmer_empire 0.9955422282218933
    0s 0.9955031275749207
    ablai_khan 0.9954162836074829
    template:european_diasporas 0.99

In [None]:
import re
from itertools import permutations
from tqdm.notebook import tqdm
from collections import defaultdict
import numpy as np
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim

def get_unique_titles(filename):
    """Return every distinct whitespace-separated title found in `filename`.

    Titles are returned in order of first appearance, so the position of a
    title in the list (used as its id by the other helpers) is the same on
    every run, which a plain `set` does not guarantee across kernel restarts.

    Args:
        filename: path of a UTF-8 text file with whitespace-separated titles.

    Returns:
        List of unique titles.
    """
    # A dict keeps insertion order and gives set-like membership.
    titles = {}
    with open(filename, "r", encoding="utf8") as file:
        for line in tqdm(file):
            for title in line.split():
                titles.setdefault(title, None)
    return list(titles)

# def get_unigrams(filename):
#     unique_unigrams = set()
#     with open(filename, "r", encoding="utf8") as file:
#         for line in tqdm(file):
#             line = line.strip()
#             unigrams = re.split(" |_|:", line)
#             for unigram in unigrams:
#                 unique_unigrams.add(unigram)
#     return unique_unigrams

def count_unigrams(titles):
    """Build a reverse index from title tokens to the titles containing them.

    Each title is split on "_" and ":". Empty tokens, which the split produces
    for leading, trailing or adjacent separators, are ignored so that they do
    not link unrelated titles together.

    Args:
        titles: sequence of title strings; a title's position is its id.

    Returns:
        defaultdict(list) mapping each token to the list of title ids in which
        it occurs (an id appears once per occurrence of the token).
    """
    unigram_reverse_index = defaultdict(list)
    for i, title in enumerate(tqdm(titles)):
        for unigram in re.split("_|:", title):
            if not unigram:
                continue
            unigram_reverse_index[unigram].append(i)
    return unigram_reverse_index

def get_unigrams_below_threshold(unigram_reverse_index, threshold=5):
    """Keep only the tokens shared by more than one and at most `threshold` distinct titles.

    Returns:
        defaultdict(list) mapping each kept token to its de-duplicated title ids.
    """
    selected = defaultdict(list)
    for token, ids in tqdm(unigram_reverse_index.items()):
        distinct_ids = set(ids)
        # A token seen in a single title relates nothing; a frequent one is too generic.
        if not 1 < len(distinct_ids) <= threshold:
            continue
        selected[token] = list(distinct_ids)
    return selected

def create_pairs(titles, unigram_reverse_index_thresholded, filename):
    """Write every ordered pair of titles that share an indexed token to `filename`.

    One "title1 title2" line is written per pair, in both orders.
    """
    with open(filename, "w", encoding="utf8") as pairs_file:
        for _token, ids in tqdm(unigram_reverse_index_thresholded.items()):
            pairs_file.writelines(
                titles[first] + " " + titles[second] + "\n"
                for first, second in permutations(ids, 2)
            )
# combinations
# def write_unigram_counts_to_file(unigram_counts, filename):
#     with open(filename, "w", encoding="utf8") as file:
#         for unigram, count in tqdm(unigram_counts.items()):
#             file.write(unigram + " " + str(count) + "\n")
            
def create_mixed_corpus(corpus1_filename, corpus2_filename):
    """Load two whitespace-tokenised corpora and return their shuffled union.

    Args:
        corpus1_filename: path of the first UTF-8 corpus, one sentence per line.
        corpus2_filename: path of the second UTF-8 corpus, same format.

    Returns:
        List of token lists containing every line of both files, shuffled in
        place with numpy's global random generator.
    """
    mixed = []
    for corpus_filename in (corpus1_filename, corpus2_filename):
        # `with` closes each file; the original left both handles open.
        with open(corpus_filename, "r", encoding="utf8") as corpus_file:
            mixed.extend(line.strip().split() for line in corpus_file)
    np.random.shuffle(mixed)
    return mixed

Creating unigram pairs file

In [None]:
# Related-title corpus for threshold 5: titles sharing a token that occurs in
# 2 to 5 distinct titles are written as pairs.
titles = get_unique_titles("simple.wiki.links.txt")
unigram_reverse_index = count_unigrams(titles)
unigram_reverse_index_thresholded = get_unigrams_below_threshold(unigram_reverse_index, threshold=5)
create_pairs(titles, unigram_reverse_index_thresholded, "task2_unigram_links_thr5.txt")

In [None]:
# Related-title corpus for threshold 10. The titles and the reverse index do not
# depend on the threshold but are recomputed here.
titles = get_unique_titles("simple.wiki.links.txt")
unigram_reverse_index = count_unigrams(titles)
unigram_reverse_index_thresholded = get_unigrams_below_threshold(unigram_reverse_index, threshold=10)
create_pairs(titles, unigram_reverse_index_thresholded, "task2_unigram_links_thr10.txt")

In [None]:
# Related-title corpus for threshold 15. The titles and the reverse index do not
# depend on the threshold but are recomputed here.
titles = get_unique_titles("simple.wiki.links.txt")
unigram_reverse_index = count_unigrams(titles)
unigram_reverse_index_thresholded = get_unigrams_below_threshold(unigram_reverse_index, threshold=15)
create_pairs(titles, unigram_reverse_index_thresholded, "task2_unigram_links_thr15.txt")

In [None]:
# Related-title corpus for threshold 20. The titles and the reverse index do not
# depend on the threshold but are recomputed here.
titles = get_unique_titles("simple.wiki.links.txt")
unigram_reverse_index = count_unigrams(titles)
unigram_reverse_index_thresholded = get_unigrams_below_threshold(unigram_reverse_index, threshold=20)
create_pairs(titles, unigram_reverse_index_thresholded, "task2_unigram_links_thr20.txt")

Creating corpora and training

In [None]:
# Shuffled mixture of the link corpus and the threshold-5 related-title pairs, held in memory.
sentences = create_mixed_corpus("simple.wiki.links.txt", "task2_unigram_links_thr5.txt")

In [None]:
# Train on the threshold-5 mixture with a fresh logger (so epoch numbers start at 0) and save.
epoch_logger = EpochLogger()
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4, callbacks=[epoch_logger])
model.save("task2_mixed_thr5.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Reload the threshold-5 mixed model for inspection.
model = Word2Vec.load("task2_mixed_thr5.model")

In [None]:
# Same example titles as for the links-only model, to allow a side-by-side comparison.
example_titles = ['statistics', 'harry_potter_and_the_order_of_the_phoenix', 'western_philosophy', 'crops', 'acceleration']

example_words = example_titles

# Print the ten nearest titles according to whichever model `model` currently holds.
for w0 in example_words:
    print ('WORD:', w0)
    for w, v in model.wv.most_similar(w0, topn=10):
        print ('   ', w, v)
    print ()

WORD: statistics
    vaccine 0.9946828484535217
    perihelion 0.9933329224586487
    idea 0.9931748509407043
    astrology 0.9928882122039795
    camera 0.9925744533538818
    communication 0.9924458265304565
    1004 0.9924340844154358
    night 0.9920859932899475
    37 0.9920130372047424
    47_bc 0.9919556379318237

WORD: harry_potter_and_the_order_of_the_phoenix
    harry_potter_and_the_half-blood_prince 0.9963700771331787
    red_river 0.996210515499115
    town_rights 0.9962059259414673
    template:politics_of_iran 0.9960891604423523
    template:ethnic_groups_of_russia 0.9960600137710571
    bass 0.9959389567375183
    henry_king 0.9958997368812561
    category:brazilian_lawyers 0.995841383934021
    template:campaignbox_korean_war 0.9958338141441345
    düsseldorf_school_of_painting 0.9958128929138184

WORD: western_philosophy
    merchant_vessel 0.9962515830993652
    holguín_province 0.9954962730407715
    joongang_ilbo 0.9951223731040955
    north_sulawesi 0.9949187040328

In [None]:
# Build the threshold-20 mixture, train on it with a fresh logger and save the model.
sentences = create_mixed_corpus("simple.wiki.links.txt", "task2_unigram_links_thr20.txt")
epoch_logger = EpochLogger()
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4, callbacks=[epoch_logger])
model.save("task2_mixed_thr20.model")

Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end


In [None]:
# NOTE(review): this loads the links-only model although the previous cell trained
# and saved "task2_mixed_thr20.model"; confirm which model the report below is meant to show.
model = Word2Vec.load("task2_links_only.model")

In [None]:
# Same example titles as in the earlier reports.
example_titles = ['statistics', 'harry_potter_and_the_order_of_the_phoenix', 'western_philosophy', 'crops', 'acceleration']

example_words = example_titles

# Print the ten nearest titles according to whichever model `model` currently holds.
for w0 in example_words:
    print ('WORD:', w0)
    for w, v in model.wv.most_similar(w0, topn=10):
        print ('   ', w, v)
    print ()

WORD: statistics
    communication 0.9970863461494446
    night 0.9969813227653503
    billion 0.9969518780708313
    vaccine 0.9968886375427246
    photography 0.9963081479072571
    man 0.9961920380592346
    translation 0.9960965514183044
    poem 0.9960731863975525
    995 0.9960111975669861
    antioch 0.9958942532539368

WORD: harry_potter_and_the_order_of_the_phoenix
    bullying 0.9964322447776794
    lie 0.99638432264328
    doctor_who_companions 0.9960330724716187
    eve 0.9958974719047546
    recipe 0.9956963658332825
    changsha 0.9956899285316467
    rubik's_cube 0.9956322312355042
    hello 0.9955694079399109
    duck_family_(disney) 0.9955654740333557
    harry_potter_and_the_goblet_of_fire 0.9955259561538696

WORD: western_philosophy
    hamad_bin_khalifa_al_thani 0.9969141483306885
    jan_peter_balkenende 0.9963982105255127
    tsering_woeser 0.9963878989219666
    horse_racing 0.9963662028312683
    pakistan_international_airlines 0.9961951971054077
    kyshtym_dis

# Task 3 (4 points)

Suppose that we have two languages: Upper and Lower. This is an example Upper sentence:

<pre>
THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG.
</pre>

And this is its translation into Lower:

<pre>
the quick brown fox jumps over the lazy dog
</pre>

You have two corpora for these languages (with different sentences). Your task is to train word embeddings for both languages together, so that the embeddings of words which are translations of each other are as close as possible. Unfortunately, your budget allows you to prepare translations for only 1000 words (we call this set D; you have to decide which words you want to be in D).

Prepare a corpus which contains three kinds of sentences:
* Upper corpus sentences
* Lower corpus sentences
* sentences derived from Upper/Lower corpus, modified using D

There are many possible ways of doing this, for instance this one (ROT13.COM: hfr rirel fragrapr sebz obgu pbecben gjvpr: jvgubhg nal zbqvsvpngvbaf, naq jvgu rirel jbeqf sebz Q ercynprq ol vgf genafyngvba)

We define the score for an Upper WORD as  $\frac{1}{p}$, where $p$ is a position of its translation in the list of **Lower** words most similar to WORD. For instance, when most similar words to DOG are:

<pre>
WOLF, CAT, WOLVES, LION, gopher, dog
</pre>

then the score for the word DOG is 0.5. Compute the average score separately for words from D, and for words out of D (hint: if the computation takes too much time, do it for a random sample).


In [None]:
import unicodedata

In [None]:
def count_bigrams(lower_file, upper_file):
    """Rank the words of both corpora by their combined frequency.

    Despite its name, the function counts single whitespace-separated tokens.
    Tokens of the Upper corpus are lowercased before counting; tokens made up
    only of punctuation characters and tokens consisting only of digits are
    skipped.

    Returns:
        List of words sorted by decreasing count; words with equal counts keep
        the order in which they were first seen.
    """
    punctations = ["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"]
    word_to_count = defaultdict(int)

    def tally(lines, lowercase):
        # Add every countable token of `lines` to the shared counter.
        for raw_line in lines:
            text = raw_line.strip()
            if lowercase:
                text = text.lower()
            for word in text.split():
                only_punctuation = all(unicodedata.category(c) in punctations for c in word)
                if only_punctuation or word.isdigit():
                    continue
                word_to_count[word] += 1

    with open(lower_file, "r", encoding="utf8") as lf, open(upper_file, "r", encoding="utf8") as uf:
        tally(tqdm(lf), lowercase=False)
        tally(tqdm(uf), lowercase=True)

    # The stable sort keeps first-seen order among equally frequent words.
    return sorted(word_to_count, key=word_to_count.get, reverse=True)

def create_new_corpora(words, lower_filename, upper_filename, target_filename):
    """Write the joint training corpus for the Upper/Lower embedding task.

    The target file receives, in this order: every Lower sentence, every Upper
    sentence, then a modified copy of each of them in which the words from the
    dictionary `words` are replaced by their translation (uppercased in Lower
    sentences, lowercased in Upper sentences).
    """
    dictionary = set(words)

    def lower_to_mixed(sentence):
        # Uppercase only the Lower tokens that are in the dictionary.
        return " ".join(tok.upper() if tok in dictionary else tok for tok in sentence.split())

    def upper_to_mixed(sentence):
        # Lowercase only the Upper tokens whose lowercase form is in the dictionary.
        return " ".join(tok.lower() if tok.lower() in dictionary else tok for tok in sentence.split())

    derived = []
    with open(lower_filename, "r", encoding="utf8") as lf:
        with open(upper_filename, "r", encoding="utf8") as uf:
            with open(target_filename, "w", encoding="utf8") as target_file:
                for source, translate in ((lf, lower_to_mixed), (uf, upper_to_mixed)):
                    for raw_line in tqdm(source):
                        sentence = raw_line.strip()
                        target_file.write(sentence+"\n")
                        derived.append(translate(sentence))
                # Derived sentences go after all original ones.
                target_file.writelines(sentence+"\n" for sentence in derived)

def calc_score_for_d(words, model, topn=1000):
    """Score how well translations are retrieved for the dictionary words.

    For every lowercase word two reciprocal ranks are computed:
    the rank of its uppercase form among the uppercase neighbours of the
    lowercase word, and the rank of the lowercase word among the lowercase
    neighbours of its uppercase form. A translation that does not appear in
    the `topn` nearest neighbours scores 0.

    Args:
        words: lowercase dictionary words; both forms are expected to be in the
            model's vocabulary (a missing key propagates the model's error).
        model: object exposing `wv.most_similar(word, topn=...)`.
        topn: number of neighbours inspected per query.

    Returns:
        ``(scores, average)`` where `scores` holds two entries per word and
        `average` is their mean (0.0 when `words` is empty).
    """
    def reciprocal_rank(query, is_candidate, is_translation):
        # 1/position of the translation among neighbours of the other language.
        rank = 0
        for neighbour, _similarity in model.wv.most_similar(query, topn=topn):
            if is_candidate(neighbour):
                rank += 1
                if is_translation(neighbour):
                    return 1 / rank
        return 0

    scores = []
    for word in tqdm(words):
        upper_word = word.upper()
        # Lower -> Upper direction.
        scores.append(reciprocal_rank(
            word, lambda w: w.isupper(), lambda w: w.lower() == word))
        # Upper -> Lower direction; the rank counts lowercase neighbours only.
        scores.append(reciprocal_rank(
            upper_word, lambda w: w.islower(), lambda w: w.upper() == upper_word))
    average = sum(scores) / len(scores) if scores else 0.0
    return scores, average

def calc_score_for_outside_d(words, allwords, model, n_samples=1000, topn=1000):
    """Twin-retrieval score for a random sample of words outside ``words``.

    Candidates are the entries of ``allwords`` that are not in ``words`` and
    consist only of letters and underscores. Up to ``n_samples`` of them are
    drawn without replacement via ``np.random.choice`` (unseeded here, so the
    sample differs between runs). Each sampled word contributes two scores,
    computed as in ``calc_score_for_d``: one for the lower-case query and one
    for its upper-case form. A query missing from the model vocabulary
    scores ``0``.

    Args:
        words: words to exclude from sampling.
        allwords: full list of candidate words.
        model: object exposing ``model.wv.most_similar(word, topn=...)``.
        n_samples: maximum number of words to sample; capped at the number
            of available candidates.
        topn: number of neighbours inspected per query.

    Returns:
        ``(scores, mean)`` with two scores per sampled word; ``mean`` is
        ``0.0`` when nothing could be sampled.
    """
    def twin_score(query, is_twin_case, to_query_case):
        # Only an out-of-vocabulary query is tolerated; any other error is a
        # real problem and must surface instead of silently scoring 0.
        try:
            neighbours = model.wv.most_similar(query, topn=topn)
        except KeyError:
            return 0
        rank = 0
        for neighbour, _similarity in neighbours:
            if is_twin_case(neighbour):
                rank += 1
                if to_query_case(neighbour) == query:
                    return 1 / rank
        return 0

    excluded = set(words)
    candidates = [
        word for word in allwords
        if word not in excluded and all(c == '_' or c.isalpha() for c in word)
    ]
    if not candidates:
        return [], 0.0

    # replace=False cannot draw more items than exist, so cap the request.
    sample_size = min(n_samples, len(candidates))
    sampled_words = np.random.choice(candidates, sample_size, replace=False)

    scores = []
    for word in tqdm(sampled_words):
        word = str(word)  # numpy string scalar -> plain str
        scores.append(twin_score(word, lambda s: s.isupper(), lambda s: s.lower()))
        scores.append(twin_score(word.upper(), lambda s: s.islower(), lambda s: s.upper()))
    mean = sum(scores) / len(scores) if scores else 0.0
    return scores, mean

In [None]:
# Collect the word list from both corpora (count_bigrams is expected to be defined in an
# earlier cell), then keep its first 1000 entries as the set D of words that get case-swapped.
# NOTE(review): presumably the list is ordered by frequency — confirm in count_bigrams.
allwords = count_bigrams("task3_polish_lower.txt", "task3_polish_upper.txt")
words = allwords[:1000]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Eyeball the selected words.
# NOTE(review): this prints the whole slice (up to 1000 entries); `words[:20]` would keep the output readable.
print(words)

['w', 'ten', 'i', 'być', 'na', 'z', 'do', 'się', 'nie_on', 'on', 'o', 'że', 'który', 'a', 'przez', 'ustawa', 'od', 'r.', 'rok_rok_roki', 'oraz', 'za', 'dla', 'po', 'zostać', 'czy', 'jak', 'dzień', 'pan', 'praca', 'co', 'taki', 'projekt', 'państwo', 'osoba', 'minister', 'polski', 'ale', 'mieć_mój', 'wszystki', 'inny', 'również', 'tak', 'móc', 'zmiana', 'komisja', 'ze', 'także', 'sprawa', 'może_móc', 'poseł', 'swój', 'bardzo', 'być_były', 'bycie_być', 'tylko', 'czas', 'ten_tychy', 'chcieć', 'już', 'pierwszy', 'lub_lubić', 'lato_rok', 'zakres', 'tenże_też', 'nowy', 'jednak', 'pytanie', 'działanie', 'miejsce', 'nasz', 'środek', 'art.', 'przepis', 'jaki', 'polska_polski', 'rząd', 'poprawka', 'jako', 'sam', 'należeć', 'przy', 'związek', 'wiele', 'publiczny', 'sytuacja', 'nad', 'sejm', 'warszawa', 'aby', 'możliwość', 'społeczny', 'dotyczący', 'wniosek', 'cel', 'można_możny', 'system', 'powinien', 'europejski', 'strona', 'ministerstwo', 'bo', 'dwa', 'prawo', 'uwaga', 'każdy', 'informacja', 'pr

In [None]:
# Write the two original corpora followed by their case-swapped copies
# (only words from `words` are swapped) into task3_append_corpora.txt.
create_new_corpora(words, "task3_polish_lower.txt", "task3_polish_upper.txt", "task3_append_corpora.txt")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Train Word2Vec on the mixed corpus (EpochLogger prints epoch start/end) and persist the model.
# NOTE(review): "task3_mixed_corpora.txt" is not written by any cell visible in this part of the
# notebook (the create_new_corpora call above targets "task3_append_corpora.txt") — presumably it
# was produced by an earlier run; confirm the file exists before Restart & Run All.
model = Word2Vec(corpus_file="task3_mixed_corpora.txt", vector_size=100, window=5, min_count=1, workers=4, callbacks=[EpochLogger()])
model.save("task3.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Reload the saved model so the cells below can be run without retraining.
model = Word2Vec.load("task3.model")

In [None]:
# Show the ten nearest neighbours for a small slice of the selected words.
example_words = words[100:120]

for query in example_words:
    print('WORD:', query)
    neighbours = model.wv.most_similar(query, topn=10)
    for neighbour, similarity in neighbours:
        print('   ', neighbour, similarity)
    print()

WORD: bo
    ponieważ 0.8838642835617065
    ale 0.8804209232330322
    choć 0.8241131901741028
    oczywiście 0.8039419651031494
    natomiast 0.7966280579566956
    gdyż 0.790541410446167
    niestety 0.787319004535675
    żeby 0.7857784628868103
    że 0.7715550065040588
    który 0.768311619758606

WORD: dwa
    trzy 0.9646552801132202
    cztery 0.9296600222587585
    kilka 0.8342490196228027
    kilku 0.7948956489562988
    kolejny 0.7837705612182617
    jeden 0.7439212799072266
    następny 0.7108233571052551
    oba 0.6999877095222473
    trzeci 0.6966206431388855
    pierwszy 0.6923424601554871

WORD: prawo
    prawo_prawy 0.8459263443946838
    uprawnienie 0.6964763402938843
    prawić_prawo 0.6592034101486206
    przepis 0.6454287767410278
    zasada 0.6298079490661621
    prawie_prawo 0.6043374538421631
    konstytucja 0.5995500087738037
    obowiązek 0.5932927131652832
    możliwość 0.5709974765777588
    wymiar 0.568674623966217

WORD: uwaga
    wzgląd 0.6882945895195007


In [None]:
# Evaluate how well the words from D and their case-swapped twins retrieve each other
# among the top 1000 neighbours; the per-query list is discarded, only the mean is shown.
_, v = calc_score_for_d(words, model, topn=1000)
v

  0%|          | 0/1000 [00:00<?, ?it/s]

0.29150051446646585

In [None]:
# Same evaluation for 1000 words sampled from outside D.
# The helper samples with np.random.choice and no seed is set here, so the value varies between runs.
_, v = calc_score_for_outside_d(words, allwords, model, n_samples=1000, topn=1000)
v

  0%|          | 0/1000 [00:00<?, ?it/s]

0.42028420553812745

In [None]:
# Retrain on the file written by the create_new_corpora call above and persist the result.
# `model` is rebound here, so every following cell evaluates this model rather than task3.model.
model = Word2Vec(corpus_file="task3_append_corpora.txt", vector_size=100, window=5, min_count=1, workers=4, callbacks=[EpochLogger()])
model.save("task3_append.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Disabled on purpose: uncomment to reload the saved model instead of retraining it above.
# model = Word2Vec.load("task3_append.model")

In [None]:
# Nearest neighbours of the same example slice, now for the model currently bound to `model`.
example_words = words[100:120]

for query in example_words:
    print('WORD:', query)
    neighbours = model.wv.most_similar(query, topn=10)
    for neighbour, similarity in neighbours:
        print('   ', neighbour, similarity)
    print()

WORD: bo
    ponieważ 0.8948083519935608
    ale 0.8685998320579529
    choć 0.8286816477775574
    oczywiście 0.8150435090065002
    niestety 0.8033995032310486
    natomiast 0.7966384291648865
    gdyż 0.793933629989624
    że 0.779845118522644
    który 0.7764931917190552
    otóż 0.7750731706619263

WORD: dwa
    trzy 0.9663321375846863
    cztery 0.9291338324546814
    kilka 0.8316941857337952
    kilku 0.8042236566543579
    kolejny 0.759882390499115
    jeden 0.7408120036125183
    następny 0.7153948545455933
    jeden_jedny 0.7129083871841431
    ostatni 0.6951188445091248
    oba 0.6892820000648499

WORD: prawo
    prawo_prawy 0.852893054485321
    uprawnienie 0.7194205522537231
    prawić_prawo 0.6411972045898438
    przepis 0.6359310746192932
    zasada 0.617060661315918
    obowiązek 0.6042078137397766
    kodeks 0.5901337265968323
    konstytucja 0.5839157104492188
    prawie_prawo 0.5768797993659973
    możliwość 0.5655156970024109

WORD: uwaga
    wzgląd 0.68783199787139

In [None]:
# Twin-retrieval score on the words from D for the model currently bound to `model`;
# only the mean is displayed.
_, v = calc_score_for_d(words, model, topn=1000)
v

  0%|          | 0/1000 [00:00<?, ?it/s]

0.33330897538816273

In [None]:
# Twin-retrieval score on 1000 words sampled from outside D (unseeded sample, so not
# directly comparable run-to-run with the value obtained for the other model).
_, v = calc_score_for_outside_d(words, allwords, model, n_samples=1000, topn=1000)
v

  0%|          | 0/1000 [00:00<?, ?it/s]

0.4152741299820696

# Task 4 (4 points)

In this task you are asked to do two things:
1. Compare the embeddings computed on a small corpus (like the Brown Corpus, see: <https://en.wikipedia.org/wiki/Brown_Corpus>) with the ones coming from the Google News Corpus
2. Try to use other resources like WordNet to enrich the corpus, and obtain better embeddings

You can use the following code snippets:

```python
# printing tokenized Brown Corpora
from nltk.corpus import brown
for s in brown.sents():
    print(*s)
    
#iterating over all synsets in WordNet
from nltk.corpus import wordnet as wn

for synset_type in 'avrns': # n == noun, v == verb, ...
    for synset in list(wn.all_synsets(synset_type))[:10]:
        print (synset.definition())
        print (synset.examples())
        print ([lem.name() for lem in synset.lemmas()])
        print (synset.hypernyms()) # nodes 1 level up in ontology
        
# loading model and compute cosine similarity between words

model = Word2Vec.load('models/w2v.wordnet5.model') 
print (model.wv.similarity('dog', 'cat'))
```

Embeddings will be tested using WordSim-353 dataset, the code showing the quality is in the cell below. Prepare the following corpora:
1. Tokenized Brown Corpora
2. Definitions and examples from Princeton WordNet
3. (1) and (2) together
4. (3) enriched with pseudosentences containing (a subset) of WordNet knowledge (such as 'tiger is a carnivore')

Train 4 Word2Vec models, and report the Spearman correlation between similarities based on your vectors and similarities based on human judgements.



In [None]:
# NOTE(review): gensim is already imported at the top of this notebook, so an upgrade performed
# here only takes effect after a kernel restart; consider moving this to the first cell and pinning a version.
!pip install --upgrade gensim



## Google News Corpus

In [None]:
# Code for computing correlation between W2V similarity, and human judgements

import gensim.downloader
from scipy.stats import spearmanr
# Fetch the pretrained Google News vectors through gensim's downloader.
# NOTE(review): presumably a large download on first use — confirm before Restart & Run All.
gn = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Correlate Google News similarities with the WordSim-353 human scores.
for similarity_type in ['relatedness', 'similarity']:
    ws353 = []  # gold-standard triples (word_a, word_b, human_score)
    vals = []   # human scores
    ys = []     # model similarities, aligned with `vals`
    # `with` closes the gold-standard file deterministically; a bare open() in the
    # loop header leaves the handle to the garbage collector.
    with open(f'task4_wordsim_{similarity_type}_goldstandard.txt') as gold_file:
        for x in gold_file:
            if not x.strip():
                continue  # tolerate blank lines instead of failing on the unpacking below
            a, b, val = x.split()
            val = float(val)
            ws353.append((a, b, val))
            vals.append(val)
            ys.append(gn.similarity(a, b))
    # spearmanr returns 2 values: correlation and pval. pval should be close to zero

    print(similarity_type + ':', spearmanr(vals, ys))

relatedness: SpearmanrResult(correlation=0.6354514099606292, pvalue=6.623359420760354e-30)
similarity: SpearmanrResult(correlation=0.7717239276951675, pvalue=2.2385731613138314e-41)


## Brown Corpora

In [None]:
import nltk
from nltk.corpus import brown
nltk.download('brown')

# Dump the tokenized Brown corpus to disk: one sentence per line, tokens joined by single spaces.
with open("brown_sentences.txt", "w", encoding="utf8") as brown_file:
    brown_file.writelines(" ".join(tokens) + "\n" for tokens in brown.sents())

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
# Train a 100-dimensional Word2Vec model on the Brown sentences file and save it.
# EpochLogger (defined at the top of the notebook) prints the start/end of every epoch.
model = Word2Vec(corpus_file="brown_sentences.txt", vector_size=100, window=5, min_count=1, workers=4, callbacks=[EpochLogger()])
model.save("brown_sentences.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Reload the Brown-only model so the evaluation below can run without retraining.
model = Word2Vec.load("brown_sentences.model")

In [None]:
# `similarity` raises KeyError when a word is missing from the model's vocabulary
# (at least one of the two words below is unknown to the Brown-only model).
# Decision: the evaluation cells skip every gold-standard pair that contains an unknown word.
# The error is caught and displayed so this demonstration does not abort Restart & Run All.
try:
    print(model.wv.similarity('OPEC', 'country'))
except KeyError as err:
    print('KeyError:', err)

KeyError: ignored

In [None]:
from scipy.stats import spearmanr

# Correlate the current model's similarities with the WordSim-353 human scores.
for similarity_type in ['relatedness', 'similarity']:
    ws353 = []  # every gold-standard triple (word_a, word_b, human_score)
    vals = []   # human scores of the pairs the model could score
    ys = []     # model similarities, aligned with `vals`
    # `with` closes the gold-standard file deterministically; a bare open() in the
    # loop header leaves the handle to the garbage collector.
    with open(f'task4_wordsim_{similarity_type}_goldstandard.txt') as gold_file:
        for x in gold_file:
            if not x.strip():
                continue  # tolerate blank lines instead of failing on the unpacking below
            a, b, val = x.split()
            val = float(val)
            ws353.append((a, b, val))
            try:
                ys.append(model.wv.similarity(a, b))
                vals.append(val)
            except KeyError:
                # a word of the pair is out of vocabulary: skip the pair
                pass
    # spearmanr returns 2 values: correlation and pval. pval should be close to zero

    print(similarity_type + ':', spearmanr(vals, ys))
    # Pairs with unknown words are dropped, so state how many pairs the correlation is based on.
    print(f'  scored {len(vals)} of {len(ws353)} pairs')

relatedness: SpearmanrResult(correlation=0.08412994886846215, pvalue=0.20168258853600937)
similarity: SpearmanrResult(correlation=0.10166031487280731, pvalue=0.17207591679340606)


## Wordnet

In [None]:
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

# Write every synset's definition and usage examples, one text per line.
with open("wordnet_sentences.txt", "w", encoding="utf8") as wordnet_file:
    for pos in 'avrns':  # n == noun, v == verb, ...
        for synset in wn.all_synsets(pos):
            synset_lines = [synset.definition() + "\n"]
            synset_lines.extend(example + " \n" for example in synset.examples())
            wordnet_file.writelines(synset_lines)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Train a 100-dimensional Word2Vec model on the WordNet definitions/examples file and save it.
# `model` is rebound, so the evaluation below refers to this model.
model = Word2Vec(corpus_file="wordnet_sentences.txt", vector_size=100, window=5, min_count=1, workers=4, callbacks=[EpochLogger()])
model.save("wordnet_sentences.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Reload the WordNet-only model so the evaluation below can run without retraining.
model = Word2Vec.load("wordnet_sentences.model")

In [None]:
from scipy.stats import spearmanr

# Correlate the current model's similarities with the WordSim-353 human scores.
for similarity_type in ['relatedness', 'similarity']:
    ws353 = []  # every gold-standard triple (word_a, word_b, human_score)
    vals = []   # human scores of the pairs the model could score
    ys = []     # model similarities, aligned with `vals`
    # `with` closes the gold-standard file deterministically; a bare open() in the
    # loop header leaves the handle to the garbage collector.
    with open(f'task4_wordsim_{similarity_type}_goldstandard.txt') as gold_file:
        for x in gold_file:
            if not x.strip():
                continue  # tolerate blank lines instead of failing on the unpacking below
            a, b, val = x.split()
            val = float(val)
            ws353.append((a, b, val))
            try:
                ys.append(model.wv.similarity(a, b))
                vals.append(val)
            except KeyError:
                # a word of the pair is out of vocabulary: skip the pair
                pass
    # spearmanr returns 2 values: correlation and pval. pval should be close to zero

    print(similarity_type + ':', spearmanr(vals, ys))
    # Pairs with unknown words are dropped, so state how many pairs the correlation is based on.
    print(f'  scored {len(vals)} of {len(ws353)} pairs')

relatedness: SpearmanrResult(correlation=0.30314952838601944, pvalue=1.0894969307572987e-06)
similarity: SpearmanrResult(correlation=0.39139285389932144, pvalue=1.0894677233239208e-08)


## Brown Corpora and Wordnet

In [None]:
import numpy as np

# Pool the WordNet and Brown sentence files (in that order), shuffle the lines,
# and write the result as the combined training corpus.
sentences = []
for source_path in ("wordnet_sentences.txt", "brown_sentences.txt"):
    with open(source_path, "r", encoding="utf8") as source_file:
        sentences.extend(source_file)

np.random.shuffle(sentences)
with open("brown_wordnet_sentences.txt", "w", encoding="utf8") as brown_wordnet_file:
    brown_wordnet_file.writelines(sentences)

In [None]:
# Train a 100-dimensional Word2Vec model on the shuffled Brown + WordNet corpus and save it.
# `model` is rebound, so the evaluation below refers to this model.
model = Word2Vec(corpus_file="brown_wordnet_sentences.txt", vector_size=100, window=5, min_count=1, workers=4, callbacks=[EpochLogger()])
model.save("brown_wordnet_sentences.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Reload the Brown + WordNet model so the evaluation below can run without retraining.
model = Word2Vec.load("brown_wordnet_sentences.model")

In [None]:
from scipy.stats import spearmanr

# Correlate the current model's similarities with the WordSim-353 human scores.
for similarity_type in ['relatedness', 'similarity']:
    ws353 = []  # every gold-standard triple (word_a, word_b, human_score)
    vals = []   # human scores of the pairs the model could score
    ys = []     # model similarities, aligned with `vals`
    # `with` closes the gold-standard file deterministically; a bare open() in the
    # loop header leaves the handle to the garbage collector.
    with open(f'task4_wordsim_{similarity_type}_goldstandard.txt') as gold_file:
        for x in gold_file:
            if not x.strip():
                continue  # tolerate blank lines instead of failing on the unpacking below
            a, b, val = x.split()
            val = float(val)
            ws353.append((a, b, val))
            try:
                ys.append(model.wv.similarity(a, b))
                vals.append(val)
            except KeyError:
                # a word of the pair is out of vocabulary: skip the pair
                pass
    # spearmanr returns 2 values: correlation and pval. pval should be close to zero

    print(similarity_type + ':', spearmanr(vals, ys))
    # Pairs with unknown words are dropped, so state how many pairs the correlation is based on.
    print(f'  scored {len(vals)} of {len(ws353)} pairs')

relatedness: SpearmanrResult(correlation=0.25765754255958223, pvalue=3.6015207263190124e-05)
similarity: SpearmanrResult(correlation=0.4496753545181397, pvalue=1.9004113566906157e-11)


## Brown Corpora and Wordnet enriched with pseudosentences containing (a subset) of WordNet knowledge (such as 'tiger is a carnivore')

In [None]:
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

# Emit one "<lemma> is <hypernym lemma>" pseudosentence for every pairing of a synset's
# lemmas with the lemmas of its direct hypernyms.
with open("wordnet_pseudosentences.txt", "w", encoding="utf8") as wordnet_file:
    for pos in 'avrns':  # n == noun, v == verb, ...
        for synset in wn.all_synsets(pos):
            hypernym_names = [
                str(parent_lemma.name())
                for parent in synset.hypernyms()
                for parent_lemma in parent.lemmas()
            ]
            wordnet_file.writelines(
                lem.name() + " is " + hypernym_name + "\n"
                for lem in synset.lemmas()
                for hypernym_name in hypernym_names
            )

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Variant of the previous cell: pairs each lemma with each direct-hypernym lemma, but writes
# "<lemma> <hypernym>" without the " is " connector.
# NOTE(review): this cell overwrites the same "wordnet_pseudosentences.txt" written by the cell
# above, so whichever of the two ran last decides the training corpus. The sample printed at the
# end of the notebook ('respire is undergo') matches the " is " variant, i.e. a top-to-bottom run
# would not reproduce it — confirm which variant the reported scores belong to.
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

with open("wordnet_pseudosentences.txt", "w", encoding="utf8") as wordnet_file:
    for synset_type in 'avrns': # n == noun, v == verb, ...
        for synset in wn.all_synsets(synset_type):
            # names of this synset's own lemmas
            lemmas = [lem.name() for lem in synset.lemmas()]
            # lemma names of every direct hypernym (one level up)
            hypernyms = [str(lemma.name()) for hypernym in synset.hypernyms() for lemma in hypernym.lemmas()]
            for lemma in lemmas:
                for hypernym in hypernyms:
                    wordnet_file.write(lemma + " " + hypernym + "\n")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import numpy as np

# Pool WordNet texts, Brown sentences and WordNet pseudosentences (in that order),
# shuffle the lines, and write the enriched training corpus.
sentences = []
for source_path in ("wordnet_sentences.txt", "brown_sentences.txt", "wordnet_pseudosentences.txt"):
    with open(source_path, "r", encoding="utf8") as source_file:
        sentences.extend(source_file)

np.random.shuffle(sentences)
with open("brown_wordnet_pseudosentences.txt", "w", encoding="utf8") as brown_wordnet_file:
    brown_wordnet_file.writelines(sentences)

In [None]:
# Train a 100-dimensional Word2Vec model on the corpus enriched with pseudosentences and save it.
# `model` is rebound, so the evaluation below refers to this model.
model = Word2Vec(corpus_file="brown_wordnet_pseudosentences.txt", vector_size=100, window=5, min_count=1, workers=4, callbacks=[EpochLogger()])
model.save("brown_wordnet_pseudosentences.model")

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [None]:
# Reload the enriched-corpus model so the evaluation below can run without retraining.
model = Word2Vec.load("brown_wordnet_pseudosentences.model")

In [None]:
from scipy.stats import spearmanr

# Correlate the current model's similarities with the WordSim-353 human scores.
for similarity_type in ['relatedness', 'similarity']:
    ws353 = []  # every gold-standard triple (word_a, word_b, human_score)
    vals = []   # human scores of the pairs the model could score
    ys = []     # model similarities, aligned with `vals`
    # `with` closes the gold-standard file deterministically; a bare open() in the
    # loop header leaves the handle to the garbage collector.
    with open(f'task4_wordsim_{similarity_type}_goldstandard.txt') as gold_file:
        for x in gold_file:
            if not x.strip():
                continue  # tolerate blank lines instead of failing on the unpacking below
            a, b, val = x.split()
            val = float(val)
            ws353.append((a, b, val))
            try:
                ys.append(model.wv.similarity(a, b))
                vals.append(val)
            except KeyError:
                # a word of the pair is out of vocabulary: skip the pair
                pass
    # spearmanr returns 2 values: correlation and pval. pval should be close to zero

    print(similarity_type + ':', spearmanr(vals, ys))
    # Pairs with unknown words are dropped, so state how many pairs the correlation is based on.
    print(f'  scored {len(vals)} of {len(ws353)} pairs')

relatedness: SpearmanrResult(correlation=0.25167330737989807, pvalue=5.5198604348450965e-05)
similarity: SpearmanrResult(correlation=0.47384148410064225, pvalue=9.30227478109151e-13)


## Random code

In [None]:
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

# Scratch cell: inspect the first ten synsets of every part of speech.
for pos in 'avrns':  # n == noun, v == verb, ...
    first_synsets = list(wn.all_synsets(pos))[:10]
    for synset in first_synsets:
        lemma_names = [lem.name() for lem in synset.lemmas()]
        # lemma names of the nodes 1 level up in the ontology
        hypernym_lemma_names = [
            str(parent_lemma.name())
            for parent in synset.hypernyms()
            for parent_lemma in parent.lemmas()
        ]
        print(synset.definition())
        print(synset.examples())
        print(lemma_names)
        print(hypernym_lemma_names)
        print("-"*50)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
(usually followed by `to') having the necessary means or skill or know-how or authority to do something
['able to swim', 'she was able to program her computer', 'we were at last able to buy a car', 'able to get a grant for the project']
['able']
[]
--------------------------------------------------
(usually followed by `to') not having the necessary means or skill or know-how
['unable to get to town without a car', 'unable to obtain funds']
['unable']
[]
--------------------------------------------------
facing away from the axis of an organ or organism
['the abaxial surface of a leaf is the underside or side facing away from the stem']
['abaxial', 'dorsal']
[]
--------------------------------------------------
nearest to or facing toward the axis of an organ or organism
['the upper side of a leaf is known as the adaxial surface']
['adaxial', 'ventral']
[]
----------------

In [None]:
# Scratch cell: show the first three lines of the generated pseudosentence file.
with open("wordnet_pseudosentences.txt", "r", encoding="utf8") as wordnet_file:
    line, line2, line3 = (wordnet_file.readline() for _ in range(3))
line, line2, line3

('respire is undergo\n', 'respire is breathe\n', 'respire is take_a_breath\n')