In [1]:
# This class comes from the thread https://github.com/facebookresearch/fastText/pull/552
# The Python API did not offer nearest-neighbour (NN) queries, so this approach is used instead.
import numpy as np
import subprocess
import fasttext
import pandas as pd
import subprocess
from fasttext import load_model

class FastTextNN:
    """Brute-force cosine nearest-neighbour search over a fastText vocabulary.

    The wrapped model must provide ``get_words``, ``get_dimension`` and
    ``get_word_vector``.  Unless a precomputed matrix is supplied, one vector
    per vocabulary word is fetched up front and stored row-wise in
    ``ft_matrix`` (row ``i`` belongs to ``ft_words[i]``).
    """

    def __init__(self, ft_model, ft_matrix=None):
        """Store the model, its vocabulary and frequencies, and the vector matrix.

        ft_model  -- model object queried for words and word vectors
        ft_matrix -- optional precomputed 2d array of word vectors; built from
                     the model when omitted
        """
        self.ft_model = ft_model
        self.ft_words = ft_model.get_words()
        # word -> frequency; also used for constant-time vocabulary membership tests.
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i, :] = ft_model.get_word_vector(word)

    def find_nearest_neighbor(self, query_word, vectors, n=10,  cossims=None):
        """Return up to ``n`` ``(row_index, cosine_similarity)`` pairs, best first.

        query_word -- word whose vector is compared against ``vectors``
        vectors    -- 2d numpy array holding the candidate vectors
        n          -- maximum number of neighbours to return
        cossims    -- optional 1d array of size ``len(vectors)`` holding the
                      precomputed dot products of ``vectors`` with the query
                      vector; despite the name the values are divided by the
                      vector norms here, so they must not be normalised yet

        When ``query_word`` is in the vocabulary, the best-scoring row is
        dropped from the result (it is taken to be the query word itself).
        Fewer than ``n`` pairs are returned when ``vectors`` has too few rows.
        """
        query = self.ft_model.get_word_vector(query_word)
        if cossims is None:
            cossims = np.matmul(vectors, query)

        norms = np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))
        cossims = cossims / norms

        # Dict lookup instead of scanning the vocabulary list on every query.
        in_vocabulary = query_word in self.word_frequencies
        # One extra candidate is needed when the top hit is going to be dropped.
        wanted = n + 1 if in_vocabulary else n
        # argpartition rejects kth values >= len(cossims), so clamp the request.
        top_k = min(wanted, len(cossims))
        if top_k <= 0:
            return []

        # Listing every position 0..top_k-1 as kth leaves those slots fully sorted.
        ranked = np.argpartition(-cossims, np.arange(top_k))[:top_k]
        result_i = ranked[1:] if in_vocabulary else ranked
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=10, word_freq=None):
        """Return ``(neighbour_word, similarity)`` pairs for ``word``.

        Similarities are rounded to three decimals.  When ``word_freq`` is
        truthy, neighbours whose frequency is below it are filtered out, so
        fewer than ``n`` pairs may be returned.
        """
        result = self.find_nearest_neighbor(word, self.ft_matrix, n=n)
        neighbours = [
            (self.ft_words[index], round(similarity, 3))
            for index, similarity in result
        ]
        if word_freq:
            return [
                (neighbour, similarity)
                for neighbour, similarity in neighbours
                if self.word_frequencies[neighbour] >= word_freq
            ]
        return neighbours

# Load the trained fastText binary, then wrap it in the nearest-neighbour helper.
model = load_model('rm_trained_data/rmh_uncased_ordflokkar.bin')
fasttext_nn = FastTextNN(ft_model=model)

# Marker printed once both objects have been created.
print('Success')



Success


In [2]:
# Adjectives to classify, one per line.
with open("adj_data/all_adj.txt", encoding="utf8") as f:
    # Strip only the line terminator; slicing off the last character would eat
    # a letter from a final line that has no trailing newline.
    rm_adjectives = [line.rstrip("\n") for line in f]

# One list of whitespace-separated tokens per line, with the last token of
# every line dropped.
# NOTE(review): presumably the last token is not an adjective — confirm
# against the file format.
with open("adj_data/prenoms_adj.txt", encoding="utf8") as f:
    prenoms = [line.split()[:-1] for line in f]

# Category table, one category per line:
#   <supercategory> <subcategory> <anchor word> [<anchor word> ...]
germanet_flokkar = []
with open("germanet_data/germanet_categories.txt", encoding="utf8") as f:
    for line_number, line in enumerate(f, start=1):
        tokens = line.split()
        if not tokens:
            # Blank lines carry no category; skipping them avoids an IndexError below.
            continue
        if len(tokens) < 2:
            raise ValueError(
                "germanet_categories.txt line %d needs a supercategory and a "
                "subcategory, got: %r" % (line_number, line)
            )
        germanet_flokkar.append(tokens)

# Parallel lists: index i describes the same category in all three.
yfirflokkar = [flokkur[0] for flokkur in germanet_flokkar]   # supercategories
undirflokkar = [flokkur[1] for flokkur in germanet_flokkar]  # subcategories
leitarord = [flokkur[2:] for flokkur in germanet_flokkar]    # anchor words per category

In [3]:
def cosine_similarity(a:list, b:list):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    The result is clamped to [-1.0, 1.0] to absorb floating-point overshoot.
    If either vector has zero length the angle is undefined; 0.0 is returned
    instead of NaN so that callers taking ``max()`` over several similarities
    get an order-independent result.
    """
    dot_product = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)

    if denominator == 0:
        return 0.0

    similarity = dot_product / denominator
    # min/max return the numpy scalar unchanged unless it is out of range.
    return max(min(similarity, 1.0), -1.0)

In [4]:
def nearest_neighbours(word:str, n:int):
    """Return a DataFrame with a single column, named ``word``, of its neighbours.

    ``fasttext_nn.nearest_words`` is asked for ``n + 1`` results.  As a side
    effect, pandas' ``display.max_rows`` option is raised so that the whole
    frame is displayed.
    """
    neighbours = fasttext_nn.nearest_words(word, n+1)
    frame = pd.DataFrame.from_dict({word: neighbours})

    # Global pandas setting: one more than the number of rows in this frame.
    pd.set_option('display.max_rows', frame.shape[0]+1)

    return frame

In [5]:
def anchor_compare(word:str):
    """Score ``word`` against every category's anchor words.

    For each entry of ``leitarord`` (one list of anchor words per category),
    the cosine similarity between ``word`` and each anchor is computed and the
    best one is kept.

    Returns a list with one score per category, in the order of ``leitarord``.
    A category without anchor words scores ``-inf`` so it can never be picked
    as the best match and never inherits another category's score.
    """
    target_word_vec = model.get_word_vector(word)

    similarities = []
    for anchor_words in leitarord:
        group_similarities = [
            cosine_similarity(target_word_vec, model.get_word_vector(anchor))
            for anchor in anchor_words
        ]
        # Take the maximum once per category; ``default`` covers an empty anchor list.
        similarities.append(max(group_similarities, default=float("-inf")))

    return similarities

In [6]:
def categorize(similarities:list):
    """Pick the best-scoring category.

    ``similarities`` holds one score per category, positionally aligned with
    ``yfirflokkar`` and ``undirflokkar``.  Returns the tuple
    ``(best_score, supercategory, subcategory)``; on ties the earliest
    category wins.
    """
    category_index, max_similarity = max(
        enumerate(similarities), key=lambda pair: pair[1]
    )
    return max_similarity, yfirflokkar[category_index], undirflokkar[category_index]

In [7]:
def flokka(word:str):
    """Classify ``word``: return ``(similarity, supercategory, subcategory)``.

    Chains ``anchor_compare`` (per-category scores) into ``categorize``
    (best category).
    """
    per_category_scores = anchor_compare(word)
    return categorize(per_category_scores)

In [8]:
def syna_flokkun(word, likindi, yfirflokkur, undirflokkur):
    """Print a four-line report of one classification result.

    The lines show the word, its supercategory, its subcategory and the
    similarity formatted as a percentage with three decimals.
    """
    labelled_values = (
        ("Orð sem er er skoðað er: ", word),
        ("Áætlaður yfirflokkur er: ", yfirflokkur),
        ("Áætlaður undirflokkur er: ", undirflokkur),
    )
    for label, value in labelled_values:
        print(label, value)
    print("Líkindi á þessari flokkun er: {:.3%}".format(likindi))

In [9]:
def flokka_allt(words:list):
    """Classify every word in ``words``.

    Returns one row per word, in input order, laid out as
    ``[word, supercategory, subcategory, similarity]``.
    """
    rows = []
    for word in words:
        likindi, yfirflokkur, undirflokkur = flokka(word)
        rows.append([word, yfirflokkur, undirflokkur, likindi])
    return rows

In [10]:
def words_to_categories(words:list):
    """Classify every element of every item in ``words``.

    ``words`` is a list of lists.  Three nested lists with the same shape are
    returned: similarities, supercategories and subcategories.  After each
    item the completed fraction is printed on a single, overwritten line.
    """
    prenom_likindir = []
    prenom_yfirflokkar = []
    prenom_undirflokkar = []

    for complete, item in enumerate(words, start=1):
        # One (similarity, supercategory, subcategory) triple per element.
        classified = [flokka(element) for element in item]

        prenom_likindir.append([triple[0] for triple in classified])
        prenom_yfirflokkar.append([triple[1] for triple in classified])
        prenom_undirflokkar.append([triple[2] for triple in classified])

        # Carriage return without newline: the next fraction overwrites this one.
        print(f'{complete/len(words)}\r', end="")

    return prenom_likindir, prenom_yfirflokkar, prenom_undirflokkar

In [11]:
# Classify a single sample word and print the report.
word = 'grænn'

likindi, yfirflokkur, undirflokkur = flokka(word)
syna_flokkun(
    word=word,
    likindi=likindi,
    yfirflokkur=yfirflokkur,
    undirflokkur=undirflokkur,
)

Orð sem er er skoðað er:  grænn
Áætlaður yfirflokkur er:  skyn
Áætlaður undirflokkur er:  litur
Líkindi á þessari flokkun er: 100.000%


In [12]:
# Warning: takes roughly 2 hours to run (when the classification call below is enabled).
import time

t0 = time.time()
# The classification call is commented out, so as written nothing is timed
# and prenoms_likindi / prenoms_yfirflokkar / prenoms_undirflokkar are not defined here:
#prenoms_likindi, prenoms_yfirflokkar, prenoms_undirflokkar = words_to_categories(prenoms)
t1 = time.time()

# Elapsed seconds between the two timestamps.
t1-t0
# Remember to run the file-writing code below to save the data.

0.0

In [13]:

def _write_tsv_rows(path, rows):
    """Write each row of ``rows`` as one line in ``path``, every element followed by a tab."""
    with open(path, 'w', encoding='utf8') as f:
        for row in rows:
            # The tab after the last element is intentional: the file layout
            # is "<element>\t" repeated, then a newline.
            f.write("".join("%s\t" % element for element in row))
            f.write("\n")


# Classify all adjectives and store one "word, supercategory, subcategory, similarity" row each.
oll_ord = flokka_allt(rm_adjectives)
_write_tsv_rows('adj_data/all_adj_categories.txt', oll_ord)

# The prenoms_* results only exist after words_to_categories(prenoms) has been
# run (a roughly two-hour job).  Write whichever results are available and say
# clearly which ones were skipped, instead of failing with a NameError.
_prenoms_outputs = (
    ('adj_data/prenoms_cossims.txt', 'prenoms_likindi'),
    ('adj_data/prenoms_supercategories.txt', 'prenoms_yfirflokkar'),
    ('adj_data/prenoms_subcategories.txt', 'prenoms_undirflokkar'),
)
for _path, _variable_name in _prenoms_outputs:
    if _variable_name in globals():
        _write_tsv_rows(_path, globals()[_variable_name])
    else:
        print(
            f"Skipped {_path}: {_variable_name} is not defined; "
            "run words_to_categories(prenoms) first."
        )


KeyboardInterrupt: 

In [15]:
# Display the model's word vector for 'góður' as the cell output.
model.get_word_vector('góður')

array([ 0.03521131, -0.39758083, -0.21110046, -0.34735507, -0.02270238,
        0.16828084, -0.63982373,  0.13193065,  0.18884122, -0.8536364 ,
       -0.10363223, -0.25496468, -0.3937832 , -0.14288074,  0.21922408,
       -0.15311104,  0.43095523,  0.21226801,  0.6642302 , -0.2575809 ,
        0.8947417 ,  0.5550092 , -0.17688587,  0.03226669, -0.15235232,
        0.49030012,  0.15468   ,  0.02502565,  0.6275847 , -0.02230142,
       -0.0473647 ,  0.33030105,  0.04448915, -0.0357025 ,  0.4458103 ,
       -0.10969002, -0.02217476, -0.75410646,  0.4130749 ,  0.18564804,
       -0.01329854,  0.07469093, -0.01620433,  0.63842326,  0.14110668,
        0.03852937, -0.27640206,  0.04559644,  0.15444066,  0.44625342,
       -0.04936298,  0.1669785 , -0.27535313, -0.60768616,  0.22384384,
       -0.18100673,  0.08767515,  0.27254322, -0.07303666, -0.61049104,
        0.30384472, -0.43470567,  0.80093867, -0.2533552 , -0.56275123,
       -0.18019162, -0.39956596,  0.23420742, -0.7113855 ,  0.14