In [1]:
#Þessi klasi fékkst á þræðinum https://github.com/facebookresearch/fastText/pull/552
#Þar sem Python Api bauð ekki upp á NN greiningu, þá er farinn þessi leið
import numpy as np
import subprocess
import fasttext
import pandas as pd
import subprocess
from fasttext import load_model

class FastTextNN:
    
    def __init__(self, ft_model, ft_matrix=None):
        self.ft_model = ft_model        
        self.ft_words = ft_model.get_words()
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i,:] = ft_model.get_word_vector(word)
    
    def find_nearest_neighbor(self, query_word, vectors, n=10,  cossims=None):
        """
        vectors is a 2d numpy array corresponding to the vectors you want to consider

        cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
        returns the index of the closest n matches to query within vectors and the cosine similarity (cosine the angle between the vectors)

        """
        
        query  = self.ft_model.get_word_vector(query_word)
        if cossims is None:
            cossims = np.matmul(vectors, query, out=cossims)

        norms = np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))
        cossims = cossims/norms
        if query_word in self.ft_words:
            result_i = np.argpartition(-cossims, range(n+1))[1:n+1]
        else:
            result_i = np.argpartition(-cossims, range(n+1))[0:n]
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=10, word_freq=None):
        result = self.find_nearest_neighbor(word, self.ft_matrix, n=n)
        if word_freq:
            return [(self.ft_words[r
            [0]], round(r[1],3)) for r in result if self.word_frequencies[self.ft_words[r[0]]] >= word_freq]
        else:
            return [(self.ft_words[r[0]], round(r[1],3)) for r in result]

model = load_model('data/rmh_uncased_ordflokkar.bin')
fasttext_nn = FastTextNN(model)

print('Success')



Success


In [60]:
rm_adjectives = []

f = open("lysingarord.txt", encoding="utf8")
for x in f:
    rm_adjective = x[:-1]
    rm_adjectives.append(rm_adjective)
    
germanet_flokkar = []

f = open("germanet_flokkar.txt", encoding="utf8")
for x in f:
    germanet_flokkur = x.split()
    germanet_flokkar.append(germanet_flokkur)
    
yfirflokkar = []
undirflokkar = []
leitarord = []

for i in germanet_flokkar:
    yfirflokkar.append(i[0])
    undirflokkar.append(i[1])
    leitarord1 = []
    leitarord1.append(i[2])
    leitarord1.append(i[3])
    leitarord.append(leitarord1)

In [7]:
def cosine_similarity(a:list, b:list):

    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    
    return dot_product / (norm_a * norm_b)

In [8]:
def nearest_neighbours(word:str, n:int):
    
    to_print = {}
    results = fasttext_nn.nearest_words(word, n+1)
    to_print[word] = results

    #print(f'\nThe word "{word}" appears {get_uniq_freq(word)} times in the training \ndata as a uniq word but as a subword it appears {get_freq(word)}')
    df = pd.DataFrame.from_dict(to_print)
    pd.set_option('display.max_rows', df.shape[0]+1)

    return df

In [9]:
def anchor_compare(word:str):

    target_word_vec = model.get_word_vector(word)
    
    similarities = []

    for i in leitarord:

        test_word_vec1 = model.get_word_vector(i[0])
        test_word_vec2 = model.get_word_vector(i[1])        
        
        similarity1 = cosine_similarity(target_word_vec, test_word_vec1)
        similarity2 = cosine_similarity(target_word_vec, test_word_vec2)

        similarity = max(similarity1, similarity2)
        
        similarities.append(similarity)
    
    return similarities

In [10]:
def categorize(similarities:list):
    
    max_similarity = max(similarities)
    category_index = similarities.index(max_similarity)
    
    supercategory = yfirflokkar[category_index]
    subcategory = undirflokkar[category_index]
    
    return max_similarity, supercategory, subcategory

In [62]:
def flokka(word:str):

    likindi = anchor_compare(word)
    likindi, yfirflokkur, undirflokkur = categorize(likindi)
    
    return likindi, yfirflokkur, undirflokkur

In [63]:
def syna_flokkun(word, likindi, yfirflokkur, undirflokkur):
    
    print("Orð sem er er skoðað er: ", word)
    print("Áætlaður yfirflokkur er: ", yfirflokkur)
    print("Áætlaður undirflokkur er: ", undirflokkur)
    print("Líkindi á þessari flokkun er: {:.3%}".format(likindi))

In [74]:
def flokka_allt(words:list):
    
    oll_ord = []
    
    for word in words:
        
        eitt_ord = []
        
        likindi, yfirflokkur, undirflokkur = flokka(word)
        
        eitt_ord.append(word)
        eitt_ord.append(yfirflokkur)
        eitt_ord.append(undirflokkur)
        eitt_ord.append(likindi)
        
        oll_ord.append(eitt_ord)
        
    return oll_ord

In [65]:
word = 'glaður'

likindi, yfirflokkur, undirflokkur = flokka(word)
syna_flokkun(word, likindi, yfirflokkur, undirflokkur)

Orð sem er er skoðað er:  glaður
Áætlaður yfirflokkur er:  skap
Áætlaður undirflokkur er:  tilfinning
Líkindi á þessari flokkun er: 100.000%


In [86]:
"""
oll_ord = flokka_allt(rm_adjectives)

with open('full_flokkun_lo.txt', 'w', encoding='utf8') as f:
    for word in oll_ord:
        for element in word:
            f.write("%s\t" % element)
        f.write("\n")
"""

'\noll_ord = flokka_allt(rm_adjectives)\n\nwith open(\'full_flokkun_lo.txt\', \'w\', encoding=\'utf8\') as f:\n    for word in oll_ord:\n        for element in word:\n            f.write("%s\t" % element)\n        f.write("\n")\n'