In [48]:
import io
import os
import numpy as np
import scipy
from scipy import linalg
import sys
from smart_open import smart_open
import csv
from collections import defaultdict

In [11]:
# Data files are expected in a `data` directory under the current working directory.
PATH_TO_DATA = os.path.join(os.getcwd(),'data')
# Not the last expression of the cell, so the listing is not displayed;
# the call still raises if the directory does not exist.
os.listdir(PATH_TO_DATA)
# TSV file of relation pairs that is parsed by PoincareData.
wordnet_mammal_file = os.path.join(PATH_TO_DATA, 'wordnet_mammal_hypernyms.tsv')

In [161]:
class PoincareData():
    '''
    Vocabulary and relation index built from a TSV file of relation pairs.

    Every non-blank line of the file must hold exactly two tab-separated
    tokens (node1, node2). After construction the instance exposes
    vocab, index2word, word2index, all_relations and node_relations
    (see build_vocab).
    '''
    def __init__(self, fname, nmax, verbose=False, doublon_tol=False):
        '''
        fname: name of the tsv file
        nmax: maximum number of lines read from the fname file
              (None reads the whole file)
        verbose: print a short summary of the parsing and of the vocabulary build
        doublon_tol: if true, relations whose two nodes are identical are kept;
                     otherwise they are counted but dropped
        '''
        relations = self.load_file(fname, nmax, verbose)
        self.build_vocab(relations, verbose, doublon_tol)

    def load_file(self, fname, nmax, verbose):
        '''
        Parsing of the TSV file.

        Reads at most nmax lines (all of them when nmax is None), strips each
        line and splits it on tabs. Blank lines are skipped but still count
        towards nmax.

        return: list of tuples, one per non-blank line read
        '''
        entire = True
        relations = []
        with open(fname, 'r') as f:
            for i, line in enumerate(f):
                if nmax is not None and i >= nmax:
                    # There was at least one more line than we were allowed to read.
                    entire = False
                    break
                line = line.strip()
                if not line:
                    # A blank line carries no relation; do not turn it into a bogus tuple.
                    continue
                relations.append(tuple(line.split('\t')))
        if verbose:
            print('Entire Parsing: ' + str(entire))
        return relations

    def build_vocab(self, loaded_file, verbose, doublon_tol=False):
        '''
        Given a loaded_file, build the index2word, word2index, the vocab, node relations
        param:
            - loaded_file: iterable of 2-tuples (node1, node2)
            - verbose: print a summary once the vocabulary is built
            - doublon_tol: keep (true) or drop (false) relations with node1 == node2
        attributes set:
            - vocab: occurence of the different words (defaultdict)
            - index2word: word associated to an index (list)
            - all_relations : list of tuples containing the interactions (list)
            - word2index: (dict)
            - node_relations: mapping from node index to its related node indices
        raises:
            ValueError if a relation does not contain exactly two items
        '''
        self.vocab = defaultdict(int)
        self.index2word = []  # the position of a word in this list is its index
        self.all_relations = []
        self.word2index = {}
        self.node_relations = defaultdict(set)
        doublon = 0
        for relation in loaded_file:
            # Validate the shape first: indexing relation[1] on a short tuple
            # would raise IndexError instead of the intended ValueError.
            if len(relation) != 2:
                raise ValueError('Relation pair "%s" should be a pair !' % repr(relation))
            node1, node2 = relation
            if node1 == node2:
                doublon += 1
                if not doublon_tol:
                    # Self-relations are counted but not indexed.
                    continue
            for w in relation:
                if w not in self.word2index:
                    # new word detected: it gets the next free index
                    self.word2index[w] = len(self.index2word)
                    self.index2word.append(w)
                self.vocab[w] += 1
            node1_index, node2_index = self.word2index[node1], self.word2index[node2]
            self.node_relations[node1_index].add(node2_index)
            self.all_relations.append((node1_index, node2_index))
        if verbose:
            print('Vocabulary Build !')
            print(str(doublon) + ' doublons was found')
                
    

In [171]:
# Parse the first 20 lines; doublon_tol keeps its default (False), so self-relations are dropped.
data = PoincareData(wordnet_mammal_file, 20, verbose= True)

Entire Parsing: False
Vocabulary Build !
5 doublons was found


In [172]:
# Number of relation pairs stored once self-relations have been dropped.
len(data.all_relations)

15

In [174]:
# Same 20 lines, but doublon_tol=True keeps relations whose two nodes are identical.
data2 = PoincareData(wordnet_mammal_file, 20, verbose = True, doublon_tol=True)

Entire Parsing: False
Vocabulary Build !
5 doublons was found


In [176]:
# Number of relation pairs stored when self-relations are kept.
len(data2.all_relations)

20

In [177]:
# Words in index order, then the vocabulary size, for the run that drops self-relations.
print(data.index2word)
print(len(data.index2word))

['kangaroo.n.01', 'marsupial.n.01', 'domestic_goat.n.01', 'even-toed_ungulate.n.01', 'rock_squirrel.n.01', 'ground_squirrel.n.02', 'vizsla.n.01', 'dog.n.01', 'dandie_dinmont.n.01', 'mammal.n.01', 'broodmare.n.01', 'horse.n.01', 'lesser_kudu.n.01', 'placental.n.01', 'water_shrew.n.01', 'insectivore.n.01', 'silky_anteater.n.01', 'giant_kangaroo.n.01', 'metatherian.n.01', 'seattle_slew.n.01', 'thoroughbred.n.02', 'boxer.n.04', 'rabbit.n.01', 'longhorn.n.01', 'bovid.n.01', 'blue_fox.n.01', 'fox.n.01']
27


In [179]:
# Words in index order, then the vocabulary size, for the run that keeps self-relations.
print(data2.index2word)
print(len(data2.index2word))

['kangaroo.n.01', 'marsupial.n.01', 'domestic_goat.n.01', 'even-toed_ungulate.n.01', 'rock_squirrel.n.01', 'ground_squirrel.n.02', 'vizsla.n.01', 'dog.n.01', 'dandie_dinmont.n.01', 'mammal.n.01', 'broodmare.n.01', 'horse.n.01', 'spotted_skunk.n.01', 'hispid_pocket_mouse.n.01', 'lesser_kudu.n.01', 'placental.n.01', 'water_shrew.n.01', 'insectivore.n.01', 'silky_anteater.n.01', 'giant_kangaroo.n.01', 'metatherian.n.01', 'bronco.n.01', 'pekinese.n.01', 'seattle_slew.n.01', 'thoroughbred.n.02', 'kinkajou.n.01', 'boxer.n.04', 'rabbit.n.01', 'longhorn.n.01', 'bovid.n.01', 'blue_fox.n.01', 'fox.n.01']
32


In [181]:
# Index -> word list of `data`, shown through the cell's display instead of print().
data.index2word

['kangaroo.n.01',
 'marsupial.n.01',
 'domestic_goat.n.01',
 'even-toed_ungulate.n.01',
 'rock_squirrel.n.01',
 'ground_squirrel.n.02',
 'vizsla.n.01',
 'dog.n.01',
 'dandie_dinmont.n.01',
 'mammal.n.01',
 'broodmare.n.01',
 'horse.n.01',
 'lesser_kudu.n.01',
 'placental.n.01',
 'water_shrew.n.01',
 'insectivore.n.01',
 'silky_anteater.n.01',
 'giant_kangaroo.n.01',
 'metatherian.n.01',
 'seattle_slew.n.01',
 'thoroughbred.n.02',
 'boxer.n.04',
 'rabbit.n.01',
 'longhorn.n.01',
 'bovid.n.01',
 'blue_fox.n.01',
 'fox.n.01']

In [182]:
# Index -> word list of `data2`, shown through the cell's display instead of print().
data2.index2word

['kangaroo.n.01',
 'marsupial.n.01',
 'domestic_goat.n.01',
 'even-toed_ungulate.n.01',
 'rock_squirrel.n.01',
 'ground_squirrel.n.02',
 'vizsla.n.01',
 'dog.n.01',
 'dandie_dinmont.n.01',
 'mammal.n.01',
 'broodmare.n.01',
 'horse.n.01',
 'spotted_skunk.n.01',
 'hispid_pocket_mouse.n.01',
 'lesser_kudu.n.01',
 'placental.n.01',
 'water_shrew.n.01',
 'insectivore.n.01',
 'silky_anteater.n.01',
 'giant_kangaroo.n.01',
 'metatherian.n.01',
 'bronco.n.01',
 'pekinese.n.01',
 'seattle_slew.n.01',
 'thoroughbred.n.02',
 'kinkajou.n.01',
 'boxer.n.04',
 'rabbit.n.01',
 'longhorn.n.01',
 'bovid.n.01',
 'blue_fox.n.01',
 'fox.n.01']

In [183]:
# Maps each first-node index to the set of second-node indices it is related to (self-relations dropped).
data.node_relations

defaultdict(set,
            {0: {1},
             2: {3},
             4: {5},
             6: {7},
             8: {9},
             10: {11},
             12: {13},
             14: {15},
             16: {13},
             17: {18},
             19: {20},
             21: {9},
             22: {13},
             23: {24},
             25: {26}})

In [184]:
# Same mapping for `data2`; entries such as k: {k} come from the kept self-relations.
data2.node_relations

defaultdict(set,
            {0: {1},
             2: {3},
             4: {5},
             6: {7},
             8: {9},
             10: {11},
             12: {12},
             13: {13},
             14: {15},
             16: {17},
             18: {15},
             19: {20},
             21: {21},
             22: {22},
             23: {24},
             25: {25},
             26: {9},
             27: {15},
             28: {29},
             30: {31}})

In [120]:
# Word -> index lookup.
# NOTE(review): this cell's execution count (120) is older than the cell that builds `data` (171),
# so the saved output may be stale -- re-run after a kernel restart.
data.word2index

{'broodmare.n.01': 10,
 'dandie_dinmont.n.01': 8,
 'dog.n.01': 7,
 'domestic_goat.n.01': 2,
 'even-toed_ungulate.n.01': 3,
 'ground_squirrel.n.02': 5,
 'hispid_pocket_mouse.n.01': 13,
 'horse.n.01': 11,
 'insectivore.n.01': 17,
 'kangaroo.n.01': 0,
 'lesser_kudu.n.01': 14,
 'mammal.n.01': 9,
 'marsupial.n.01': 1,
 'placental.n.01': 15,
 'rock_squirrel.n.01': 4,
 'spotted_skunk.n.01': 12,
 'vizsla.n.01': 6,
 'water_shrew.n.01': 16}

In [185]:
# Occurrence count of each word over the relations that were kept.
data.vocab

defaultdict(<function __main__.PoincareData.build_vocab.<locals>.<lambda>>,
            {'blue_fox.n.01': 1,
             'bovid.n.01': 1,
             'boxer.n.04': 1,
             'broodmare.n.01': 1,
             'dandie_dinmont.n.01': 1,
             'dog.n.01': 1,
             'domestic_goat.n.01': 1,
             'even-toed_ungulate.n.01': 1,
             'fox.n.01': 1,
             'giant_kangaroo.n.01': 1,
             'ground_squirrel.n.02': 1,
             'horse.n.01': 1,
             'insectivore.n.01': 1,
             'kangaroo.n.01': 1,
             'lesser_kudu.n.01': 1,
             'longhorn.n.01': 1,
             'mammal.n.01': 2,
             'marsupial.n.01': 1,
             'metatherian.n.01': 1,
             'placental.n.01': 3,
             'rabbit.n.01': 1,
             'rock_squirrel.n.01': 1,
             'seattle_slew.n.01': 1,
             'silky_anteater.n.01': 1,
             'thoroughbred.n.02': 1,
             'vizsla.n.01': 1,
             'water_shrew.

In [None]:
class abc():
    '''
    Word-vector table loaded from a text file, with cosine-similarity queries.

    Attributes set by the constructor:
        - word2vec: word -> 1-D numpy vector (dict, in file order)
        - word2id / id2word: word <-> row index mappings (dict)
        - embeddings: numpy array stacking the vectors in index order

    NOTE(review): the class name shadows the standard-library module `abc`;
    it is kept unchanged so existing references still resolve.
    '''
    def __init__(self, fname, nmax=100):
        '''
        fname: path of the text file holding the vectors
        nmax: maximum number of vectors to load (None loads all of them)
        '''
        self.load_wordvec(fname, nmax)
        self.word2id = {word: i for i, word in enumerate(self.word2vec)}
        self.id2word = {i: word for word, i in self.word2id.items()}
        self.embeddings = np.array(list(self.word2vec.values()))

    def load_wordvec(self, fname, nmax):
        '''
        Fill self.word2vec from fname.

        The first line of the file is skipped (presumably a header -- confirm
        against the file format). Every following line is split at its first
        space into a word and a space-separated list of floats.
        '''
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            # Default of None so an empty file does not raise StopIteration.
            next(f, None)
            for i, line in enumerate(f):
                # Checked before parsing so that nmax=0 loads nothing.
                if nmax is not None and i >= nmax:
                    break
                word, vec = line.split(' ', 1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
        print('Loaded %s pretrained word vectors' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        '''
        Return the (at most) K words with the highest cosine similarity to w,
        most similar first. w itself is never part of the result.
        '''
        score = np.zeros(len(self.id2word))
        for idx, word in self.id2word.items():
            score[idx] = self.score(w, word)
        # argsort is ascending; reverse it to rank from most to least similar.
        ranked = score.argsort()[::-1]
        # Exclude w by name rather than assuming it is ranked first.
        neighbours = [self.id2word[i] for i in ranked if self.id2word[i] != w]
        return neighbours[:K]

    def score(self, w1, w2):
        '''
        Cosine similarity between the vectors of w1 and w2.
        Raises KeyError if either word has not been loaded.
        '''
        vec_1 = self.word2vec[w1]
        vec_2 = self.word2vec[w2]
        return np.dot(vec_1,vec_2)/(np.linalg.norm(vec_1)*np.linalg.norm(vec_2))

