In [1]:
from gensim.models import KeyedVectors
import sys
sys.path.append("../../imports/")
import saver as sv
import numpy as np
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Sense inventory loaded through the project-local `saver` helper.
# Presumably maps a word to a list of sense descriptions, each an iterable of
# description tokens (inferred from how sense_vec iterates it) -- TODO confirm.
word2desc = sv.load("word2desc")

In [2]:
from numba import jit



In [3]:
# Alternative kept for reference: load the raw word2vec binary instead of the
# pre-saved gensim model used below.
#w2v = '/home/manni/embs/word2vec-google-news-300.gz'
#timeit model = KeyedVectors.load_word2vec_format(w2v, binary=True)
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine that has this file; consider a configurable embeddings directory.
model = KeyedVectors.load('/home/manni/embs/w2v.model')

In [4]:
# Context window: number of tokens taken on each side of the target token.
WINDOW = 3

In [5]:
def sense_vec(word,model):
    '''
    Computes one vector per sense description of a word.

    Each sense vector is the element-wise SUM (not the mean) of the embeddings
    of the description words that are present in the model vocabulary.
    Empty descriptions are skipped, so the position of a vector in the result
    is its index among the non-empty descriptions.

    Parameters
    ----------
    word : str
        Word to look up in the module-level ``word2desc`` mapping.
    model : KeyedVectors
        gensim KeyedVectors object (``vector_size``, ``vocab`` and
        ``get_vector`` are used).

    Returns
    -------
    list
        List of 1D vectors ``[Vector, Vector, ...]``, each of length
        ``model.vector_size``. A description without any in-vocabulary word
        contributes an all-zero vector. If the word is unknown or has no
        non-empty description, a list holding a single all-zero vector is
        returned, so the result is always a non-empty list.
    '''
    dim = model.vector_size

    def zero_vec():
        # fresh array each time so callers can never alias a shared pad
        return np.zeros(dim, dtype=np.float32)

    if word not in word2desc:
        return [zero_vec()]

    v = list()
    for words in word2desc[word]:
        if not words:
            continue
        known = [model.get_vector(_word) for _word in words if _word in model.vocab]
        if known:
            v.append(np.sum(known, axis=0))
        else:
            # description made only of out-of-vocabulary words
            v.append(zero_vec())
    if not v:
        return [zero_vec()]
    assert np.asarray(v).ndim == 2, 'SenseVec is not 2D.'
    return v

In [None]:
def tokenize(tokens,WINDOW,model):
    '''
    Picks a sense for every token that has an entry in ``word2desc``.

    For each such token, every sense vector of the token is compared (cosine
    similarity) with every non-zero sense vector of the distinct words found
    within ``WINDOW`` positions to the left and right. The sense with the
    highest strictly positive similarity wins.

    Parameters
    ----------
    tokens : list
        Sequence of word strings.
    WINDOW : int
        Number of neighbouring tokens considered on each side.
    model : KeyedVectors
        gensim KeyedVectors object, passed on to ``sense_vec`` and used for
        ``cosine_similarities``.

    Returns
    -------
    dict
        Maps token position -> ``'<token>#<sense index>'`` for every token
        that received a sense. Tokens without a positive match are absent.
    '''
    to_replace = dict()
    sense_cache = dict()

    def senses_of(word):
        # sense vectors depend only on the word, so compute them once per call
        if word not in sense_cache:
            sense_cache[word] = sense_vec(word,model)
        return sense_cache[word]

    for i,token in enumerate(tokens):
        if token not in word2desc:
            continue
        # clamp at 0: a negative start would wrap around and drop the left context
        left = tokens[max(0,i-WINDOW):i]
        right = tokens[i+1:i+WINDOW+1]
        context = set(left+right)
        context.discard(token)
        # all usable (non-zero) sense vectors of the context words
        context_vecs = [cv for con in context for cv in senses_of(con) if np.any(cv)]
        if not context_vecs:
            continue
        context_mat = np.asarray(context_vecs)
        maxi = 0
        tag = -1 #index in wordsense
        for si,v in enumerate(senses_of(token)):
            if not np.any(v):
                # zero vector: cosine similarity is undefined
                continue
            sim = np.max(model.cosine_similarities(v, context_mat))
            if sim>maxi:
                maxi = sim
                tag = si
        if tag>=0:
            to_replace[i]=token+'#'+str(tag)
    return to_replace

In [None]:
# Toy sentence used to exercise the taggers below, split on whitespace.
tokens = 'part river where current very fast forward'.split()

In [None]:
%timeit tokenize(tokens,3,model)

## numba version

In [6]:
def get_max(tokens,WINDOW,model):
    '''
    Largest number of sense vectors any token of the sequence has.

    Only tokens present in ``word2desc`` are looked at; ``WINDOW`` is accepted
    for signature symmetry with the other helpers and is not used.

    Returns
    -------
    int
        Maximum ``len(sense_vec(token, model))`` over the known tokens,
        or 0 when no token is known.
    '''
    sense_counts = (len(sense_vec(tok,model)) for tok in tokens if tok in word2desc)
    return max(sense_counts, default=0)

In [13]:
def get_vecs(tokens,WINDOW,model):
    '''
    Builds dense, zero-padded sense-vector arrays for a token sequence.

    Parameters
    ----------
    tokens : list
        Sequence of word strings.
    WINDOW : int
        Number of neighbouring tokens considered on each side.
    model : KeyedVectors
        gensim KeyedVectors object (``vector_size`` is read here; the object
        is passed on to ``get_max`` and ``sense_vec``).

    Returns
    -------
    tuple of numpy.ndarray
        ``token_vecs`` with shape ``(len(tokens), tmax, dim)`` holding the
        sense vectors of each token, and ``token_con_vecs`` with shape
        ``(len(tokens), 2*WINDOW, tmax, dim)`` holding the sense vectors of
        each distinct context word. ``tmax`` is the largest sense count in the
        sequence (at least 1). Missing senses, missing context slots and
        tokens absent from ``word2desc`` are filled with zero vectors.
    '''
    dim = model.vector_size
    # keep at least one sense slot so the outputs stay 3D/4D when no token is known
    tmax = max(get_max(tokens,WINDOW,model), 1)
    n_context = WINDOW*2
    if len(tokens) == 0:
        return (np.zeros((0,tmax,dim),dtype=np.float32),
                np.zeros((0,n_context,tmax,dim),dtype=np.float32))

    pad = np.zeros((dim),dtype=np.float32)
    empty_slot = [pad]*tmax
    sense_cache = dict()

    def padded_senses(word):
        # list() also copes with sense_vec handing back a (1, dim) array;
        # adding a list to an ndarray would broadcast instead of concatenate
        if word not in sense_cache:
            vecs = list(sense_vec(word,model))
            sense_cache[word] = vecs + [pad]*(tmax-len(vecs))
        return sense_cache[word]

    token_vecs = list()
    token_con_vecs = list()
    for i,token in enumerate(tokens):
        if token in word2desc:
            # clamp at 0: a negative start would wrap around and drop the left context
            left = tokens[max(0,i-WINDOW):i]
            right = tokens[i+1:i+WINDOW+1]
            context = set(left+right)
            context.discard(token)
            tsvecs = padded_senses(token) # sense vectors for current token
            csvecs = [padded_senses(con) for con in context]
        else:
            tsvecs = empty_slot
            csvecs = list()
        # fill the unused context slots so every token has exactly 2*WINDOW of them
        csvecs = csvecs + [empty_slot]*(n_context-len(csvecs))
        assert len(csvecs)==n_context
        token_vecs.append(tsvecs)
        token_con_vecs.append(csvecs)
    assert np.asarray(token_vecs).ndim == 3, 'token_vecs is not 3D, with shape:'\
    +str(np.asarray(token_vecs).shape)+':'+str(tokens)
    return np.asarray(token_vecs),np.asarray(token_con_vecs)

In [15]:
tv,tcv =  get_vecs(tokens,WINDOW,model)

In [11]:
len(tv)

1697

In [None]:
np.asarray(tv[957]).shape

In [None]:
len(tv[0][0])

In [12]:
np.asarray(tv).shape

(1697, 60, 300)

In [None]:
# Sanity check: every per-token block in tv must be a 2D (60, 300) array.
# NOTE(review): 60 and 300 are hard-coded; they match the (1697, 60, 300)
# shape printed above, not necessarily the current `tokens`.
for i,d in enumerate(tv):
    print(i)  # progress trace -- one line per token
    assert np.asarray(d).shape[0]==60,np.asarray(d).shape[0]
    assert np.asarray(d).shape[1]==300
    # Only reached when both shape assertions above passed.
    if np.asarray(d).ndim!=2:
        print(np.asarray(d).shape)

In [None]:
@jit(nopython=True)
def get_tags(token_vecs,token_con_vecs):
    '''
    Chooses a sense index per token from pre-computed sense vectors.

    Parameters
    ----------
    token_vecs : numpy.ndarray
        3D array indexed as [token, sense, component].
    token_con_vecs : numpy.ndarray
        4D array indexed as [token, context word, sense, component].

    Returns
    -------
    numpy.ndarray
        1D integer array with one entry per token: the index of the token
        sense with the highest strictly positive cosine similarity to any
        context sense vector, or -1 when there is none.

    Notes
    -----
    All-zero vectors are skipped rather than treated as end-of-data: a real
    sense can be all-zero (no in-vocabulary description word) and still be
    followed by valid senses.
    '''
    n_tokens = token_vecs.shape[0]
    to_replace = np.full((n_tokens), -1)
    for i in range(n_tokens):
        maxi = 0.0
        tag = -1 #index in wordsense
        tvecs = token_vecs[i] #current token sense vecs
        # vecs for all context words for the current token
        for wvecs in token_con_vecs[i]:
            # current context word sense vecs
            for vec in wvecs:
                # a norm of 0 identifies an all-zero vector exactly
                vec_norm = np.linalg.norm(vec)
                if vec_norm==0:
                    continue
                for j,tvec in enumerate(tvecs):
                    tvec_norm = np.linalg.norm(tvec)
                    if tvec_norm==0:
                        continue
                    sim = np.dot(tvec,vec)/(tvec_norm*vec_norm)
                    if sim>maxi:
                        maxi = sim
                        tag = j
        to_replace[i]=tag
    return to_replace

In [None]:
def tokenise(tokens,WINDOW,model):
    '''
    Array-based tagging pipeline: build the padded sense-vector arrays with
    ``get_vecs`` and hand them to ``get_tags``.

    Returns
    -------
    Whatever ``get_tags`` returns for the two arrays produced by ``get_vecs``.
    '''
    vec_pair = get_vecs(tokens,WINDOW,model)
    own_vecs, context_vecs = vec_pair
    return get_tags(own_vecs,context_vecs)

In [None]:
%timeit to_replace = tokenise(tokens,3,model)

In [None]:
to_replace

In [None]:
# Append the chosen sense index to every token that received a tag
# (-1 marks "no sense selected").
# NOTE: mutates `tokens` in place, so re-running this cell appends a second suffix.
for i,v in enumerate(to_replace):
    if v !=-1:
        # the suffix is the sense tag v, not the token position i
        tokens[i]+='#'+str(v)

# padding

In [None]:
import itertools

def find_shape(seq):
    '''
    Bounding shape of a (possibly ragged) nested sequence.

    Anything for which ``len()`` raises TypeError counts as a scalar and has
    shape ``()``. At each deeper axis the size is the maximum over all
    children; a child that does not reach that axis counts as size 1.

    Returns
    -------
    tuple
        Tuple of ints, outermost axis first.
    '''
    try:
        outer = len(seq)
    except TypeError:
        return ()
    child_shapes = list(map(find_shape, seq))
    # zip_longest lines the children up axis by axis, filling short ones with 1
    per_axis = itertools.zip_longest(*child_shapes, fillvalue=1)
    inner = tuple(map(max, per_axis))
    return (outer,) + inner

In [None]:
def traversal(a):
    '''
    Level-by-level maximum widths of a nested list, without the first
    dimension.

    parameter:
    ---------
    a: a multidimensional list

    returns:
    --------
    list

    a single dimension list containing the dimensions of the provided list
    without the first dimension: entry k is the largest length found among
    all sub-lists at nesting depth k+1. Descent stops (and contributes
    nothing more) at the first level that contains a scalar or a string, or
    when there is nothing left to descend into.
    '''
    branches = list(a)
    if not branches:
        # nothing to descend into; without this guard the recursion never ends
        return []
    m = 0
    level = list()
    for branch in branches:
        if isinstance(branch, (str, bytes)):
            # a string is a leaf here; iterating it would recurse forever
            return []
        try:
            m = max(m, len(branch))
            level.extend(branch)
        except TypeError:
            # scalar leaf: no len() / not iterable
            return []
    return [m]+traversal(level)

def get_dims(a):
    '''
    Full list of dimensions of a nested list.

    parameter:
    ---------
    a: a multidimensional list

    returns:
    --------
    list

    ``len(a)`` followed by the per-level dimensions reported by
    ``traversal(a)``.
    '''
    inner_dims = traversal(a)
    return [len(a), *inner_dims]

def get_padded(a, constant=0):
    '''
    Pads a ragged multidimensional list to a rectangular array.

    Every sub-list that is shorter than the widest sub-list at its nesting
    depth is extended with ``constant`` (whole missing sub-trees are filled
    with ``constant`` as well). Existing elements keep their position.
    **Note: assumes all leaves sit at the same nesting depth.

    parameter:
    ---------
    a: a multidimensional list
    constant: the fill value (default 0)

    returns:
    --------
    numpy.ndarray

    array of shape ``get_dims(a)`` holding the values of ``a`` with
    ``constant`` in every padded position.
    '''
    dims = get_dims(a)

    def pad_level(node, shape):
        # shape[0] is the target length of this node, shape[1:] that of its children
        width, inner = shape[0], tuple(shape[1:])
        if not inner:
            return list(node) + [constant] * (width - len(node))
        rows = [pad_level(child, inner) for child in node]
        missing = width - len(rows)
        if missing > 0:
            filler = np.full(inner, constant).tolist()
            rows.extend(filler for _ in range(missing))
        return rows

    return np.asarray(pad_level(a, dims))

In [None]:
# Ragged nested example: the sub-lists differ in length at the second and
# third nesting level.
a = [
    [[[1, 2, 3]],[[1, 2, 3], [4, 5, 6]]],
    [[[1, 2, 3], [4, 5, 6]]]
    ]
# Hand-written flattening of `a` with zeros inserted where sub-lists are
# missing, sized for the shape (2, 2, 2, 3) used in the reshape below.
_a = [1, 2, 3, 0, 0, 0, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0]
np.reshape(_a,(2,2,2,3))

In [None]:
# Disabled experiment kept as a bare string literal (evaluated and discarded).
'''
print(tokens[0])
tv,tcv = get_vecs(tokens,3,model)
tv = np.asarray(tv)
print(tv.shape)
'''

# Scratch version of the padding walk, driven entirely by module-level state.
a = [
    [[[1, 2, 3]],[[1, 2, 3], [4, 5, 6]]],
    [[[1, 2, 3], [4, 5, 6]]]
    ]

# Target shape of `a`, written out by hand.
dims = [2,2,2,3]
#dims = [d*dims[i-1] for i,d in enumerate(dims) if i>0]

out = list()        # declared but never written by padder
tlen = 1            # running product of dims; computed but not used for padding
dat = list()        # leaves collected since the last flush
data = list()       # flushed groups, one list per flush

depth = 0           # current nesting depth of the walk
prev_depth = 0      # depth of the previously visited list branch

def padder(a):
    '''
    Recursively walks the nested list ``a``, collecting leaves into the
    global ``dat`` and flushing them to the global ``data`` whenever the walk
    returns to depth 0. Prints a trace of its state; returns nothing.
    '''
    global tlen
    global out
    global depth
    global prev_depth
    global data
    global dat
    for i,branch in enumerate(a):
        if type(branch)==list:
            depth+=1
            print('i:',i)
            print('depth:',depth)
            print('prev_depth:',prev_depth)
            # moved back up relative to the previous branch: restart tlen
            if depth<prev_depth:
                tlen=dims[depth]
                print('tlen:',tlen)
            print('branch:',branch)
            # check if same level
            if prev_depth < depth:
                tlen*=dims[depth]
                print('tlen:',tlen)
            prev_depth = depth
            padder(branch)
        else:
            #leaf
            dat.append(branch)
    print('dat:',dat)
    depth-=1
    print('reduced depth:',depth)
    if depth == 0:
        # NOTE(review): 12 is hard-coded (presumably 2*2*3 from dims[1:]) and the
        # zeros go at the END of each flushed group, so short inner sub-lists are
        # not padded in place -- the result differs from the hand-written `_a`.
        data.append(dat+[0]*(12-len(dat)))
        print(data)
        dat = []
    print('-----')
padder(a)