In [1]:
import gensim
import re
import numpy as np
from nltk import ngrams

In [2]:
t_model = gensim.models.Word2Vec.load('full_grams_sg_300_twitter.mdl')

In [4]:
# =========================
# ==== Helper Methods =====

# Clean/Normalize Arabic Text
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    
    #trim    
    text = text.strip()

    return text

def get_vec(n_model,dim, token):
    vec = np.zeros(dim)
    is_vec = False
    if token not in n_model.wv:
        _count = 0
        is_vec = True
        for w in token.split("_"):
            if w in n_model.wv:
                _count += 1
                vec += n_model.wv[w]
        if _count > 0:
            vec = vec / _count
    else:
        vec = n_model.wv[token]
    return vec

def calc_vec(pos_tokens, neg_tokens, n_model, dim):
    vec = np.zeros(dim)
    for p in pos_tokens:
        vec += get_vec(n_model,dim,p)
    for n in neg_tokens:
        vec -= get_vec(n_model,dim,n)
    
    return vec   

## -- Retrieve all ngrams for a text in between a specific range
def get_all_ngrams(text, nrange=3):
    text = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    tokens = [token for token in text.split(" ") if token.strip() != ""]
    ngs = []
    for n in range(2,nrange+1):
        ngs += [ng for ng in ngrams(tokens, n)]
    return ["_".join(ng) for ng in ngs if len(ng)>0 ]

## -- Retrieve all ngrams for a text in a specific n
def get_ngrams(text, n=2):
    text = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    tokens = [token for token in text.split(" ") if token.strip() != ""]
    ngs = [ng for ng in ngrams(tokens, n)]
    return ["_".join(ng) for ng in ngs if len(ng)>0 ]

## -- filter the existed tokens in a specific model
def get_existed_tokens(tokens, n_model):
    return [tok for tok in tokens if tok in n_model.wv ]

In [5]:
token = clean_str(u'ابو تريكه').replace(" ", "_")

In [6]:
if token in t_model.wv:
    most_similar = t_model.wv.most_similar( token, topn=10 )
    for term, score in most_similar:
        term = clean_str(term).replace(" ", "_")
        if term != token:
            print(term, score)

ابوتريكه 0.8353127241134644
تريكه 0.742644727230072
حازم_امام 0.6797752380371094
حسام_حسن 0.6696128845214844
شيكابالا 0.6619654893875122
عمرو_زكي 0.6597729921340942
الزمالك 0.654998779296875
باسم_مرسي 0.6479896306991577
عماد_متعب 0.6467376947402954
وائل_جمعه 0.6452205181121826


  if np.issubdtype(vec.dtype, np.int):


In [9]:
type(t_model.wv)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [16]:
t_model.wv.save("w2v.kv")

In [22]:
t_model.wv.save_word2vec_format("w2v.bin",binary=True)

In [35]:
with open('res/part-00000-cead8de6-cc72-4ab1-b66b-7356e9ef8964-c000.csv','r') as input_file:
    l = input_file.readlines()
    for i in l:
        print(u"{}".format(i))

￙ﾆ￙ﾈ￘ﾱ￙ﾈ￘ﾲ

￘ﾧ￙ﾄ￘ﾳ￙ﾄ￘ﾧ￙ﾅ￙ﾇ￘ﾧ￘ﾯ￙ﾊ

￘ﾧ￙ﾄ￘ﾪ￘ﾧ￙ﾊ￙ﾄ￙ﾆ￘ﾯ￙ﾊ￘ﾧ￘ﾪ

￘ﾲ￙ﾁ￙ﾊ￘ﾱ

￙ﾆ￙ﾆￚﾯ

￘ﾷ￘ﾱ￘ﾯ_￘ﾧ￙ﾄ￘ﾨ￘ﾹ￘ﾫ￙ﾇ

￯ﾻﾗ￯ﾻﾠ￯ﾻﾮ￯ﾺﾑ￯ﾺﾎ

￙ﾆ￙ﾈ￘ﾱ_￘ﾧ￙ﾄ￙ﾂ￙ﾅ￘ﾱ

￙ﾈ￘ﾨ￘ﾱ￘ﾧ￘ﾦ￘ﾪ￙ﾃ

￙ﾈ￘ﾧ￙ﾈ￘ﾬ￘ﾹ￘ﾪ￙ﾆ￙ﾊ

#_￘ﾧ￙ﾂ￘ﾧ￙ﾄ￙ﾇ_￘ﾯ￙ﾈ￙ﾆ￙ﾊ￘ﾳ

￘ﾨ￘ﾪ￘ﾵ￘ﾨ￘ﾱ￙ﾊ￙ﾆ

￙ﾈ￘ﾨ￘ﾭ￘ﾯ￘ﾯ

￘ﾧ￘ﾪ￘ﾯ￙ﾄ￙ﾄ￙ﾊ

￙ﾈ￘ﾪ￘ﾨ￘ﾧ

￙ﾈ￘ﾧ￘ﾭ￙ﾊ￙ﾊ￘ﾪ￙ﾆ￙ﾊ

￘ﾧ￙ﾄ￘ﾨ￘ﾱ￙ﾊ￙ﾅ￙ﾊ￘ﾱ￙ﾄ￙ﾊ￙ﾂ

￢ﾖﾪ￯ﾸﾏ￢ﾚﾪ￯ﾸﾏ

￙ﾆ￘ﾳ￘ﾨ￘ﾪ￙ﾊ

￙ﾂ￘ﾱ￙ﾊ￘ﾨ￙ﾈ￘ﾧ￙ﾅ￘ﾳ￘ﾭ

￙ﾈ￘ﾨ￙ﾄ￘ﾧ￙ﾇ￘ﾧ

￘ﾧ￙ﾂ￘ﾪ￘ﾭ￘ﾧ￙ﾅ_￘ﾧ￙ﾄ￙ﾅ￙ﾆ￘ﾧ￘ﾲ￙ﾄ

￘ﾧ￘ﾧ￙ﾄ￙ﾂ￙ﾊ￘ﾧ￙ﾅ￙ﾇ￙ﾇ

￙ﾈ￘ﾧ￘ﾲ￙ﾇ￙ﾂ

￘ﾪ￙ﾄ￘ﾭ￙ﾁ￙ﾈ

￙ﾊ￘ﾭ￘ﾧ￙ﾃ

￘ﾧ￘ﾨ￘ﾧ￙ﾄ￙ﾇ￘ﾧ

^￙ﾈ￘ﾧ￙ﾆ￘ﾪ￙ﾊ

￘ﾵ￘ﾯ￘ﾱ￙ﾅ￙ﾆ

￘ﾳ￙ﾊ￙ﾃ￘ﾳ￘ﾨ

￘ﾹ￙ﾅ￘ﾱ_￘ﾹ￘ﾨ￘ﾯ_￘ﾧ￙ﾄ￙ﾃ￘ﾧ￙ﾁ￙ﾊ

￙ﾂ￘ﾨ￘ﾧ￘ﾧ￘ﾦ￙ﾄ

￙ﾅ￘ﾨ￘ﾱ￘ﾨ￘ﾱ

￙ﾆ￘ﾺ￙ﾅ￙ﾇ_￙ﾆ￘ﾴ￙ﾊ￘ﾯ

￘ﾧ￙ﾅ￘ﾱ￙ﾊ￙ﾃ￘ﾧ￢ﾀﾎ

￘ﾧ￙ﾄ￘ﾬ￙ﾅ￘ﾹ￙ﾇ_￙ﾁ￘ﾧ￙ﾃ￘ﾪ￘ﾨ_￙ﾄ￙ﾆ￘ﾧ

￙ﾃ￘ﾧ￙ﾆ￙ﾈ￘ﾧ_￘ﾨ￙ﾇ_￙ﾊ￘ﾳ￘ﾪ￙ﾇ￘ﾲ￘ﾦ￙ﾈ￙ﾆ

￘ﾭ￙ﾁ￘ﾸￚﾩ

￘ﾧ￙ﾄ￘ﾧ￘ﾳ￘ﾪ￘ﾺ￙ﾁ￘ﾧ￘ﾱ￢ﾙﾡ

￘ﾨ￘ﾧ￙ﾄ￘ﾳ￘ﾹ￙ﾈ￯﾿ﾽ

￘ﾱ￙ﾃ￘ﾹￛﾁ_￙ﾂ￘ﾯ

￙ﾈ￘ﾶ￙ﾊ￘ﾹ￘ﾪ￙ﾇ

￘ﾭ￙ﾊ￙ﾆ￰ﾟﾒﾛ

￘ﾨ￘ﾪ￙ﾆ￘ﾪ￙ﾁ

￘ﾨ￘ﾪ￘ﾹ￙ﾅ￙ﾄ￙ﾊ￙ﾄ￙ﾊ

￘ﾨ￘ﾧ￙ﾄ￘ﾸ￙ﾇ￘ﾱ￘ﾧ￙ﾆ

￢ﾜﾋ￳ﾾﾌﾴ

￘ﾧ￘ﾱ￙ﾈ￙ﾊ￘ﾧ

￙ﾅ￘ﾧ￘ﾪ￙ﾁ￘ﾪ￙ﾊ￘ﾴ

￙ﾆ￙ﾁ￘ﾳ￙ﾊ_￘ﾷ￘ﾱ￙ﾁ￙ﾇ

￙ﾅ￘ﾨ￙ﾂ￙ﾈ￙ﾂ￙ﾇ

￘ﾧ￙ﾄ￘ﾺ￘ﾧ￘ﾨ￘ﾧ￘ﾪ

￙ﾈ￙ﾊ￘ﾯ￙ﾈ￘ﾨ

￘ﾧ￙ﾄ￘ﾨ￙ﾊ￘ﾧ￘ﾲ￙ﾊ￙ﾆ

￙ﾄ￙ﾃ￘ﾱ￙ﾇ_￘ﾧ￙ﾄ￙ﾂ￘ﾯ￙ﾅ_￘ﾨ￘ﾧ￙ﾄ￙ﾆ￘ﾧ￘ﾯ￙ﾊ

￙ﾊ￘ﾶ￘ﾱ￙ﾈￚﾯ

￰ﾟﾑﾌ￰ﾟﾏﾻ

￙ﾈ￙ﾃ￘ﾰ￙ﾇ

￙ﾈ￘ﾱ￘ﾧ￙ﾅ￙ﾇ

￘ﾧ￙ﾄ￙ﾈ￘ﾷ￙ﾆ_￘ﾨ￙ﾄ￘ﾳ

￢ﾀﾹ￢ﾘﾺ>

￘

In [32]:
n_model = gensim.models.KeyedVectors.load_word2vec_format("w2v.bin",binary=True,encoding='utf-8')

In [40]:
if token in n_model.wv:
    most_similar = n_model.wv.most_similar( token, topn=10 )
    for term, score in most_similar:
        term = clean_str(term).replace(" ", "_")
        if term != token:
            m=term.encode('utf-8')
            print(m.decode('unicode'))
            print(term, score)

  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  if np.issubdtype(vec.dtype, np.int):


LookupError: unknown encoding: unicode