In [1]:
# -*- coding: utf-8 -*-
__author__ = 'nadya_motina'

import time
import codecs
import pandas
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
from gensim.models import Word2Vec
from itertools import combinations
from transliterate import translit

In [55]:
def process_word(i, data):
    # extracts necessary data from the table and make a readable word for word2vec model
    # input: index, table
    # output: "word_S"
    pos = data['PoS'][i]
    ipm = float(data['Freq(ipm)'][i])
    lemma = (data['Lemma'][i] + '_' + pos.upper()).decode('utf8')
    return lemma

def build_rng(word, model, topn=10):
    # 1) find N most similar words from the model
    # 2) for each combination of 2 words find if there is some other neighbor in between them
    #    * if there is no such word: add an edge
    # output: graph object
    wg = nx.DiGraph()
    neighbors = model.most_similar(word, topn=topn)
    neighbors.append((word, 1.0))
    for pair in combinations(neighbors, 2):
        word0 = pair[0][0]
        word1 = pair[1][0]
        similarity = model.similarity(word0, word1)
        if similarity > 0:
            remaining = [vertex for vertex in neighbors if vertex != pair[0] \
                         and vertex != pair[1]]
            for vertex in remaining:
                drawedge = True
                candidate = vertex[0]
                if model.similarity(candidate, word1) > similarity \
                and model.similarity(candidate, word0) > similarity:
                    drawedge = False
                    break
                if drawedge == True:
                    w1 = translit(pair[0][0], 'ru', reversed=True) 
                    w2 = translit(pair[1][0], 'ru', reversed=True)
                    wg.add_edge(w1, w2)#, cos_sim=similarity)
    return wg.to_undirected(), translit(word, 'ru', reversed=True)
 
def draw_rng(wg):
    print nx.info(wg)
    plt.figure(figsize=(5,5))
    plt.axis('off')
    pos = nx.circular_layout(wg)
    nx.draw_networkx(wg, pos=pos, with_labels=True, node_size=100 ,font_size=16,\
                     node_shape='o', alpha=0.3, node_color='blue')

In [3]:
start_time = time.time()
model = Word2Vec.load_word2vec_format('../../Diplom/models/ruscorpora.model.bin', binary=True)
print 'Model loaded successfully. It took ', time.time() - start_time, ' seconds'

Model loaded successfully. It took  15.6495559216  seconds


In [10]:
infile = codecs.open('../Lyashevskaya/freqrnc2011.csv', 'r', 'utf8')
data = pandas.read_csv(infile, sep='\t')
infile.close()
data.head()

Unnamed: 0,Lemma,PoS,Freq(ipm),Doc
0,быть,v,12160.7,34184
1,год,s,3727.5,29477
2,мочь,v,2912.3,25413
3,человек,s,2723.0,20423
4,сказать,v,2396.6,15426


**Теперь**
1. для начала для каждого слова из словаря найти ближайшие k соседей с помощью most_similar (пусть 100) и сохранить в список.
2. затем из каждого такого списка построить свой RNG
3. путем анализа таких RNG определять - сколько значений у данного слова

In [56]:
graph, node = build_rng('мир_N'.decode('utf8'), model, topn=100)
draw_rng(graph)
# nx.write_graphml(graph, "test.graphml", encoding='utf-8')
print nx.clustering(graph)[node]

KeyError: u"word '\u043c\u0438\u0440_N' not in vocabulary"

In [62]:
features = {}

for i in range(100):
    word = process_word(i, data)
    try:
        wg, node = build_rng(word, model, topn=100)
        #connectivity = nx.average_node_connectivity(wg)
        features[word] = nx.clustering(wg)[node]
    except: # Word not in model
        pass

show word-graphs with **high clustering** of the initial word

In [70]:
for i in features:
    c = features[i]
    if c >= 0.55:
        print i, '\t', c

вопрос_S 	0.600308641975
час_S 	0.574565883555
русский_A 	0.585225225225
земля_S 	0.580028666985
спросить_V 	0.73281075028
последний_A 	0.656438969765
стоить_V 	0.554341736695
жизнь_S 	0.557337610265
часть_S 	0.611717171717
считать_V 	0.704591265398


show word-graphs with **low clustering** of the initial word

In [68]:
for i in features:
    c = features[i]
    if c < 0.25:
        print i, '\t', c

сидеть_V 	0.226666666667
глаз_S 	0.243258749283
высокий_A 	0.206625258799
друг_S 	0.123563218391
жить_V 	0.248013090229
слово_S 	0.179964539007
увидеть_V 	0.199195171026
конец_S 	0.180774032459
голова_S 	0.243677375256
машина_S 	0.147147147147
право_S 	0.195378631892
женщина_S 	0.177777777778
