In [1]:
import pymorphy2
import torch

import youtokentome as yttm
import networkx as nx

from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

In [2]:
class Morpher:
    """Memoizing wrapper around a pymorphy-style morphological analyzer.

    The wrapped object must expose ``parse(word)`` returning a non-empty
    sequence of parses, each with ``normal_form`` and ``tag.POS`` attributes.
    Only the first parse returned by the analyzer is used.
    """

    def __init__(self, pymorphy_morpher):
        """Store the analyzer and start with an empty result cache.

        Args:
            pymorphy_morpher: analyzer object providing ``parse(word)``.
        """
        self.morpher = pymorphy_morpher
        # word -> (lemma, pos). The attribute name is kept as-is ("cash") so
        # any code that already reads it keeps working.
        self.cash = dict()

    def analyze(self, word):
        """Return ``(lemma, pos)`` for ``word``, parsing each word only once.

        Args:
            word: the token to analyze.

        Returns:
            A ``(lemma, pos)`` tuple taken from the first parse of ``word``.
        """
        if word in self.cash:
            return self.cash[word]

        full_info = self.morpher.parse(word)[0]
        result = (full_info.normal_form, full_info.tag.POS)
        # Remember the result; previously the cache was consulted but never
        # filled, so every call re-ran the (expensive) parse.
        self.cash[word] = result
        return result

In [3]:
def make_graph(corpus):
    """Build a co-occurrence graph from a corpus of tokenized texts.

    Every ordered pair of tokens that appear in the same text and map to
    different indices contributes one directed edge. Repeated pairs are kept,
    so the edge list may contain duplicates.

    Relies on the module-level ``token2idx`` and ``idx2token`` mappings.

    Args:
        corpus: iterable of texts, each an iterable of tokens that are keys
            of ``token2idx``.

    Returns:
        A ``Data`` object whose ``x`` holds every key of ``idx2token`` (the
        whole vocabulary, not only tokens seen in ``corpus``) and whose
        ``edge_index`` has shape ``(2, num_edges)``.

    Raises:
        KeyError: if a token is missing from ``token2idx``.
    """
    edges = []
    for text in corpus:
        # Resolve each token once per text instead of once per pair.
        node_ids = [token2idx[token] for token in text]
        for source in node_ids:
            for target in node_ids:
                if source != target:  # no self-loops
                    edges.append([source, target])

    x = torch.tensor(list(idx2token.keys()))
    # reshape(-1, 2) keeps the tensor two-dimensional even when there are no
    # edges; a bare torch.tensor([]) is 1-D and cannot be transposed.
    edge_index = (
        torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )
    graph = Data(x=x, edge_index=edge_index)

    return graph

In [4]:
# Wrap a fresh pymorphy2 analyzer in the Morpher helper defined above.
morph = Morpher(pymorphy2.MorphAnalyzer())

In [5]:
# Location of the input text file, relative to the working directory.
filepath = 'data/Bunin-Temnye_allei.txt'

In [6]:
# Load the whole file (cp1251-encoded), then normalise it to lower case.
with open(filepath, mode='r', encoding='cp1251') as source_file:
    corpus_raw = source_file.read()
corpus_raw = corpus_raw.lower()

In [7]:
# Break the text into lines. The guard keeps this cell safe to re-run: after
# the first execution corpus_raw is already a list, which has no .split().
if isinstance(corpus_raw, str):
    corpus_raw = corpus_raw.split('\n')

In [8]:
# Keep only lemmas whose part of speech is one of these tags; lines that end
# up with no such lemma are dropped from the corpus.
kept_pos = ('NOUN', 'INFN', 'VERB', 'ADJF')

unique_tokens = set()
corpus = []
for line in corpus_raw:
    analyses = (morph.analyze(word) for word in line.split())
    lemmas = [lemma for lemma, pos in analyses if pos in kept_pos]
    if lemmas:
        unique_tokens.update(lemmas)
        corpus.append(lemmas)

In [9]:
# Keep only the first 100 texts of the corpus.
# NOTE(review): unique_tokens was collected from every line before this
# truncation, so the vocabulary may contain lemmas that no longer occur in
# `corpus` -- confirm this is intended.
corpus = corpus[:100]

In [10]:
# Index the vocabulary in sorted order. Iterating the set directly yields an
# order that depends on string hash randomisation, so the saved indices would
# differ from one kernel session to the next.
token2idx = {token: idx for idx, token in enumerate(sorted(unique_tokens))}
# Inverse mapping; insertion order follows the indices 0..n-1.
idx2token = {idx: token for token, idx in token2idx.items()}

In [11]:
# idx2token keys are the contiguous range 0..n-1, so its length equals
# max(key) + 1 -- and, unlike max(), len() does not raise on an empty vocabulary.
vocab_size = len(idx2token)

In [12]:
# Hold out 20% of the texts for testing. The fixed random_state makes the
# shuffle (and therefore the saved graphs) reproducible across runs.
train_corpus, test_corpus = train_test_split(
    corpus, test_size=0.2, random_state=42
)

In [13]:
# Build one co-occurrence graph per split.
train_graph = make_graph(train_corpus)
test_graph = make_graph(test_corpus)

In [14]:
# Persist both vocabulary mappings with torch.save.
for mapping, target_path in (
    (token2idx, 'token2idx.pickle'),
    (idx2token, 'idx2token.pickle'),
):
    torch.save(mapping, target_path)

In [15]:
# Persist the train and test graphs with torch.save.
for split_graph, target_path in (
    (train_graph, 'train.pickle'),
    (test_graph, 'test.pickle'),
):
    torch.save(split_graph, target_path)

In [16]:
# G = nx.Graph(train_graph['edge_index'].tolist())

In [17]:
# nx.write_gexf(G, "test.gexf")

In [18]:
# nx.draw_networkx(G)