In [2]:
import numpy as np
import pickle as pkl
import gzip
from gensim.models.word2vec import Word2Vec
import gensim
from itertools import islice
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from conllu.parser import parse



In [3]:
# Load the pre-trained word vectors stored in word2vec binary format at ./wikiru.bin.
# NOTE(review): `Word2Vec.load_word2vec_format` is the legacy gensim entry point; newer gensim
# releases expose this loader on `KeyedVectors` instead — confirm against the installed version.
w2v_model = gensim.models.Word2Vec.load_word2vec_format('./wikiru.bin', binary=True)

In [3]:
from itertools import islice
# Display vocabulary entries 20000..20209 of the loaded model as a quick sanity check.
list(islice(w2v_model.vocab, 20000, 20210))

['лафета\xa0—',
 'мартыном',
 'суламиты',
 'bit»',
 'семиосоциопсихологии',
 'полнометражном',
 'будерим',
 'попахивает',
 'краям»',
 'тилия',
 'саo',
 '\xa0якушкиным',
 'цервикальная',
 'кловеp',
 'оказавшемуся',
 'приплывёт',
 'dnow',
 'рабельо',
 'сибутрамина',
 'прерывая',
 'kg/default',
 'ксизовского',
 'вьючная',
 'проследовала',
 'псикосмологию',
 'овендена',
 'королёве»',
 'ариевич',
 'премьерному',
 'всеволодово',
 'аддукция',
 'fingertips',
 'колитах',
 'нейронов\xa0—',
 'понтификов',
 'вихревой',
 'камийена',
 'amsterdam»\xa0—',
 'schneider»',
 'боннелл',
 'лыжин',
 'окислов',
 'пикве',
 'organy',
 'понятийной',
 'ужаснейших',
 'испаряться',
 'разбойничавших',
 'аналити́ческая',
 'стеффани',
 'nationalencyklopedin',
 'торжествовали',
 'кащее',
 'чего…',
 'нельзя…',
 'насаждений\xa0—',
 'coast»',
 'топирамат',
 'тримарана',
 'юрьянском',
 'тюлевый',
 'отдела\xa0—',
 'апеннинских',
 'строить\xa0—',
 'возобновятся',
 'скорости,',
 'металлика»',
 'да́угава',
 'вахмана',
 'рубэцу

In [4]:
# Normalisation table for part-of-speech tags: the recognised tags map to themselves,
# every other listed tag collapses to 'UNKN'. Key order matches the listing below.
pos_tags = {
    tag: (tag if tag in ('A', 'ADV', 'COM', 'CONJ', 'INTJ', 'NUM', 'PART', 'PR', 'S', 'V') else 'UNKN')
    for tag in (
        'A', 'ADV', 'COM', 'CONJ', 'ENG', 'INTJ', 'MAD', 'MID', 'NUM', 'PART', 'PR', 'S', 'UNKNW', 'V',
        'ВИН', 'ЖЕН', 'ИМ', 'МУЖ', 'ПР', 'РОД', 'СРЕД', 'ТВОР',
        'мд2', 'мн17', 'х5',
    )
}


In [5]:
# Entity label -> class index used for the NER targets.
# NOTE(review): 'MISC' and 'O' both map to index 3, so they are not distinguished — confirm this is intended.
label2Idx = {'LOC': 0, 'MISC': 3, 'O': 3, 'ORG': 1, 'PER': 2}

In [6]:
label2Idx

{'LOC': 0, 'MISC': 3, 'O': 3, 'ORG': 1, 'PER': 2}

In [7]:
def get_sentences(feature_data):
    """Group token rows into sentences.

    Rows are walked in their existing order. Missing values are first replaced
    with 0; rows whose ``lemma`` equals 0 after that are skipped. A row with
    ``isFirstWord == 1`` starts a new sentence.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        One row per token; must provide the ``lemma`` and ``isFirstWord`` columns.

    Returns
    -------
    list of list of pandas.Series
        Sentences in order of appearance, each a list of token rows. The
        trailing sentence is included (it used to be dropped because nothing
        flushed it after the loop). An input without usable rows yields ``[]``.
    """
    sentences = []
    sentence = []
    for _, row in feature_data.fillna(0).iterrows():
        if row['lemma'] == 0:
            continue
        # A first-word marker closes the sentence collected so far, if any.
        if row['isFirstWord'] == 1 and sentence:
            sentences.append(sentence)
            sentence = []
        sentence.append(row)

    # Flush the last sentence: no later first-word marker will ever close it.
    if sentence:
        sentences.append(sentence)
    return sentences

In [8]:
def get_feature_data_ud(data):
    """Flatten parsed sentences into a per-token feature table.

    Parameters
    ----------
    data : iterable of sequences of mappings
        Each sentence is a sequence of tokens exposing ``'upostag'``,
        ``'form'`` and ``'lemma'``.

    Returns
    -------
    (pandas.DataFrame, set)
        The frame has one row per token with the tag, the surface form, the
        lemma, five 0/1 string-shape flags of the form, and the
        ``isLastWord`` / ``isFirstWord`` flags. ``isLastWord`` is 1 for the
        last two tokens of a sentence. The set holds every ``upostag`` seen.
    """
    columns = ['pos', 'forma', 'lemma', 'isupper', 'istitle', 'islower', 'isdigit', 'isalpha', 'isLastWord', 'isFirstWord']
    rows = []
    labelSet = set()
    for sent in data:
        sent_len = len(sent)
        for position, word in enumerate(sent):
            tag = word['upostag']
            form = word['form']
            labelSet.add(tag)
            rows.append([
                tag,
                form,
                word['lemma'],
                int(form.isupper()),
                int(form.istitle()),
                int(form.islower()),
                int(form.isdigit()),
                int(form.isalpha()),
                # The last two positions of the sentence both count as "last word".
                1 if position in (sent_len - 2, sent_len - 1) else 0,
                1 if position == 0 else 0,
            ])
    print('ok2')
    feature_data = pd.DataFrame(rows, columns=columns)
    return feature_data, labelSet

In [9]:
def get_feature_data(folder='../data/', n_parts=20):
    """Load the WikiNER feature and target CSV parts into one DataFrame.

    For every part ``k`` in ``1..n_parts`` the files
    ``<folder>Wiki_full/WikiNER_part<k>_final.features.csv`` and
    ``<folder>Wiki_full/WikiNER_part<k>.targets.csv`` (both ``;``-separated)
    are read, joined column-wise, and the per-part frames are stacked row-wise.

    Parameters
    ----------
    folder : str, optional
        Data directory prefix, concatenated as-is with the relative file
        names (so it should end with a path separator).
    n_parts : int, optional
        Number of consecutive WikiNER parts to load, starting at part 1.

    Returns
    -------
    pandas.DataFrame
        All parts stacked in order; the original per-file row index is kept.
        An empty frame is returned when ``n_parts`` is 0.
    """
    feature_file = '_final.features.csv'
    target_file = '.targets.csv'
    feature_columns = ['forma', 'pos', 'link', 'len', 'lemma', 'isupper', 'istitle', 'islower',
                       'isdigit', 'isalpha', 'isalnum', 'isLastWord', 'isFirstWord', 'grm',
                       'posStartInText']

    part_frames = []
    for part in range(1, n_parts + 1):
        stem = folder + 'Wiki_full/WikiNER_part' + str(part)
        features = pd.read_csv(stem + feature_file, sep=';', usecols=feature_columns)
        targets = pd.read_csv(stem + target_file, sep=';')
        part_frames.append(pd.concat([features, targets], axis=1))
        print('ok')
    print('ok2')

    # Concatenate once at the end: growing a DataFrame inside the loop re-copies
    # all previously loaded rows on every iteration.
    if not part_frames:
        return pd.DataFrame()
    return pd.concat(part_frames, axis=0)

In [10]:
def get_dial_sentences():
    """Load the dialogue NER data and return it as (train, test) sentence lists.

    Reads the ``;``-separated feature and target CSVs of
    ``Dialog_train/NER_devset`` (train) and ``Dialog_test/NER_testset`` (test)
    under ``../data/``, keeps a fixed subset of feature columns, joins the
    targets column-wise and groups the rows with ``get_sentences``.
    """
    data_dir = '../data/'
    features_suffix = '.features.csv'
    targets_suffix = '.targets.csv'
    test_stem = 'Dialog_test/NER_testset'
    train_stem = 'Dialog_train/NER_devset'
    kept_columns = ['forma', 'pos', 'link', 'len', 'lemma', 'isupper', 'istitle', 'islower',
                    'isdigit', 'isalpha', 'isalnum', 'isLastWord', 'isFirstWord', 'grm',
                    'posStartInText', 'posStart']

    train_features = pd.read_csv(data_dir + train_stem + features_suffix, sep=';')
    test_features = pd.read_csv(data_dir + test_stem + features_suffix, sep=';')
    train_targets = pd.read_csv(data_dir + train_stem + targets_suffix, sep=';')
    test_targets = pd.read_csv(data_dir + test_stem + targets_suffix, sep=';')

    train_frame = pd.concat([train_features[kept_columns], train_targets], axis=1)
    test_frame = pd.concat([test_features[kept_columns], test_targets], axis=1)

    return get_sentences(train_frame), get_sentences(test_frame)
    

In [11]:
def get_wiki_sentences():
    """Load the WikiNER data and return it as (train, test) sentence lists.

    The token table from ``get_feature_data`` is grouped into sentences first,
    and the 75/25 split (fixed ``random_state=42``) is then applied to whole
    sentences. Splitting the token table directly would shuffle individual
    rows, so the ``isFirstWord`` markers would no longer delimit real
    sentences when ``get_sentences`` regroups them.

    Returns
    -------
    (list, list)
        Train and test sentences, each sentence being a list of token rows.
    """
    sentences = get_sentences(get_feature_data())
    train_sentences, test_sentences = train_test_split(sentences, test_size=0.25, random_state=42)
    return train_sentences, test_sentences

In [12]:
def get_ud_sentences():
    """Load the UD SynTagRus CoNLL-U files and return (train, test) sentence lists.

    The dev, test and train ``.conllu`` files under ``../data/`` are parsed and
    pooled, converted to a per-token feature table with
    ``get_feature_data_ud``, grouped into sentences, and then split 75/25 at
    sentence level (fixed ``random_state=42``). Splitting the token table
    directly would shuffle individual rows and scramble sentence boundaries.

    Prints the number of parsed sentences and the set of POS tags found.

    Returns
    -------
    (list, list)
        Train and test sentences, each sentence being a list of token rows.
    """
    folder = '../data/'
    files_ud = ['UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu',
                'UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu',
                'UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu']
    data = []
    for fl in files_ud:
        # Explicit UTF-8 so the Cyrillic text does not depend on the platform's
        # default encoding; `with` closes the file even if parsing fails.
        with open(folder + fl, "r", encoding="utf-8") as ff:
            data.extend(parse(ff.read()))
    print(len(data))
    feature_data_ud, labelSet = get_feature_data_ud(data)
    print(labelSet)
    sentences_ud = get_sentences(feature_data_ud)
    train_sentences_ud, test_sentences_ud = train_test_split(sentences_ud, test_size=0.25, random_state=42)
    return train_sentences_ud, test_sentences_ud

In [13]:
# Dialogue NER sentences: train comes from Dialog_train/NER_devset, test from Dialog_test/NER_testset.
train_sentences_dial, test_sentences_dial = get_dial_sentences()

In [14]:
# UD SynTagRus sentences (dev + test + train files pooled, then split into train/test).
train_sentences_ud, test_sentences_ud = get_ud_sentences()

60551
ok2
{'DET', 'CONJ', 'NOUN', 'ADV', 'VERB', 'ADJ', 'PROPN', 'PRON', 'ADP', 'INTJ', 'PART', 'SCONJ', 'NUM', 'SYM', 'X', 'PUNCT', 'AUX'}


In [15]:
# WikiNER sentences (all parts pooled, then split into train/test); prints 'ok' once per loaded part.
train_sentences_wiki, test_sentences_wiki = get_wiki_sentences()

ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok2


In [16]:
# POS tag -> class index for the UD data; covers the 17 tags printed by get_ud_sentences().
label_ud2Idx = {'VERB':0, 'PART':1, 'NUM':2, 'X':3, 'INTJ':4, 'NOUN':5, 'DET':6, 'PUNCT':7, 'CONJ':8, 'PROPN':9, 'ADJ':10, 'PRON':11, 'SCONJ':12, 'SYM':13, 'AUX':14, 'ADV':15, 'ADP':16}

In [None]:
# Collect every distinct character occurring in a lemma of the dialogue train/test sentences.
chars = set()
for dataset in (train_sentences_dial, test_sentences_dial):
    for sentence in dataset:
        for token in sentence:
            # Updating a set from a string adds its characters one by one.
            chars.update(token['lemma'])

In [None]:
# Freeze the character set into a sorted list and build the lookup tables in both directions.
chars = sorted(chars)
print('total chars:', len(chars))
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = {idx: ch for idx, ch in enumerate(chars)}
# Filled by createMatrices1 with one character-presence vector per token.
charEmbeddings = []

In [None]:
# Vocabulary of '<lowercased lemma>_<normalised POS tag>' keys from the dialogue train/test sentences.
# NOTE(review): `words` is rebound by a later cell that keys on the lowercased `forma` instead.
words = {}
for dataset in (train_sentences_dial, test_sentences_dial):
    for sentence in dataset:
        for token in sentence:
            words.setdefault(token['lemma'].lower() + '_' + pos_tags[token['pos']], True)

In [29]:
# Lowercased surface forms from the UD, dialogue-train and WikiNER sentences.
# NOTE(review): test_sentences_dial is not part of this list, unlike the `words` vocabulary
# built from `forma` elsewhere in the notebook — confirm the omission is intended.
words_ud = {}
for dataset in (train_sentences_ud, test_sentences_ud, train_sentences_dial,
                train_sentences_wiki, test_sentences_wiki):
    for sentence in dataset:
        for token in sentence:
            words_ud.setdefault(token['forma'].lower(), True)

In [18]:
# Vocabulary of lowercased surface forms over all six sentence collections (UD, dialogue, WikiNER).
words = {}
for dataset in (train_sentences_ud, test_sentences_ud,
                train_sentences_dial, test_sentences_dial,
                train_sentences_wiki, test_sentences_wiki):
    for sentence in dataset:
        for token in sentence:
            words.setdefault(token['forma'].lower(), True)


In [19]:
# word -> row index into wordEmbeddings; filled by the embedding-lookup cell.
word2Idx = {}
# Embedding vectors; the embedding-lookup cell puts the shared out-of-vocabulary vector at row 0.
wordEmbeddings = []

In [20]:
len(words)

262570

In [32]:
# Build the embedding matrix and the word -> row lookup for every word in `words`.
# Both containers are recreated here so that re-running this cell does not stack a second
# out-of-vocabulary vector and duplicate rows on top of the previous run.
word2Idx = {}
wordEmbeddings = []

# Row 0: a single random vector shared by all words missing from the word2vec vocabulary.
# A dedicated seeded generator makes it reproducible without touching the global NumPy RNG.
# 300 is assumed to match the dimensionality of the loaded vectors — TODO confirm.
vector = np.random.RandomState(42).uniform(-0.25, 0.25, 300)
wordEmbeddings.append(vector)

bad_words = 0
for word in words:
    if word in w2v_model.vocab:
        wordEmbeddings.append(w2v_model[word])
        word2Idx[word] = len(wordEmbeddings) - 1
    else:
        # Out-of-vocabulary words all point at the shared row 0.
        bad_words += 1
        word2Idx[word] = 0
print("bad_words:", bad_words)

bad_words: 32250


In [22]:
# Per-token feature vectors; the createMatrices_* functions append to this list and store the row index.
caseEmbeddings = []
#caseFields = ['isupper', 'istitle', 'islower', 'isdigit', 'isalpha', 'isalnum', 'isLastWord', 'isFirstWord']
# Token fields copied, in this order, into each caseEmbeddings row ('isalnum' is left out of the active list).
caseFields = ['isupper', 'istitle', 'islower', 'isdigit', 'isalpha', 'isLastWord', 'isFirstWord']

In [23]:
def createMatrices_dial(sentences, word2Idx, label2Idx):
    """Convert dialogue sentences into index-based training records.

    For every token the lowercased ``forma`` is looked up in ``word2Idx``, the
    ``caseFields`` values are appended as a new row to the module-level
    ``caseEmbeddings`` list, and a label entry
    ``[label index, posStartInText, FileName, len]`` is recorded.

    Returns a list with one ``[wordIndices, caseIndices, labelIndices]`` triple
    per sentence.
    """
    records = []
    for sent in sentences:
        word_ids, case_ids, label_rows = [], [], []
        for tok in sent:
            word_ids.append(word2Idx[tok['forma'].lower()])
            # The new row's position in caseEmbeddings serves as the token's case index.
            caseEmbeddings.append([tok[name] for name in caseFields])
            case_ids.append(len(caseEmbeddings) - 1)
            label_rows.append([label2Idx[tok['mark']], tok['posStartInText'], tok['FileName'], tok['len']])
        records.append([word_ids, case_ids, label_rows])
    return records


In [24]:
def createMatrices_wiki(sentences, word2Idx, label2Idx):
    """Convert WikiNER sentences into index-based training records.

    For every token the lowercased ``forma`` is looked up in ``word2Idx``, the
    ``caseFields`` values are appended as a new row to the module-level
    ``caseEmbeddings`` list, and a one-element label entry
    ``[label2Idx[mark]]`` is recorded.

    Returns a list with one ``[wordIndices, caseIndices, labelIndices]`` triple
    per sentence.
    """
    records = []
    for sent in sentences:
        word_ids, case_ids, label_rows = [], [], []
        for tok in sent:
            word_ids.append(word2Idx[tok['forma'].lower()])
            # The new row's position in caseEmbeddings serves as the token's case index.
            caseEmbeddings.append([tok[name] for name in caseFields])
            case_ids.append(len(caseEmbeddings) - 1)
            label_rows.append([label2Idx[tok['mark']]])
        records.append([word_ids, case_ids, label_rows])
    return records

In [25]:
def createMatrices_ud(sentences, word2Idx, label2Idx):
    """Convert UD sentences into index-based training records.

    For every token the lowercased ``forma`` is looked up in ``word2Idx``, the
    ``caseFields`` values are appended as a new row to the module-level
    ``caseEmbeddings`` list, and a one-element label entry
    ``[label2Idx[pos]]`` is recorded.

    Parameters
    ----------
    sentences : iterable of sequences of token rows
    word2Idx : mapping from lowercased word form to embedding row index
    label2Idx : mapping from POS tag (the token's ``pos`` field) to class index.
        The mapping passed in is now the one used; previously this argument
        was ignored and the global ``label_ud2Idx`` was read instead.

    Returns
    -------
    list
        One ``[wordIndices, caseIndices, labelIndices]`` triple per sentence.
    """
    dataset = []
    for sentence in sentences:
        wordIndices = []
        caseIndices = []
        labelIndices = []
        for token in sentence:
            wordIndices.append(word2Idx[token['forma'].lower()])
            # The new row's position in caseEmbeddings serves as the token's case index.
            caseEmbeddings.append([token[field] for field in caseFields])
            caseIndices.append(len(caseEmbeddings) - 1)
            labelIndices.append([label2Idx[token['pos']]])
        dataset.append([wordIndices, caseIndices, labelIndices])
    return dataset

In [26]:
def createMatrices1(sentences, word2Idx, label2Idx):
    """Convert sentences into index-based records keyed by lemma and POS tag.

    Words are looked up in ``word2Idx`` under
    ``'<lowercased lemma>_<pos_tags[pos]>'``. Each token also contributes:

    * a row of ``caseFields`` values appended to the module-level ``caseEmbeddings``;
    * a vector of length ``len(chars)`` with a 1 at the index of every
      character of the lemma, appended to the module-level ``charEmbeddings``;
    * a label entry ``[label index, posStartInText, FileName, len, BIO index]``.

    NOTE(review): ``label_bio2Idx`` is read as a global but is not defined in
    the visible part of the notebook — confirm it exists before calling this.

    Returns a list with one
    ``[wordIndices, caseIndices, labelIndices, charIndices]`` entry per sentence.
    """
    records = []
    for sent in sentences:
        word_ids, case_ids, label_rows, char_ids = [], [], [], []
        for tok in sent:
            word_ids.append(word2Idx[tok['lemma'].lower() + '_' + pos_tags[tok['pos']]])
            case_row = [tok[name] for name in caseFields]

            # Mark which characters of the alphabet occur in the lemma.
            char_presence = np.zeros((len(chars)))
            for ch in tok['lemma']:
                char_presence[char_indices[ch]] = 1
            charEmbeddings.append(char_presence)
            char_ids.append(len(charEmbeddings) - 1)

            caseEmbeddings.append(case_row)
            case_ids.append(len(caseEmbeddings) - 1)

            label_rows.append([label2Idx[tok['mark']], tok['posStartInText'], tok['FileName'], tok['len'], label_bio2Idx[tok['BIO']]])
        records.append([word_ids, case_ids, label_rows, char_ids])
    return records


In [31]:
# Build the index-based datasets and persist them, together with the embedding tables, as gzip pickles.
folder = '../data/'
outputFilePath = folder + 'data_dial_wiki_all_embed.pkl.gz'
embeddingsPklPath = folder + 'embeddings_dial_wiki_all_embed.pkl.gz'

# The createMatrices_* functions append to the module-level caseEmbeddings list; empty it
# first so that re-running this cell does not keep rows from an earlier run.
del caseEmbeddings[:]

test_set_dial = createMatrices_dial(test_sentences_dial, word2Idx, label2Idx)
train_set_dial = createMatrices_dial(train_sentences_dial, word2Idx, label2Idx)
test_set_ud = createMatrices_ud(test_sentences_ud, word2Idx, label_ud2Idx)
train_set_ud = createMatrices_ud(train_sentences_ud, word2Idx, label_ud2Idx)
test_set_wiki = createMatrices_wiki(test_sentences_wiki, word2Idx, label2Idx)
train_set_wiki = createMatrices_wiki(train_sentences_wiki, word2Idx, label2Idx)

embeddings = {'wordEmbeddings': wordEmbeddings, 'word2Idx': word2Idx,
              'caseEmbeddings': caseEmbeddings, 'label2Idx': label2Idx,
              'label_ud2Idx': label_ud2Idx}

# `with` guarantees the gzip streams are flushed and closed even if a dump raises.
with gzip.open(embeddingsPklPath, 'wb') as f:
    pkl.dump(embeddings, f, -1)

# The six datasets are written back-to-back into one file; a reader must load them in this order.
with gzip.open(outputFilePath, 'wb') as f:
    pkl.dump(test_set_dial, f, -1)
    pkl.dump(train_set_dial, f, -1)
    pkl.dump(test_set_ud, f, -1)
    pkl.dump(train_set_ud, f, -1)
    pkl.dump(test_set_wiki, f, -1)
    pkl.dump(train_set_wiki, f, -1)
