#### Get text and duration dataframe

In [1]:
import torch
import os
import pandas as pd
import numpy as np
from navec import Navec
from slovnet.model.emb import NavecEmbedding
from typing import Union

In [2]:
# Names of the two columns this notebook works with.
text_column_name = 'speaker_text'
duration_column_name = 'duration'

# Location of the raw crowd-test table on disk.
data_path = '/home/viktor/Projects/Data/MagistracyDeploma/'
test_path = data_path + 'crowd_test/'
raw_crowd_test_path = test_path + 'raw_crowd_test.tsv'

# The raw file is tab-separated.
raw_crowd_test = pd.read_csv(raw_crowd_test_path, sep='\t')

# Keep only the text and duration columns, then drop every row that has a
# missing value in either of them.
duration_text = raw_crowd_test[[text_column_name, duration_column_name]].dropna()
# duration_text.info()
duration_text.head()

Unnamed: 0,speaker_text,duration
0,я слушаю,5.82
1,каким стал сбер,3.7
2,где родился шерлок холмс,4.38
3,открой в браузере ennio morricone,8.58
4,каким стал сбер,3.7


In [3]:
# Display (rows, columns) of the text/duration frame after rows with missing values were dropped.
duration_text.shape

(77833, 2)

#### Word embeddings

In [4]:
# Pretrained Navec archive to load.
file_name = 'navec_hudlit_v1_12B_500K_300d_100q.tar'  # 51MB
data_folder = '/home/viktor/Projects/Data/MagistracyDeploma/TestHypotheses/'
path = os.path.join(data_folder, file_name)

navec = Navec.load(path)  # ~1 sec, ~100MB RAM

# Handles reused by the rest of the notebook: the word -> id vocabulary and
# the embedding layer built on top of the loaded Navec model.
vocab = navec.vocab
emb = NavecEmbedding(navec)

# Smoke test: look up one ordinary word plus the two special tokens and push
# their ids through the embedding layer.
words = ['навек', '<unk>', '<pad>']
ids = [vocab[word] for word in words]
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here in case other cells read it.
input = torch.tensor(ids)

emb(input).shape  # 3 x 300


  torch.from_numpy(navec.pq.indexes),


torch.Size([3, 300])

In [5]:
# emb(torch.tensor(navec.vocab['яма']))
def word_to_emb(emb, vocab, word_or_words: Union[str, list[str]]) -> Union[torch.Tensor, list[torch.Tensor]]:
    """Look up embedding vector(s) for a single word or a collection of words.

    Args:
        emb: Callable that maps a tensor holding a vocabulary id to its embedding.
        vocab: Mapping from word to vocabulary id; must raise ``KeyError`` for
            unknown words.
        word_or_words: Either one word (``str``) or an iterable of words.

    Returns:
        For a single word: the tensor returned by ``emb`` for that word.
        For a collection: a list with one tensor per word found in ``vocab``,
        in input order. Words missing from ``vocab`` are skipped, so the list
        may be shorter than the input, or empty.

    Raises:
        KeyError: If a single word is given and it is not in ``vocab``.
    """
    def single_word_to_emb(word):
        return emb(torch.tensor(vocab[word]))

    # Dispatch on "is it a string" rather than "is it exactly a list", so that
    # tuples, list subclasses and other iterables of words are handled too.
    if isinstance(word_or_words, str):
        return single_word_to_emb(word_or_words)

    result = []
    for word in word_or_words:
        try:
            word_emb = single_word_to_emb(word)
        except KeyError:
            # Out-of-vocabulary word: deliberately skipped (best effort).
            continue
        result.append(word_emb)
    return result

# Embed one known word; its shape becomes the default embedding size used
# by later cells, and the vector itself is shown as the cell output.
example_word = 'кот'
example_embedding = word_to_emb(emb, vocab, example_word)
default_embedding_size = example_embedding.shape
example_embedding
# word_to_emb(emb, vocab, ['витя', 'ваня'])

tensor([-0.4161, -0.8234,  0.1041,  0.2171, -0.1972, -0.2072,  0.4124,  0.3353,
         0.4763, -0.1746, -0.3656,  0.6460, -0.2359, -0.4967,  0.2351,  0.0293,
         0.5579, -0.1821,  0.3460, -0.4691, -0.1072, -0.2880, -0.1078,  0.0996,
        -0.4523,  0.4564,  0.6884, -0.1146, -0.0627, -0.2884,  0.3378, -0.2925,
         0.5172,  0.8344, -0.2078, -0.0337, -0.0421,  0.2375,  0.3722,  0.1258,
         0.1039, -0.2675, -0.1411,  0.1203, -0.4903, -0.0273, -0.0957, -0.0155,
         0.3530, -0.0187, -0.2534,  0.0073, -0.1871,  0.2632,  0.0475, -0.4049,
        -0.0334, -0.0777,  0.2896,  0.1553, -0.1509, -0.3095,  0.1722, -0.1822,
        -0.0854, -0.1743,  0.2572,  0.0155,  0.3648,  0.0846,  0.1715, -0.3526,
         0.7443,  0.0164,  0.5548, -0.1972,  0.3350, -0.0060, -0.2826,  0.1762,
        -0.2713,  0.0669, -0.3925,  0.2355,  0.2635,  0.2953, -0.6040,  0.1477,
         0.0168, -0.6317,  0.0766, -0.4718, -0.2756,  0.3199, -0.2244, -0.0384,
         0.2538, -0.1047,  0.2621,  0.05

In [6]:
def bag_of_words(emb, vocab, words: list[str], embedding_size = default_embedding_size) -> torch.Tensor:
    """Average the embeddings of the given words into one vector.

    Args:
        emb: Embedding callable forwarded to ``word_to_emb``.
        vocab: Word -> id mapping forwarded to ``word_to_emb``.
        words: Words of one text. Words missing from ``vocab`` are ignored
            (``word_to_emb`` skips them for list input).
        embedding_size: Shape of the accumulator / result vector.

    Returns:
        The element-wise mean of the embeddings that were found. If none of
        the words is in the vocabulary (or ``words`` is empty), a zero vector
        of shape ``embedding_size`` is returned instead of dividing by zero,
        which would otherwise produce an all-NaN vector.
    """
    word_embeddings = word_to_emb(emb, vocab, words)
    vector_sum = torch.zeros(embedding_size)

    # No known words: return the zero vector rather than computing 0 / 0 = NaN.
    if len(word_embeddings) == 0:
        return vector_sum

    for word_embedding in word_embeddings:
        vector_sum += word_embedding

    return vector_sum / len(word_embeddings)

# Smoke test: build the bag-of-words vector for a two-word input and display its shape.
bag_of_words(emb, vocab, ['кот', 'телефон']).shape

torch.Size([300])

In [7]:
# One bag-of-words vector per text: lower-case the text, split it on single
# spaces, and average the embeddings of the resulting tokens.
text_bags_of_words = [
    bag_of_words(emb, vocab, text.lower().split(' '))
    for text in duration_text['speaker_text']
]

In [8]:
# len(text_bags_of_words[0])
# text_bags_of_words = torch.stack(text_bags_of_words)
# text_bags_of_words.shape

In [9]:
# text_bags_of_words = pd.DataFrame.from_records(text_bags_of_words)

In [21]:
# Stack the per-text vectors into a single 2-D tensor (one row per text) and
# give pandas one NumPy array. Passing the raw Python list of tensors makes
# pandas convert the elements individually, which is slow and appears to be
# what triggers the NumPy conversion warning on this cell.
# `dtype='float'` is kept so the resulting columns are float64 as before.
text_bags_of_words_df = pd.DataFrame(
    torch.stack(text_bags_of_words).detach().cpu().numpy(),
    dtype='float',
)
text_bags_of_words_df.head()

  subarr = np.array(arr, dtype=dtype, copy=copy)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.197958,-0.149894,0.032033,-0.102101,-0.219139,-0.263596,-0.225147,0.25893,0.117051,0.100377,...,-0.068739,0.077948,-0.084094,0.1139,-0.028043,-0.415415,0.147565,-0.205027,-0.019849,0.055699
1,0.062094,-0.474676,-0.409619,0.015491,-0.104335,-0.269172,-0.068685,0.292174,-0.271988,-0.157699,...,-0.213001,-0.233934,-0.219408,-0.407954,0.156611,-0.03621,-0.002245,-0.142691,-0.161408,0.285355
2,-0.199201,-0.20197,-0.107651,0.102246,0.001738,-0.182565,-0.134217,-0.035429,0.11974,-0.133304,...,-0.142165,-0.002077,-0.073691,-0.169712,0.187998,-0.286952,0.018901,-0.215588,-0.194586,-0.00022
3,0.136984,-0.363209,-0.340083,0.231929,-0.179154,0.08178,0.023477,0.386594,0.389532,-0.062063,...,-0.29422,0.183978,0.257248,-0.128505,0.063109,-0.345005,0.086596,0.171087,0.158903,-0.257887
4,0.062094,-0.474676,-0.409619,0.015491,-0.104335,-0.269172,-0.068685,0.292174,-0.271988,-0.157699,...,-0.213001,-0.233934,-0.219408,-0.407954,0.156611,-0.03621,-0.002245,-0.142691,-0.161408,0.285355


In [22]:
# Display (rows, columns) of the bag-of-words feature table.
text_bags_of_words_df.shape

(77833, 300)