In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import wave
import sys
import soundfile as sf
# from pydub import AudioSegment
from scipy.io import wavfile
from dataclasses import (
    dataclass,
    asdict,
)
from typing import (
    Optional,
    Callable,
    Set,
    Generator,
    List,
    Tuple,
    Union,
    Dict,
)
import time
import dill
import logging
import multiprocessing
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import transformers
import torch
import pickle
import seaborn as sns
from enum import Enum
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
)
from sklearn.feature_extraction.text import TfidfVectorizer

# from bdw.check import Check

sys.path.append('..')
from audio import (
    Audio,
    WAVFilePathInitArgs,
)
from text.profanity import (
    PROFANITY_WORD_FILTER_LANG_NAME,
)
from configs.base import (
    RB_OPEN_FILE_MODE,
    SECONDS_QUANTITY_IN_MINUTE,
    TAB,
    RUSSIAN_VOWELS,
    WB_OPEN_FILE_MODE,
    DROP_DUPLICATES_KEEP_FIRST,
    JOIN_HOW_INNER,
)
from configs.paths import (
    DUSHA_CROWD_TRAIN_FILE_PATH,
    DUSHA_CROWD_TEST_FILE_PATH,
    DUSHA_CROWD_TRAIN_WAVS_DIR_PATH,
    DUSHA_CROWD_TEST_WAVS_DIR_PATH,
    PROCESSED_DUSHA_CROWD_TRAIN_HLF_LAST_VERSION_FILE_PATH,
    PROCESSED_DUSHA_CROWD_TEST_HLF_LAST_VERSION_FILE_PATH,
    DO_NOT_EXTRACTED_FEATUERS_HASHES_FILE_PATH,
)
from configs.report_tables_format import (
    classification_report_formatted,
)
from processing.text.normalization import (
    normalized_tokens_2_normalized_text,
    text_2_normalized_text,
)
from high_level_feature_extractor.text.profanity import (
    text_2_is_contain_swear_words,
)
from high_level_feature_extractor.text.all import (
    TranscriptionHighLevelFeatures,
)
from high_level_feature_extractor.extractor import (
    HighLevelSpeechFeatures,
    HashHLF,
    hash_HLF_list_2_df,
    PronounceSpeed,
)
from high_level_feature_extractor.extract import (
    raw_crowd_2_HLF,
)
from utils.dataclass import (
    flatten_dict,
)
from volume.human_speech import (
    HIGH_FREQUENCY_SPEECH_THRESHOLD,
)
from configs.paths import (
    PROCESSED_DUSHA_CROWD_TRAIN_DIR_PATH,
    PROCESSED_DUSHA_CROWD_TEST_DIR_PATH,
    PROCESSED_DUSHA_CROWD_TRAIN_TEXT_EMBEDDINGS_LAST_VERSION_FILE_PATH,
    PROCESSED_DUSHA_CROWD_TEST_TEXT_EMBEDDINGS_LAST_VERSION_FILE_PATH,
)
from models.config import (
    TORCH_TENSORS_KEYWOED,
    ATTENTION_MASK_KEYWORD,
)
from models.text_embedding.ru_en_RoSBERTa import (
    DEVICE as ROSBERTA_DEVICE,
    NORMALIZE_P as ROSBERTA_NORMALIZE_P,
    NORMALIZE_DIM as ROSBERTA_NORMALIZE_DIM,
    CLAMP_MIN,
)
from config import (
    SPEAKER_TEXT_FIELD_NAME,
)
from utils.parallel_processing import (
    divide_into_chunks,
)
from configs.datasets.dusha import (
    HASH_ID_COLUMN_NAME,
    GoldenEmo,
    SPEAKER_EMOTION_FIELD_NAME,
)
from processing.text.normalization import (
    text_to_normalized_tokens,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Render DataFrames without truncating columns or rows.
# NOTE(review): with max_rows=None an accidental display of a full table prints every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Enable IPython's autoreload extension so edits to the project modules imported above
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload all

# Extract

## HLF

In [6]:
# Single WAV file used by the example cells that follow.
# NOTE(review): absolute, machine-specific path — consider deriving it from
# DUSHA_CROWD_TRAIN_WAVS_DIR_PATH so the notebook runs on other machines.
EXAMPLE_AUDIO_PATH:Path = Path('/data01/vvkiselev/data/other/dpl/dusha/crowd/crowd_train/wavs/000039c2bc753aa5a776621a4707eb73.wav')

In [7]:
# Build an Audio object from the example WAV plus a hand-written transcription.
# The transcription is test input for the text features, not the real transcript.
# Previous call style, kept for reference:
# audio_example:Audio = Audio.wav_file_path_init(path=EXAMPLE_AUDIO_PATH, transcription='ахах, пиздец')
arguments:WAVFilePathInitArgs = WAVFilePathInitArgs(path=EXAMPLE_AUDIO_PATH, transcription='ахах, пиздец')
audio_example:Audio = Audio.wav_file_path_init(arguments=arguments)
audio_example

Audio(hash='000039c2bc753aa5a776621a4707eb73', sample_width=2, sr=16000, n_frames=165120, data=array([ 0,  0,  0, ..., -2,  6, -9], dtype=int16), n_channels=1, _transcription='ахах, пиздец')

In [8]:
# Compute the high-level speech features for the example audio and display them.
# Alternative constructor that starts from a path instead of an Audio object, kept for reference:
# HLF_example:HighLevelSpeechFeatures = HighLevelSpeechFeatures.wav_path_init(path=EXAMPLE_AUDIO_PATH, transcription='бля зачем')
HLF_example:HighLevelSpeechFeatures = HighLevelSpeechFeatures.audio_init(audio=audio_example)
HLF_example

type(tfidf_matrix) = <class 'scipy.sparse._csr.csr_matrix'>


HighLevelSpeechFeatures(loudness=59.77829826935232, HF_power_ratio=0.054469042401762625, pronounce_speed=PronounceSpeed(WPS=1.065891472868217, LPS=0.9689922480620154, SPS=0.38759689922480617), transcription_features=TranscriptionHighLevelFeatures(mean_words_length=5.0, profanity_words_ratio=0.5, meaning=4.454233000760066e-05))

In [11]:
# Load the crowd annotation table of the train split (separator: project TAB constant)
# and show its shape and first rows.
raw_crowd_train = pd.read_csv(DUSHA_CROWD_TRAIN_FILE_PATH, sep=TAB)
print(raw_crowd_train.shape)
display(raw_crowd_train.head())

# Same for the test split.
raw_crowd_test = pd.read_csv(DUSHA_CROWD_TEST_FILE_PATH, sep=TAB)
print(raw_crowd_test.shape)
display(raw_crowd_test.head())

(906953, 9)


Unnamed: 0,hash_id,audio_path,duration,annotator_emo,golden_emo,annotator_id,speaker_text,speaker_emo,source_id
0,475e76f77ac1ed7cabafca740b15b32a,wavs/475e76f77ac1ed7cabafca740b15b32a.wav,2.453,angry,,858305a5450b7bd1288ba0053b1cd1c1,не надо не надо не надо не надо,angry,fa136da095807ea6cd18dd6e2f58d4d0
1,2f9438ef68395c70a8714dc373a49d11,wavs/2f9438ef68395c70a8714dc373a49d11.wav,4.64,neutral,,858305a5450b7bd1288ba0053b1cd1c1,фозил кори mp три,neutral,3d436884cbbe25373914f8768de494f7
2,9937036a9c0dba20eecbffddd00f2be2,wavs/9937036a9c0dba20eecbffddd00f2be2.wav,4.34175,neutral,2.0,858305a5450b7bd1288ba0053b1cd1c1,,,
3,fb0ae78586a235018103acec22a80a8f,wavs/fb0ae78586a235018103acec22a80a8f.wav,3.900562,neutral,,858305a5450b7bd1288ba0053b1cd1c1,сколько стоит на керамбит,neutral,80bc833cf6b3f106d2e8991783a31e2b
4,196dcf9e1aaac46c2aee45e7f6adfb92,wavs/196dcf9e1aaac46c2aee45e7f6adfb92.wav,4.78,neutral,,858305a5450b7bd1288ba0053b1cd1c1,афина когда закончится эта телепередача,neutral,bd78f079676fa5f1ed17253c9a440cc6


(79088, 9)


Unnamed: 0,hash_id,audio_path,duration,annotator_emo,golden_emo,annotator_id,speaker_text,speaker_emo,source_id
0,9e9961c53ca6eeb440b217e539fbf46c,wavs/9e9961c53ca6eeb440b217e539fbf46c.wav,5.82,neutral,,858305a5450b7bd1288ba0053b1cd1c1,я слушаю,neutral,4282ddc30d71ef420e202e0c60391e9f
1,0166f65a30354db8282682b1a280e64c,wavs/0166f65a30354db8282682b1a280e64c.wav,3.7,sad,,858305a5450b7bd1288ba0053b1cd1c1,каким стал сбер,neutral,d70dc98ed56e9362eaefefb7b2827c8f
2,d49a6b560155831725a7bdc7d0a96099,wavs/d49a6b560155831725a7bdc7d0a96099.wav,4.38,neutral,,858305a5450b7bd1288ba0053b1cd1c1,где родился шерлок холмс,neutral,0ee35d2abecf4272ecc8e1539b0839d8
3,c6852b0925797612d7b6724da8cbe7b4,wavs/c6852b0925797612d7b6724da8cbe7b4.wav,8.58,neutral,,858305a5450b7bd1288ba0053b1cd1c1,открой в браузере ennio morricone,neutral,0855e363c1787df1592f58f7a27ebe13
4,0166f65a30354db8282682b1a280e64c,wavs/0166f65a30354db8282682b1a280e64c.wav,3.7,sad,,a5562e26cd8f1949488a2d1e1e549d97,каким стал сбер,neutral,d70dc98ed56e9362eaefefb7b2827c8f


In [12]:
# Inspect every row of one recording: the same hash_id can appear on several rows
# (the rows shown differ in annotator_id).
raw_crowd_train[raw_crowd_train.hash_id == '5d9560dd2cba88b2dc87b6b4d5b6a29d']

Unnamed: 0,hash_id,audio_path,duration,annotator_emo,golden_emo,annotator_id,speaker_text,speaker_emo,source_id
412403,5d9560dd2cba88b2dc87b6b4d5b6a29d,wavs/5d9560dd2cba88b2dc87b6b4d5b6a29d.wav,0.347875,other,,09184134bd1ddeb646205ba8e981fba8,фильмы меньшова,sad,24725b876b5e72993ec6c35688f754b8
412437,5d9560dd2cba88b2dc87b6b4d5b6a29d,wavs/5d9560dd2cba88b2dc87b6b4d5b6a29d.wav,0.347875,other,,076ffc89109d8d0cb8727de8f75b5c94,фильмы меньшова,sad,24725b876b5e72993ec6c35688f754b8
412457,5d9560dd2cba88b2dc87b6b4d5b6a29d,wavs/5d9560dd2cba88b2dc87b6b4d5b6a29d.wav,0.347875,other,,62942acb4975e3cac00d06726a0dfd83,фильмы меньшова,sad,24725b876b5e72993ec6c35688f754b8
414061,5d9560dd2cba88b2dc87b6b4d5b6a29d,wavs/5d9560dd2cba88b2dc87b6b4d5b6a29d.wav,0.347875,other,,a30fefe82e1e460f186efe6e9bbf9c58,фильмы меньшова,sad,24725b876b5e72993ec6c35688f754b8


In [13]:
# Sanity check: Series.unique() returns a NumPy array, not a Series or list.
type(raw_crowd_train.hash_id.unique())

numpy.ndarray

In [14]:
# Number of distinct recordings (unique hash_id values) in the train and test splits.
len(raw_crowd_train.hash_id.unique()), len(raw_crowd_test.hash_id.unique())

(184633, 17217)

In [114]:
# TF-IDF vectorizer that delegates tokenization to the project's normalizing tokenizer.
# NOTE(review): scikit-learn may warn that `token_pattern` is ignored when a custom
# `tokenizer` is supplied; passing token_pattern=None would silence it — confirm before changing.
tf_idf_vectorizer:TfidfVectorizer = TfidfVectorizer(tokenizer=text_to_normalized_tokens)
# Scratch example of fit/transform on toy data, kept for reference:
# tfs = tf_idf_vectorizer.fit_transform(['cat dog', 'dog bug'])

# s = 'cat, dog, bug, bug'
# response = tf_idf_vectorizer.transform([s])
# print(response)
# tf_idf_vectorizer
# Fit on the distinct, non-missing train transcriptions so each text contributes once.
tf_idf_vectorizer.fit(raw_crowd_train.speaker_text.dropna().unique())




### Analyse

In [23]:
def read_HLF_file(
    HLF_file_path:Path = PROCESSED_DUSHA_CROWD_TRAIN_HLF_LAST_VERSION_FILE_PATH,
    )->List[HashHLF]:
    """
    Read the high-level-feature records stored one per line in ``HLF_file_path``.

    Every non-blank line is evaluated as a Python expression. When that first
    evaluation yields ``None`` the line is skipped; otherwise its result is
    evaluated a second time to rebuild the record (presumably the file holds a
    quoted ``repr`` of a ``HashHLF`` per line — confirm against the writer).
    Records that evaluate to ``None`` on the second pass are skipped as well.

    Args:
        HLF_file_path: path of the text file to read.

    Returns:
        The reconstructed records, in file order.

    SECURITY: ``eval`` executes whatever code the file contains. Only use this
    on files produced by the project's own extraction step, never on untrusted
    input.
    """
    hash_HLF_list:List[HashHLF] = []
    with open(HLF_file_path) as f:
        for line in f:
            # A blank line (e.g. a trailing newline at EOF) is not a valid
            # expression and would make eval raise SyntaxError.
            if not line.strip():
                continue
            # First pass, done once per line (previously evaluated twice).
            serialized = eval(line)
            if serialized is None:
                continue
            # Second pass turns the decoded payload into the actual record.
            el:Optional[HashHLF] = eval(serialized)
            if el is not None:
                hash_HLF_list.append(el)

    return hash_HLF_list

In [24]:
def raw_crowd_2_raw_crowd_HLF_table_format(
    raw_crowd:pd.DataFrame,
    ):
    """
    Reduce the raw crowd table to one labelled row per recording.

    Steps:
      1. keep only the first row seen for every ``hash_id``;
      2. drop rows whose ``speaker_emo`` or ``speaker_text`` is missing;
      3. index the result by the ``HASH_ID_COLUMN_NAME`` column (the column
         itself is removed from the frame).

    Args:
        raw_crowd: raw crowd annotation table with ``hash_id``, ``speaker_emo``
            and ``speaker_text`` columns.

    Returns:
        A new DataFrame indexed by hash id; ``raw_crowd`` is not modified.
    """
    first_row_per_hash:pd.DataFrame = raw_crowd.loc[~raw_crowd.hash_id.duplicated()]
    has_emotion_and_text:pd.Series = (
        first_row_per_hash.speaker_emo.notna()
        & first_row_per_hash.speaker_text.notna()
    )
    labelled_rows:pd.DataFrame = first_row_per_hash.loc[has_emotion_and_text]
    return labelled_rows.set_index(HASH_ID_COLUMN_NAME, drop=True)

In [25]:
def HLF_withspeaker_emottions_table(
    raw_crowd:pd.DataFrame,
    HLF_file_path:Path,
    )->pd.DataFrame:
    """
    Combine stored high-level features with the speaker emotion labels.

    The feature records are read from ``HLF_file_path`` and turned into a table,
    then joined (on the index, ``how=JOIN_HOW_INNER``) with the ``speaker_emo``
    column of the de-duplicated, labelled crowd table.

    Args:
        raw_crowd: raw crowd annotation table of the split matching the file.
        HLF_file_path: file holding the extracted feature records.

    Returns:
        The feature table with an additional ``speaker_emo`` column.
    """
    feature_table:pd.DataFrame = hash_HLF_list_2_df(
        l=read_HLF_file(HLF_file_path=HLF_file_path),
    )
    speaker_emotions:pd.Series = raw_crowd_2_raw_crowd_HLF_table_format(
        raw_crowd=raw_crowd,
    ).speaker_emo
    return feature_table.join(speaker_emotions, how=JOIN_HOW_INNER)
    


In [26]:
# Feature table + speaker emotion label for the train split, built from the stored
# train HLF file; print its shape and preview the first rows.
HLF_with_speaker_emotions_train:pd.DataFrame = HLF_withspeaker_emottions_table(
    raw_crowd=raw_crowd_train,
    HLF_file_path=PROCESSED_DUSHA_CROWD_TRAIN_HLF_LAST_VERSION_FILE_PATH
)
print(HLF_with_speaker_emotions_train.shape)
display(HLF_with_speaker_emotions_train.head())

(182939, 8)


Unnamed: 0,loudness,HF_power_ratio,pronounce_speed_WPS,pronounce_speed_LPS,pronounce_speed_SPS,transcription_features_mean_words_length,transcription_features_profanity_words_ratio,speaker_emo
475e76f77ac1ed7cabafca740b15b32a,74.022022,0.02393,9.783938,9.783938,4.891969,3.0,0.0,angry
2f9438ef68395c70a8714dc373a49d11,59.970772,0.03361,3.017241,3.017241,1.077586,3.5,0.0,neutral
fb0ae78586a235018103acec22a80a8f,67.891044,0.008438,5.640212,5.640212,2.050986,5.5,0.0,neutral
196dcf9e1aaac46c2aee45e7f6adfb92,46.226898,0.003612,7.322176,7.322176,3.556485,7.0,0.0,neutral
41d7f48ca93b01e4a01a4f34b40a69ff,56.706196,0.052391,8.438819,8.438819,4.008439,5.714286,0.0,neutral


In [27]:
# Same table for the test split, built from the stored test HLF file.
HLF_with_speaker_emotions_test:pd.DataFrame = HLF_withspeaker_emottions_table(
    raw_crowd=raw_crowd_test,
    HLF_file_path=PROCESSED_DUSHA_CROWD_TEST_HLF_LAST_VERSION_FILE_PATH
)
print(HLF_with_speaker_emotions_test.shape)
HLF_with_speaker_emotions_test.head()

(17217, 8)


Unnamed: 0,loudness,HF_power_ratio,pronounce_speed_WPS,pronounce_speed_LPS,pronounce_speed_SPS,transcription_features_mean_words_length,transcription_features_profanity_words_ratio,speaker_emo
9e9961c53ca6eeb440b217e539fbf46c,51.384979,0.132317,1.202749,1.202749,0.687285,3.5,0.0,neutral
0166f65a30354db8282682b1a280e64c,39.728794,0.249508,3.513514,3.513514,1.081081,4.333333,0.0,neutral
d49a6b560155831725a7bdc7d0a96099,52.689034,0.029966,4.794521,4.794521,1.598174,5.25,0.0,neutral
c6852b0925797612d7b6724da8cbe7b4,63.896108,0.014975,3.379953,3.379953,0.699301,5.8,0.0,neutral
64a7aa17132c3e4b7be1aaed5fc88090,69.266373,0.091205,4.545455,4.545455,1.976285,3.285714,0.0,positive


In [74]:
# Baseline: predict the speaker emotion from the high-level features only.
# Features are every column except the label column; the label is the speaker emotion.
X_train = HLF_with_speaker_emotions_train.drop(columns=[SPEAKER_EMOTION_FIELD_NAME])
y_train = HLF_with_speaker_emotions_train[SPEAKER_EMOTION_FIELD_NAME]

X_test = HLF_with_speaker_emotions_test.drop(columns=[SPEAKER_EMOTION_FIELD_NAME])
y_test = HLF_with_speaker_emotions_test[SPEAKER_EMOTION_FIELD_NAME]

# Initialize CatBoostClassifier.
# NOTE: the name `model` is rebound to the RoBERTa encoder further down the notebook,
# so cells that need this classifier must run before that one.
model = CatBoostClassifier(
    iterations=100,       # Number of boosting iterations
    learning_rate=0.1,    # Learning rate
    depth=6,              # Depth of the trees
    verbose=0,            # Silent training: no per-iteration progress output
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# The label column is known to exist (y_test was read from it above), so the
# accuracy is computed unconditionally instead of behind an always-true check.
accuracy = accuracy_score(y_test, y_pred)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       angry       0.55      0.40      0.46      2853
     neutral       0.48      0.89      0.62      7462
    positive       0.33      0.00      0.00      2279
         sad       0.37      0.10      0.16      4623

    accuracy                           0.48     17217
   macro avg       0.43      0.35      0.31     17217
weighted avg       0.44      0.48      0.39     17217



In [92]:
# classification_report_formatted(y_true=y_test, y_pred=y_pred) 

Unnamed: 0,angry,neutral,positive,sad,macro avg,weighted avg
precision,0.55,0.48,0.33,0.37,0.43,0.44
recall,0.4,0.89,0.0,0.1,0.35,0.48
f1-score,0.46,0.62,0.0,0.16,0.31,0.39
support,2853.0,7462.0,2279.0,4623.0,17217.0,17217.0


In [47]:
# Feature importances of the fitted CatBoost classifier, rescaled to sum to 1 and
# sorted from most to least important.
# NOTE: `model` must still refer to the CatBoost classifier here; a later cell rebinds
# the name `model` to the RoBERTa encoder, which has no `feature_importances_`.
normalized_feature_importance:pd.Series = pd.Series(index=X_train.columns, data=model.feature_importances_ / (sum(model.feature_importances_))).sort_values(ascending=False)
normalized_feature_importance

transcription_features_profanity_words_ratio    0.248369
transcription_features_mean_words_length        0.241024
loudness                                        0.201275
HF_power_ratio                                  0.108504
pronounce_speed_SPS                             0.100302
pronounce_speed_LPS                             0.051853
pronounce_speed_WPS                             0.048673
dtype: float64

## Text embeddings

In [217]:
# from transformers import AutoTokenizer, AutoModel
# import torch

# # Initialize model and tokenizer
# ROSBERTA_EMBEDDER_MODEL_NAME:str = 'ai-forever/ru-en-RoSBERTa'
# tokenizer = AutoTokenizer.from_pretrained(ROSBERTA_EMBEDDER_MODEL_NAME)
# model = AutoModel.from_pretrained(ROSBERTA_EMBEDDER_MODEL_NAME)

# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output.last_hidden_state
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# # Russian text processing
# texts = ["Ваш текст на русском языке здесь"]
# encoded_input = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

# with torch.no_grad():
#     model_output = model(**encoded_input)

# # Choose pooling method
# embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
# # embeddings = model_output.last_hidden_state[:,0]  # CLS pooling alternative


In [251]:
def mean_pooling(
    model_output:transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions, 
    attention_mask:torch.Tensor,
    )->torch.Tensor:
    """
    Average the token embeddings of each sequence, counting only unmasked tokens.

    Args:
        model_output: encoder output; its ``last_hidden_state`` is pooled.
        attention_mask: mask with one entry per token; it is broadcast over the
            embedding dimension and used as the averaging weight.

    Returns:
        The mask-weighted sum over dimension 1 of ``last_hidden_state`` divided
        by the mask sum, where the divisor is clamped from below at
        ``CLAMP_MIN`` to avoid division by zero for fully masked sequences.
    """
    hidden_states = model_output.last_hidden_state
    # Give the mask a trailing axis and stretch it to the hidden-state shape.
    token_weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * token_weights).sum(dim=1)
    weight_total = token_weights.sum(dim=1).clamp(min=CLAMP_MIN)
    return weighted_sum / weight_total


In [242]:
# Load the RoSBERTa tokenizer and encoder from a local checkpoint directory and move
# the encoder to the configured device.
# NOTE(review): absolute, machine-specific path — consider moving it to configs.paths.
# NOTE: this rebinds `model`, which earlier in the notebook refers to the CatBoost classifier.
model_path:Path = Path('/data01/vvkiselev/data/other/dpl/models/ru-en-RoSBERTa')
tokenizer:transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast = AutoTokenizer.from_pretrained(model_path)
model:transformers.models.roberta.modeling_roberta.RobertaModel = AutoModel.from_pretrained(model_path).to(ROSBERTA_DEVICE)

Some weights of RobertaModel were not initialized from the model checkpoint at /data01/vvkiselev/data/other/dpl/models/ru-en-RoSBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Leftover inspection cell: evaluates to the imported helper so its repr is shown.
# It has no effect on the pipeline.
divide_into_chunks

In [244]:
def texts_2_embeddings(
    texts:List[str],
    tokenizer:transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
    model:transformers.models.roberta.modeling_roberta.RobertaModel,
    padding:bool=True,
    truncation:bool=False,
    return_tensors=TORCH_TENSORS_KEYWOED,
    device=ROSBERTA_DEVICE,
    attention_mask_keyword:str=ATTENTION_MASK_KEYWORD,
    normalize_p:int = ROSBERTA_NORMALIZE_P,
    normalize_dim:int = ROSBERTA_NORMALIZE_DIM,
    )->torch.Tensor:
    """
    Embed a batch of texts with the given tokenizer/encoder pair.

    The whole list is tokenized and run through the model as one batch, the
    token embeddings are mean-pooled with the attention mask, and the pooled
    vectors are normalized.

    Args:
        texts: texts to embed; all of them are processed in a single batch.
        tokenizer: tokenizer matching ``model``.
        model: encoder; expected to already live on ``device``.
        padding, truncation, return_tensors: forwarded to the tokenizer call.
        device: device the tokenized inputs are moved to.
        attention_mask_keyword: key of the attention mask in the tokenizer output.
        normalize_p, normalize_dim: ``p`` and ``dim`` passed to
            ``torch.nn.functional.normalize``.

    Returns:
        Tensor of normalized sentence embeddings, one row per input text.
    """
    encoded_batch:transformers.tokenization_utils_base.BatchEncoding = tokenizer(
        texts,
        padding=padding,
        truncation=truncation,
        return_tensors=return_tensors,
    )
    encoded_batch = encoded_batch.to(device)

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        encoder_output:transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions = model(**encoded_batch)

    pooled:torch.Tensor = mean_pooling(
        model_output=encoder_output,
        attention_mask=encoded_batch[attention_mask_keyword],
    )
    return torch.nn.functional.normalize(pooled, p=normalize_p, dim=normalize_dim)



In [269]:
# Smoke test: embed the distinct string transcriptions among the first 5000 train rows
# (non-string values such as NaN are filtered out) in one batch and check the shape.
embs:torch.Tensor = texts_2_embeddings(
    # texts=['Пример русского текста для анализа','я пошел гулять'],
    texts=list(filter(lambda x: isinstance(x, str), list(raw_crowd_train[SPEAKER_TEXT_FIELD_NAME].head(5000).unique()))),
    tokenizer=tokenizer,
    model=model,
)
embs.shape

torch.Size([2112, 1024])

In [256]:
# Peek at the raw transcriptions: missing ones show up as float NaN, which is why the
# embedding code keeps only `str` values.
raw_crowd_train.head()[SPEAKER_TEXT_FIELD_NAME].to_list()

['не надо не надо не надо не надо',
 'фозил кори mp три',
 nan,
 'сколько стоит на керамбит',
 'афина когда закончится эта телепередача']

In [296]:
def transcriptions_series_to_text_2_emb(
    transcriptions_series:pd.Series,
    tokenizer:transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
    model:transformers.models.roberta.modeling_roberta.RobertaModel,
    num_chunks:int,
    padding:bool=True,
    truncation:bool=False,
    return_tensors=TORCH_TENSORS_KEYWOED,
    device=ROSBERTA_DEVICE,
    attention_mask_keyword:str=ATTENTION_MASK_KEYWORD,
    normalize_p:int = ROSBERTA_NORMALIZE_P,
    normalize_dim:int = ROSBERTA_NORMALIZE_DIM,
    )->Dict[str, torch.Tensor]:
    """
    Embed every distinct transcription of a series and map text -> embedding.

    Only ``str`` values are embedded (missing transcriptions are skipped). The
    distinct texts are split with ``divide_into_chunks`` and each chunk is
    embedded as one batch by ``texts_2_embeddings``; the result of every batch
    is moved to the CPU before being stored.

    Args:
        transcriptions_series: series of transcriptions, possibly with
            duplicates and non-string (missing) values.
        tokenizer: tokenizer matching ``model``.
        model: encoder used to produce the embeddings.
        num_chunks: number of chunks passed to ``divide_into_chunks``.
        padding, truncation, return_tensors, device, attention_mask_keyword,
        normalize_p, normalize_dim: forwarded unchanged to ``texts_2_embeddings``.

    Returns:
        Dict mapping each distinct text to its CPU embedding tensor (one row of
        the batch output).
    """
    unique_texts:List[str] = [
        text for text in transcriptions_series.unique() if isinstance(text, str)
    ]
    print(f'len(unique_texts) = {len(unique_texts)}')

    chunks:List[List[str]] = divide_into_chunks(unique_texts, num_chunks)
    unique_text_2_embedding:Dict[str, torch.Tensor] = {}
    for chunk in tqdm(chunks):
        # When there are fewer texts than chunks some chunks may be empty
        # (depends on divide_into_chunks); an empty batch has nothing to embed
        # and must not be sent to the tokenizer.
        if len(chunk) == 0:
            continue
        chunk_embeddings:torch.Tensor = texts_2_embeddings(
            texts=chunk,
            tokenizer=tokenizer,
            model=model,
            padding=padding,
            truncation=truncation,
            return_tensors=return_tensors,
            device=device,
            attention_mask_keyword=attention_mask_keyword,
            normalize_p=normalize_p,
            normalize_dim=normalize_dim,
        ).cpu()
        # Row i of the batch output belongs to text i of the chunk.
        for text, embedding in zip(chunk, chunk_embeddings):
            unique_text_2_embedding[text] = embedding

    return unique_text_2_embedding
        

In [304]:
# Embed all distinct train transcriptions, processed in 100 batches.
text_2_emb_train:Dict[str, torch.Tensor] = transcriptions_series_to_text_2_emb(
    transcriptions_series=raw_crowd_train[SPEAKER_TEXT_FIELD_NAME],
    tokenizer=tokenizer,
    model=model,
    num_chunks=100,
)

len(unique_texts) = 124568


100%|██████████| 100/100 [02:08<00:00,  1.29s/it]


In [310]:
# Preview only: the Series holds the `repr` string of each tensor, not the tensor itself,
# so it is suitable for eyeballing values but not for computation.
text_2_emb_series_train:pd.Series = pd.Series(index=text_2_emb_train.keys(), data=map(repr, text_2_emb_train.values()))
text_2_emb_series_train.head()

не надо не надо не надо не надо                   tensor([ 0.0409,  0.0645, -0.0062,  ...,  0.01...
фозил кори mp три                                 tensor([-0.0066,  0.0331,  0.0023,  ...,  0.01...
сколько стоит на керамбит                         tensor([ 0.0246,  0.0035,  0.0029,  ..., -0.02...
афина когда закончится эта телепередача           tensor([ 0.0242,  0.0469,  0.0188,  ...,  0.01...
где проживают дети путина тихонова и воронцова    tensor([-0.0016, -0.0451, -0.0002,  ...,  0.05...
dtype: object

In [333]:
# Persist the train text -> embedding dict. The target directory must already exist.
torch.save(text_2_emb_train, PROCESSED_DUSHA_CROWD_TRAIN_TEXT_EMBEDDINGS_LAST_VERSION_FILE_PATH)

In [332]:
# Load the dictionary from the file
# loaded_dict = torch.load(PROCESSED_DUSHA_CROWD_TRAIN_TEXT_EMBEDDINGS_LAST_VERSION_FILE_PATH, weights_only=False)
# list(loaded_dict.items())[0][1].shape

FileNotFoundError: [Errno 2] No such file or directory: '/data01/vvkiselev/data/other/dpl/processed/dusha/crowd/train/text_embeddings/v1.pt'

In [297]:
# Embed all distinct test transcriptions, processed in 100 batches.
text_2_emb_test:Dict[str, torch.Tensor] = transcriptions_series_to_text_2_emb(
    transcriptions_series=raw_crowd_test[SPEAKER_TEXT_FIELD_NAME],
    tokenizer=tokenizer,
    model=model,
    num_chunks=100,
)

len(unique_texts) = 16628


100%|██████████| 100/100 [00:14<00:00,  6.82it/s]


In [334]:
# Persist the test text -> embedding dict. The target directory must already exist.
torch.save(text_2_emb_test, PROCESSED_DUSHA_CROWD_TEST_TEXT_EMBEDDINGS_LAST_VERSION_FILE_PATH)

In [335]:
# Round-trip check: reload the saved test embeddings and inspect the shape of one vector.
# SECURITY: weights_only=False lets torch.load unpickle arbitrary objects; acceptable here
# only because the file was written by this notebook — never use it on untrusted files.
loaded_dict = torch.load(PROCESSED_DUSHA_CROWD_TEST_TEXT_EMBEDDINGS_LAST_VERSION_FILE_PATH, weights_only=False)
list(loaded_dict.items())[0][1].shape

torch.Size([1024])

In [None]:
# PROCESSED_DUSHA_CROWD_TEST_TEXT_EMBEDDINGS_LAST_VERSION_FILE_PATH

# Analyse