In [None]:
from asynchat import simple_producer

import pandas as pd
from datasets import load_dataset
from datasets import Dataset, DatasetDict

import tqdm as notebook_tqdm

import nltk
import re

import spacy
from pygments.lexer import words
from spacy.lang.ru.examples import sentences

In [None]:
# загрузим модель spacy для русского языка
spacy_model = spacy.load('ru_core_news_lg')
# seara/ru_go_emotions
goemotions = load_dataset("seara/ru_go_emotions", "raw")
# djacon/ru-izard-emotions
izard = load_dataset("djacon/ru-izard-emotions")

labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
emo_labels = ["text"] + labels

In [None]:
crypto_df_train = pd.read_csv('content/train.csv', encoding='utf-8')
crypto_df_val = pd.read_csv('content/valid.csv', encoding='utf-8')
goemotions_df= goemotions["train"].to_pandas().rename(columns={"ru_text": "text", "text": "en_text"})[emo_labels]
izard_df_train = izard["train"].to_pandas()[emo_labels]
izard_df_test = izard["test"].to_pandas()[emo_labels]
izard_df_val = izard["validation"].to_pandas()[emo_labels]

In [None]:
goemotions_df['emotion_combination'] = goemotions_df[labels].astype(str).agg(','.join, axis=1)
goemotions_df['emotion_combination'] = goemotions_df['emotion_combination'].apply(lambda x: tuple(map(int, x.split(','))))
goemotions_df['emotion_combination'].apply(lambda x: sum(x) == 0).value_counts()

In [None]:
crypto_df_train['source'] = 'crypto'
crypto_df_train['type'] = 'train'

crypto_df_val['source'] = 'crypto'
crypto_df_val['type'] = 'validation'

goemotions_df['source'] = 'goemotions'
goemotions_df['type'] = 'train'

izard_df_train['source'] = 'izard'
izard_df_train['type'] = 'train'

izard_df_test['source'] = 'izard'
izard_df_test['type'] = 'test'

izard_df_val['source'] = 'izard'
izard_df_val['type'] = 'validation'

In [None]:
data = pd.concat([crypto_df_train, crypto_df_val, goemotions_df, izard_df_train, izard_df_test, izard_df_val])
data['emotion_combination'] = data[labels].astype(str).agg(','.join, axis=1)
data['emotion_combination'] = data['emotion_combination'].apply(lambda x: tuple(map(int, x.split(','))))

In [None]:
data.head()

In [None]:
# Удалим записи, в которых поле 'emotion_combination' содержит только нули
data = data.drop(data[data['emotion_combination'] == (0, 0, 0, 0, 0, 0, 0)].index)
data['emotion_combination'].apply(lambda x: sum(x) == 0).value_counts()

In [None]:
data.shape

In [None]:
# Выведем количество записей для каждой группы, где в группе представлена только одна эмоция (сума эмоций равна 1)
data['emotion_combination'].apply(lambda x: sum(x) == 1).value_counts()

In [None]:
# Выведем количество записей для каждой группы, где в группе несколько эмоций (сума эмоций больше 1)
data['emotion_combination'].apply(lambda x: sum(x) > 1).value_counts()

In [None]:
# Выведем символы, которые встречаются в текстах
chars = set()
for text in data['text']:
    for char in text:
        chars.add(char)
print(chars)

In [None]:
def cleaner(example):
    example = re.sub(r"\[.*?\]", "", example) # Удалим все, что находится в квадратных скобках
    example = example.lower()
    example = re.sub(r"[^a-zа-я\d]", " ", example)
    example = re.sub(r"\s+", " ", example)
    example = example.strip()
    return example

In [None]:
data['tokens'] = data['text'].apply(lambda x: cleaner(x))

In [None]:
# Выведем символы, которые встречаются в текстах
chars = set()
for text in data['tokens']:
    for char in text:
        chars.add(char)
print(chars)

In [None]:
def cleaner(example, deep_clean=False):
    lemmas = spacy_model(example)
    for token in lemmas:
        if deep_clean:
            if not (token.is_stop or token.is_punct or token.is_space or token.lang_ != 'ru' or token.is_digit or token.like_num or token.like_url or token.like_email):
                yield token.lemma_
        else:
            if not (token.is_punct or token.is_space or token.lang_ != 'ru'):
                yield token.lemma_

In [None]:
# Удалим стоп-слова
data['deep_tokens'] = data['tokens'].apply(lambda x: list(cleaner(x, deep_clean=True)))
data['tokens'] = data['tokens'].apply(lambda x: list(cleaner(x)))

In [None]:
def morph(text):
    lemmas = spacy_model(text)
    for token in lemmas:
        if not (token.is_stop or token.is_punct or token.is_space or token.lang_ != 'ru' or token.is_digit):
            yield f'lemma::{token.lemma_}<pos:{token.pos_}><dep:{token.dep_}><head lemma:{token.head.lemma_}>'


In [None]:
data['morph_tokens'] = data['text'].apply(lambda x: list(morph(x)))

In [None]:
data.iloc[0]

In [None]:
data.to_csv('content/newdata.csv', index=False)

In [None]:
# Проверим, есть ли пустые строки в поле 'tokens'
data['deep_tokens'].apply(lambda x: len(x) == 0).value_counts()

In [None]:
# Удалим записи, в которых поле 'tokens' пустое
data = data.drop(data[data['deep_tokens'].apply(lambda x: len(x) == 0)].index)

In [None]:
# Проверим, есть ли пустые строки в поле 'tokens'
data['deep_tokens'].apply(lambda x: len(x) == 0).value_counts()

In [None]:
data.head()

In [None]:
import gensim.downloader as api
from gensim.utils import simple_preprocess

In [None]:
info_datasets = api.info()
#print(api.info('word2vec-ruscorpora-300'))
w2v = api.load('word2vec-ruscorpora-300')

In [None]:
data['tokens']

In [10]:
import pandas as pd

In [11]:
df = pd.read_csv('content/newdata.csv')

In [13]:
len(df['emotion_combination'].unique())

103

In [16]:
# Выведем классы с минимальным количеством записей
df['emotion_combination'].value_counts()#.sort_values(ascending=True)

emotion_combination
(0, 0, 0, 0, 0, 0, 1)    62399
(0, 0, 0, 1, 0, 0, 0)    16162
(1, 0, 0, 0, 0, 0, 0)    10268
(0, 0, 0, 0, 1, 0, 0)     7933
(0, 0, 0, 0, 0, 1, 0)     7275
                         ...  
(0, 0, 1, 1, 1, 0, 1)        1
(1, 0, 0, 1, 1, 1, 1)        1
(1, 0, 1, 0, 1, 1, 1)        1
(0, 1, 1, 0, 0, 1, 1)        1
(1, 1, 0, 1, 1, 0, 1)        1
Name: count, Length: 103, dtype: int64

In [17]:
# Дропнем записи, в которых количество записей меньше 10
df = df.groupby('emotion_combination').filter(lambda x: len(x) > 10)

In [18]:
df['emotion_combination'].value_counts()

emotion_combination
(0, 0, 0, 0, 0, 0, 1)    62399
(0, 0, 0, 1, 0, 0, 0)    16162
(1, 0, 0, 0, 0, 0, 0)    10268
(0, 0, 0, 0, 1, 0, 0)     7933
(0, 0, 0, 0, 0, 1, 0)     7275
(0, 1, 0, 0, 0, 0, 0)     4970
(0, 0, 1, 0, 0, 0, 0)     3576
(0, 0, 0, 1, 0, 0, 1)      716
(1, 1, 0, 0, 0, 0, 0)      708
(1, 0, 0, 0, 1, 0, 0)      666
(0, 0, 0, 1, 0, 1, 0)      651
(1, 0, 0, 0, 0, 0, 1)      617
(0, 0, 0, 1, 1, 0, 0)      434
(1, 0, 0, 1, 0, 0, 0)      401
(0, 0, 0, 0, 1, 0, 1)      318
(0, 0, 0, 0, 0, 1, 1)      260
(0, 0, 1, 0, 1, 0, 0)      228
(1, 0, 0, 0, 0, 1, 0)      216
(1, 1, 0, 0, 1, 0, 0)      204
(0, 1, 0, 0, 1, 0, 0)      186
(0, 0, 0, 0, 1, 1, 0)      185
(1, 0, 0, 0, 1, 0, 1)      136
(0, 1, 1, 0, 0, 0, 0)      105
(0, 0, 1, 0, 0, 1, 0)       93
(1, 1, 0, 0, 0, 0, 1)       91
(1, 0, 1, 0, 0, 0, 0)       85
(0, 1, 0, 0, 0, 0, 1)       82
(1, 0, 0, 1, 1, 0, 0)       82
(0, 0, 1, 1, 0, 0, 0)       73
(1, 0, 0, 1, 0, 0, 1)       65
(0, 0, 1, 0, 0, 0, 1)       64
(0, 1, 0, 0, 0, 1, 

In [19]:
df.to_csv('content/clipped_data.csv', index=False)

In [20]:
train_df = df[df['type'] == 'train']
val_df = df[df['type'] == 'validation']
test_df = df[df['type'] == 'test']

In [22]:
train_df.to_csv('cl_train.csv', index=False)
val_df.to_csv('cl_val.csv', index=False)
test_df.to_csv('cl_test.csv', index=False)