#### Basic preprocessing

In [None]:
import json
from dotenv import dotenv_values

# Settings are read from the local .env file.
configs = dotenv_values(".env")

# NAME is the sender name that later cells compare against message["from"].
TARGET_NAME = configs["NAME"]

In [None]:
def _load_messages(path):
    """Return the 'messages' list stored in the JSON file at `path`."""
    with open(path, mode="r", encoding="utf-8") as source:
        return json.load(source)['messages']


autists = _load_messages('./raw/autists.json')
download = _load_messages("./raw/download.json")
gycha = _load_messages("./raw/gycha.json")

# Concatenation order: gycha, then download, then autists.
dataset = gycha + download + autists
len(dataset)

In [None]:
def filter(obj):
    """Normalise one raw message, or reject it.

    NOTE: the name shadows the built-in ``filter``; it is kept because other
    cells call this function by that name.

    Args:
        obj: message dict with a 'text' entry that is either a string or a
            list mixing plain strings and entity dicts (every entity dict is
            read through its 'type' and 'text' keys).

    Returns:
        None when the message has empty text, is forwarded (has a
        'forwarded_from' key), or has no text left once link entities are
        removed. Otherwise a message dict whose 'text' is a single string:
        ``obj`` itself when the text already was a string, else a shallow copy
        of ``obj`` with the remaining parts joined. ``obj`` is never modified.

    Raises:
        KeyError: if ``obj`` has no 'text' entry.
    """
    text = obj['text']

    # delete messages with empty text or forwarded messages
    if text == "" or 'forwarded_from' in obj:
        return None

    if isinstance(text, str):
        return obj

    parts = []
    for part in text:
        if isinstance(part, str):
            parts.append(part)
        elif part['type'] != 'link':
            # replace all entities with their texts
            parts.append(part['text'])
        # link entities are dropped entirely

    joined = ''.join(parts)
    # nothing but links (or empty fragments) was left: treat as empty text
    if not joined:
        return None

    # build a new dict instead of rewriting the caller's message in place
    return {**obj, 'text': joined}

In [None]:
# Apply the message filter and keep only the messages it did not reject.
dataset = [kept for kept in map(filter, dataset) if kept is not None]
len(dataset)

In [None]:
# Persist the basic-filtered messages to result.json.
with open('result.json', mode="w", encoding='utf8') as out_file:
    json.dump(dataset, fp=out_file, indent=1, ensure_ascii=False)

#### Loading basic preprocessing results

In [None]:
# Reload the basic-filtered messages stored in result.json.
with open('result.json', mode="r", encoding="utf-8") as in_file:
    dataset = json.load(fp=in_file)

#### Basic EDA

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def count_lengths(dataset, key, tokenize = lambda message, key: message[key].split(' ')):
    """Build a frequency table of message lengths.

    Args:
        dataset: iterable of messages (dicts).
        key: field of each message whose content is measured.
        tokenize: callable ``(message, key) -> sized object``; the length of
            its result is the message length. By default the field is split
            on single spaces.

    Returns:
        DataFrame with columns "length" (int32) and "count", one row per
        distinct length, sorted by "count" in descending order.
    """
    occurrences = {}
    for item in dataset:
        size = len(tokenize(item, key))
        occurrences[size] = occurrences.get(size, 0) + 1

    table = pd.DataFrame(
        {"length": list(occurrences.keys()), "count": list(occurrences.values())}
    )
    return table.astype({"length": "int32"}).sort_values(by=["count"], ascending=False)
# Length distribution of the message texts, using the default space-split tokenizer.
lengths = count_lengths(dataset, key='text')
lengths.head(15)

In [None]:
# Fraction of messages whose length is among the 50 most frequent lengths.
lengths["count"].iloc[:50].sum() / lengths["count"].sum()

In [None]:
def draw_lengths(lengths, top):
    """Show a bar chart of the counts for every length not exceeding `top`.

    Args:
        lengths: DataFrame with "length" and "count" columns.
        top: largest length (inclusive) to include in the chart.
    """
    shown = lengths[lengths["length"] <= top]
    positions, heights = shown["length"], shown["count"]

    plt.figure(figsize=(18, 2))
    plt.bar(positions, heights)
    # one vertical tick label per plotted length
    plt.xticks(positions, positions, rotation="vertical")
    plt.show()


draw_lengths(lengths, top=100)

In [None]:
# Number of filtered messages whose sender is the target user.
counter = sum(1 for message in dataset if message["from"] == TARGET_NAME)
print(
    f"Count of {TARGET_NAME} messages: {counter}/{len(dataset)}={counter/len(dataset)}"
)

In [None]:
from ydata_profiling import ProfileReport
import pandas as pd

# Profile only the sender and text columns of the filtered messages.
profile_frame = pd.DataFrame(dataset)[["from", "text"]]
ProfileReport(profile_frame).to_notebook_iframe()

In [None]:
from wordcloud import WordCloud
from tqdm import tqdm

# Concatenate all message texts into one string; every text is followed by a space.
text_string = ''.join(message["text"] + ' ' for message in tqdm(dataset))

print(len(text_string))

In [None]:
# Build a word cloud from the concatenated message texts.
cloud = WordCloud(
    max_font_size=200,
    max_words=300,
    background_color="white",
    width=1000,
    height=800,
    collocations=False,
)
wc = cloud.generate(text_string)

# Show the cloud as an image without axes.
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

#### Parallel texts corpus (for generation task)

In [None]:
# split the corpus into parallel texts
import copy
from itertools import dropwhile


def _build_pairs(messages, target_name):
    """Group an ordered message list into question/answer pairs.

    A run of messages from anyone other than `target_name` is a question and
    the run of `target_name` messages right after it is the answer. A pair is
    emitted when the next question starts, so a trailing, unfinished pair is
    ignored. Texts inside a run are joined with single spaces (no leading
    space).

    Args:
        messages: sequence of dicts with "from" and "text" entries.
        target_name: sender whose messages are treated as answers.

    Returns:
        List of {'q': question, 'a': answer} dicts.
    """
    built = []
    question_parts = []
    answer_parts = []
    last_from = ''
    for message in messages:
        sender = message['from']

        # if we've just built q&a pair: the target's reply run ended and a
        # new question is starting
        reply_finished = last_from == target_name and sender != target_name
        if question_parts and answer_parts and reply_finished:
            built.append({'q': ' '.join(question_parts), 'a': ' '.join(answer_parts)})
            question_parts = []
            answer_parts = []

        if sender == target_name:
            # target messages with no question before them answer nothing
            if question_parts:
                answer_parts.append(message['text'])
        else:
            question_parts.append(message['text'])

        last_from = sender
    # gonna ignore the last pair
    return built


# delete messages of target user if they are at the start of the dataset;
# the deep copy keeps this working list independent from `dataset`
datset_for_parallel = copy.deepcopy(
    list(dropwhile(lambda message: message["from"] == TARGET_NAME, dataset))
)

pairs = _build_pairs(datset_for_parallel, TARGET_NAME)

len(pairs)

In [None]:
from transformers import T5Tokenizer
# Length distribution of the questions, measured in 'input_ids' produced by the T5 tokenizer.
tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")

lengths = count_lengths(
    pairs,
    "q",
    tokenize=lambda message, key: tokenizer(message[key])['input_ids'],
)
lengths.head(20)

In [None]:
# Fraction of questions whose token count is among the 64 most frequent counts.
lengths["count"].iloc[:64].sum() / lengths["count"].sum()

In [None]:
# Persist the question/answer pairs to parallel.json.
with open("parallel.json", mode="w", encoding="utf8") as out_file:
    json.dump(pairs, fp=out_file, indent=1, ensure_ascii=False)

#### Binary marked corpus (for classification task)

In [None]:
# keep only messages that split into at least `minimum_length` space-separated words
minimum_length = 3

long_messages = [
    message
    for message in dataset
    if len(message["text"].split(" ")) >= minimum_length
]

len(long_messages)

In [None]:
def mark_author(obj):
    """Convert a message into a classification sample.

    Returns a dict with the message text and a binary label: 1 when the
    message's "from" equals TARGET_NAME, otherwise 0.
    """
    is_target = obj["from"] == TARGET_NAME
    return {"text": obj["text"], "label": int(is_target)}


# Label every long message with whether the target user wrote it.
author = [mark_author(message) for message in long_messages]

len(author)

In [None]:
def check_counts(dataset):
    """Print the positive-label total of `dataset`, its size and their ratio.

    The "label" of every sample is summed, so with 0/1 labels the total is
    the number of TARGET_NAME samples.
    """
    counter = sum(message["label"] for message in dataset)
    print(f"Count of {TARGET_NAME} messages: {counter}/{len(dataset)}={counter/len(dataset)}")


check_counts(author)

The dataset is small and has a class imbalance of 2:1, so we apply oversampling.

In [None]:
from imblearn.over_sampling import RandomOverSampler

rus = RandomOverSampler(random_state=42)

# Each text is wrapped in a one-element row before resampling.
texts = [[message["text"]] for message in author]
labels = [message["label"] for message in author]
X_res, y_res = rus.fit_resample(texts, labels)

# Unwrap the resampled rows back into {"text", "label"} samples.
balanced_author = [{"text": row[0], "label": label} for row, label in zip(X_res, y_res)]

check_counts(balanced_author)

In [None]:
# From here on `author` refers to the oversampled (balanced) samples.
author = balanced_author

In [None]:
from transformers import BertTokenizer
# Length distribution of the sample texts, measured in 'input_ids' produced by the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("sergeyzh/rubert-tiny-turbo")

lengths = count_lengths(
    author,
    "text",
    tokenize=lambda message, key: tokenizer(message[key])['input_ids'],
)
lengths.head(20)

In [None]:
# Fraction of samples whose token count is among the 50 most frequent counts.
lengths["count"].iloc[:50].sum() / lengths["count"].sum()

In [None]:
# Persist the labelled samples to author.json.
with open('author.json', mode="w", encoding='utf8') as out_file:
    json.dump(author, fp=out_file, indent=1, ensure_ascii=False)