In [40]:
import os
from pathlib import Path

import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split

In [41]:
# Input TSV and output directory; both live under the 'dataset' folder that
# sits next to the current working directory.
PATH_TO_DATA = Path.cwd().parent / 'dataset' / 'sarc_09-12.csv'
SAVE_PATH = PATH_TO_DATA.parent / 'prepared'


def load_csv_data(path_to_data: Path) -> pl.LazyFrame:
    """Lazily scan the tab-separated, header-less SARC dump and keep usable rows.

    Args:
        path_to_data: Location of the TSV file to scan.

    Returns:
        A LazyFrame with the columns ``comment``, ``label`` and
        ``parent_comment`` (in that order), restricted to rows whose
        ``comment`` is non-null and not blank after stripping.
    """
    # Single source of truth for column order, names and dtypes, so the schema
    # and the column-name callback below cannot drift apart.
    # The vote columns are Int64: 8-bit integers cannot hold values outside
    # [-128, 127] / [0, 255], so parsing them would fail or overflow as soon as
    # these columns are actually read (they are currently projected away).
    column_dtypes = {
        'label': pl.UInt8,
        'comment': pl.Utf8,
        'user': pl.Utf8,
        'subreddit': pl.Utf8,
        'score': pl.Int64,
        'up': pl.Int64,
        'down': pl.Int64,
        'date': pl.Utf8,
        'timestamp': pl.Int64,
        'parent_comment': pl.Utf8,
        'embed_1': pl.Utf8,
        'embed_2': pl.Utf8,
    }
    sarc_schema = pl.Schema(column_dtypes)
    column_names = list(column_dtypes)

    # NOTE(review): scan_csv keeps its default quote character here; confirm
    # that the dump really uses '"' quoting, otherwise comments that contain a
    # quote could be parsed incorrectly.
    loaded_lf = pl.scan_csv(
        path_to_data,
        separator='\t',
        has_header=False,
        with_column_names=lambda _cols: column_names,
        schema=sarc_schema,
        low_memory=True,
    )

    # Keep only the columns used downstream and drop rows with an empty comment.
    loaded_lf = loaded_lf.select(['comment', 'label', 'parent_comment']).filter(
        (pl.col("comment").is_not_null()) & (pl.col("comment").str.strip_chars().str.len_chars() > 0)
    )

    return loaded_lf


def create_balanced_dataset(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Downsample every class to the size of the smallest class.

    For each distinct ``label`` the first ``n`` comments (in input order) are
    kept, where ``n`` is the row count of the least frequent label. This is a
    truncation, not a random sample.

    Note that the smallest class size is computed eagerly, so the input is
    collected once inside this function.

    Args:
        lf: Frame with at least the columns ``label`` and ``comment``.

    Returns:
        A LazyFrame with the columns ``label`` and ``comment``, containing the
        same number of rows for every label, ordered by ascending label.
    """
    min_class_count = (
        lf.select('label')
        .group_by('label')
        .len()
        .select(pl.col('len').min())
        .collect()
        .item()
    )

    lf_after_balance = (
        lf.select(['label', 'comment'])
        .group_by('label')
        .agg(pl.all().head(min_class_count))
        # group_by does not guarantee the order of the groups. Sorting the
        # aggregated frame (one row per label) is cheap and makes the row order
        # of the result, and of any seeded split built on top of it,
        # reproducible between runs.
        .sort('label')
        .explode('comment')
    )
    return lf_after_balance


def save_train_val_test_data(
        full_processed_path: Path,
        prefix: str | None = None,
        save_dir: Path | None = None,
        test_size: float = 0.1,
        val_size: float = 0.2,
        random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split a prepared CSV into stratified train/val/test parts and save them.

    The test part is split off first; the validation part is then taken from
    the remaining rows. Both splits are stratified on the ``label`` column.
    The parts are written as ``prepared_{stage}.csv`` or, when ``prefix`` is
    given, ``prepared_{prefix}_{stage}.csv``.

    Args:
        full_processed_path: CSV file with at least a ``label`` column.
        prefix: Optional tag inserted into the output file names.
        save_dir: Directory for the output files; defaults to ``SAVE_PATH``.
        test_size: Fraction of all rows that goes to the test part.
        val_size: Fraction of the rows left after the test split that goes to
            the validation part.
        random_state: Seed passed to both splits.

    Returns:
        The ``(train, val, test)`` DataFrames.
    """
    # keep_default_na=False: with the default NA handling pandas converts
    # comments whose whole text is e.g. "NA", "null", "None" or "nan" into NaN,
    # and they would then be written back as empty strings.
    pandas_df = pd.read_csv(full_processed_path, keep_default_na=False)

    train_val_data, test_data = train_test_split(
        pandas_df, test_size=test_size, random_state=random_state, stratify=pandas_df['label'])
    train_data, val_data = train_test_split(
        train_val_data, test_size=val_size, random_state=random_state, stratify=train_val_data['label'])

    output_dir = SAVE_PATH if save_dir is None else save_dir
    # to_csv does not create missing directories.
    output_dir.mkdir(parents=True, exist_ok=True)

    for dataset, stage in zip([train_data, val_data, test_data], ['train', 'val', 'test']):
        if prefix:
            file_name = f'prepared_{prefix}_{stage}.csv'
        else:
            file_name = f'prepared_{stage}.csv'
        dataset.to_csv(output_dir / file_name, index=False)

    return train_data, val_data, test_data

# Подготовка выборки без объединения родительских комментариев

In [42]:
# Load, balance and persist the dataset variant that contains the comment text only.
scanned_lf_without_parent = load_csv_data(PATH_TO_DATA)
# Named "without_parent" to match the data it holds and to avoid clashing with
# the variable of the with-parent variant built further down.
balanced_lf_without_parent = create_balanced_dataset(scanned_lf_without_parent)

final_lf_without_parent = balanced_lf_without_parent.select(['comment', 'label'])

# Make sure the output directory exists before writing into it.
SAVE_PATH.mkdir(parents=True, exist_ok=True)
final_lf_without_parent.collect(streaming=True).write_csv(SAVE_PATH / 'prepared_full_without_parent.csv')


## Разбиваем на train / test / val

In [43]:
# Split the comment-only dataset; the three parts are also written to disk by the helper.
(
    train_df_without_parent,
    val_df_without_parent,
    test_df_without_parent,
) = save_train_val_test_data(SAVE_PATH / 'prepared_full_without_parent.csv')

# Show the training part followed by its summary statistics.
display(train_df_without_parent, train_df_without_parent.describe())

Unnamed: 0,comment,label
100741,Oh don't worry. It's just gruesome and graphi...,1
21082,looks like every one of your comments is about...,0
67128,Wait I is confused Al Goher tells me planet ge...,1
86013,She sounds like a ton of fun.,1
94247,But we don't pretend we care about anything ot...,1
...,...,...
101143,"Oh, and don't forget Honeydew! He's like, the ...",1
11788,Space pirates got it.,0
86093,That's some messy sentience structure there.,1
6559,Obviously you don't know how to spell either.,0


Unnamed: 0,label
count,90470.0
mean,0.5
std,0.500003
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


# Подготовка выборки с объединением родительских комментариев

In [44]:
# A parent comment counts as present when it is non-null and not blank after stripping.
has_parent = (pl.col("parent_comment").is_not_null()) & (
        pl.col("parent_comment").str.strip_chars().str.len_chars() > 0)

# "[PARENT] <parent text>" when a parent is present, otherwise an empty string
# (such rows therefore start directly with the separator text).
parent_prefix = (
    pl.when(has_parent)
    .then(pl.concat_str([pl.lit("[PARENT] "), pl.col("parent_comment")], separator=""))
    .otherwise(pl.lit(""))
)

# Merge the parent comment into the comment text and keep only the two columns needed downstream.
scanned_lf_with_parent = load_csv_data(PATH_TO_DATA).select(
    pl.concat_str([
        parent_prefix,
        pl.lit(" [SEP] [COMMENT] "),
        pl.col("comment"),
    ]).alias("comment"),
    pl.col("label"),
)

# Balance the classes.
balanced_lf_with_parent = create_balanced_dataset(scanned_lf_with_parent)

final_lf_with_parent = balanced_lf_with_parent.select(['comment', 'label'])
final_lf_with_parent.collect(streaming=True).write_csv(SAVE_PATH / 'prepared_full_with_parent.csv')

## Разбиваем на train / test / val

In [45]:
# Split the with-parent dataset; output files carry the 'with_parent' tag in their names.
(
    train_df_with_parent,
    val_df_with_parent,
    test_df_with_parent,
) = save_train_val_test_data(SAVE_PATH / 'prepared_full_with_parent.csv', prefix='with_parent')

# Show the training part followed by its summary statistics.
display(train_df_with_parent, train_df_with_parent.describe())

Unnamed: 0,comment,label
100741,[PARENT] So about R rated movies...(kind of a ...,1
21082,[PARENT] Ouch. Please don't beat the shit out ...,0
67128,[PARENT] Anyone know if predictions have been ...,1
86013,"[PARENT] Her words, ""I fucking hate the commer...",1
94247,[PARENT] Like guys don't laugh at incredibly s...,1
...,...,...
101143,[PARENT] Don't worry that one Link skin is sti...,1
11788,[PARENT] Where's the stereo? [SEP] [COMMENT] S...,0
86093,[PARENT] What I have learned in my first two m...,1
6559,[PARENT] How is it miss information. Obama sp...,0


Unnamed: 0,label
count,90470.0
mean,0.5
std,0.500003
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0
