In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import (
    DebertaV2Tokenizer,
    DebertaV2ForSequenceClassification,
    Trainer,
    TrainingArguments,
    get_polynomial_decay_schedule_with_warmup,
)
from tqdm import tqdm


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [None]:
class CFG:
    """Notebook-wide switches and limits read by the text-splitting cells."""

    # Base token budget; the splitting functions derive their default
    # ``max_length`` from this value.
    max_length = 512
    # Gate for the cell that splits long texts at "." tokens.
    split_text_improved = True
    # Gate for the cell that splits long texts into fixed-size token windows.
    split_text = True

In [None]:
# Shared tokenizer instance; the splitting cells use it to encode and decode text.
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

In [None]:
# Load the dataset from its parquet file and report its (rows, columns) shape.
data = pd.read_parquet("./large_dataset/data.parquet")
print(data.shape)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Keep only the first occurrence of each distinct "text" value. The result is
# rebound to `data` rather than mutating the frame with inplace=True, so the
# cell reads as a plain input -> output step.
data = data.drop_duplicates(subset=["text"])
print(data.shape)

In [None]:
# Restrict the frame to the text, source and prompt_id columns; the splitting
# cells copy every remaining field of a row into each chunk they emit.
data = data[["text", "source", "prompt_id"]]
data.head()

In [None]:
if CFG.split_text_improved:

    # Memo of token id -> "does this token decode to exactly '.'", shared by
    # all rows so each distinct token id is decoded at most once.
    _period_token_cache = {}

    def _is_period_token(token):
        """Return True when the single token decodes to the string "."."""
        if token not in _period_token_cache:
            _period_token_cache[token] = tokenizer.decode([token]) == "."
        return _period_token_cache[token]

    def split_text_improved(row, max_length=CFG.max_length - 10):
        """Split one row's text into chunks of at most ``max_length`` tokens.

        A chunk is closed right after the last "." token it contains once the
        token budget is reached, so chunks end on a sentence boundary whenever
        one is available. If the current chunk holds no "." at all, it is cut
        hard at ``max_length`` tokens so that no chunk exceeds the budget.

        Args:
            row: A DataFrame row (Series) that has a "text" entry.
            max_length: Maximum number of tokens per chunk, counted without
                special tokens.

        Returns:
            A list of dicts. When the text already fits, the list holds the
            unchanged row. Otherwise it holds one copy of the row per chunk,
            with "text" replaced by the decoded chunk.
        """
        tokens = tokenizer.encode(row["text"], add_special_tokens=False)
        if len(tokens) <= max_length:
            return [row.to_dict()]

        new_rows = []

        def emit(chunk):
            # Decode the token slice back to text and store it on a row copy.
            new_row = row.to_dict()
            new_row["text"] = tokenizer.decode(
                chunk,
                clean_up_tokenization_spaces=True
            )
            new_rows.append(new_row)

        start = 0      # index of the first token of the chunk being built
        boundary = 0   # index just past the last "." seen; <= start means none

        for index, token in enumerate(tokens):
            if _is_period_token(token):
                boundary = index + 1

            if index + 1 - start >= max_length:
                # Budget reached: prefer the last sentence end inside the
                # chunk, otherwise fall back to a hard cut at this token.
                end = boundary if boundary > start else index + 1
                emit(tokens[start:end])
                # Tokens after `end` (if any) carry over into the next chunk;
                # they contain no "." because `boundary` was the last one.
                start = end

        if start < len(tokens):
            emit(tokens[start:])

        return new_rows

    new_rows = data.apply(split_text_improved, axis=1).tolist()
    flattened_rows = [item for sublist in new_rows for item in sublist]
    new_df = pd.DataFrame(flattened_rows)
    new_df = new_df.reset_index(drop=True)

    print(new_df.shape)
    print(new_df.head())
    new_df.to_csv("./large_dataset/split_text_improved.csv", index=False)

In [None]:
if CFG.split_text:
    def split_text(row, max_length=CFG.max_length + 10):
        """Cut one row's text into consecutive windows of ``max_length`` tokens.

        Rows whose token count is within ``max_length`` come back unchanged as
        a single-element list. Longer rows yield one copy of the row per
        window, with "text" replaced by that window's decoded tokens; the
        final window may be shorter than the others.
        """
        # NOTE(review): the default is CFG.max_length + 10, i.e. larger than
        # CFG.max_length itself - confirm this is intended.
        token_ids = tokenizer.encode(row["text"], add_special_tokens=False)
        total = len(token_ids)
        if total <= max_length:
            return [row.to_dict()]

        pieces = []
        for offset in range(0, total, max_length):
            piece = row.to_dict()
            piece["text"] = tokenizer.decode(
                token_ids[offset:offset + max_length],
                clean_up_tokenization_spaces=True,
            )
            pieces.append(piece)
        return pieces

    # One list of chunk dicts per input row, then flattened to one dict per chunk.
    new_rows = data.apply(split_text, axis=1).tolist()
    flattened_rows = [
        chunk_row for row_chunks in new_rows for chunk_row in row_chunks
    ]
    new_df = pd.DataFrame(flattened_rows).reset_index(drop=True)

    print(new_df.shape)
    print(new_df.head())
    new_df.to_csv("./large_dataset/split_text_simple.csv", index=False)