v0.90

> Source:    
https://www.kaggle.com/datasets/nikitricky/the-ultimate-bulgarian-nlp-dataset?select=chitanka_info+chunks

In [1]:
import numpy as np
import pandas as pd
import sys
import warnings

In [2]:
# When False, the notebook downloads the helper scripts and mounts Google Drive
# in the following cells; both steps are skipped when True.
IS_GUEST = False
# When True, the split / clean / combine / tidy steps below are skipped and the
# previously saved combined TSV is loaded from disk instead.
LOAD_SAVED_DATA = True

In [3]:
if not IS_GUEST:
    !if [ ! -f "/content/helpers/split_text_to_sentences.py" ]; then wget -P helpers/ https://raw.githubusercontent.com/MirkaIvanova/Projects/refs/heads/main/the-grammar-whisperer/helpers/split_text_to_sentences.py; fi
    !if [ ! -f "/content/helpers/clean_sentences.py" ]; then wget -P helpers/ https://raw.githubusercontent.com/MirkaIvanova/Projects/refs/heads/main/the-grammar-whisperer/helpers/clean_sentences.py; fi

--2025-01-24 14:17:36--  https://raw.githubusercontent.com/MirkaIvanova/Projects/refs/heads/main/the-grammar-whisperer/helpers/split_text_to_sentences.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2949 (2.9K) [text/plain]
Saving to: ‘helpers/split_text_to_sentences.py’


2025-01-24 14:17:36 (41.4 MB/s) - ‘helpers/split_text_to_sentences.py’ saved [2949/2949]

--2025-01-24 14:17:36--  https://raw.githubusercontent.com/MirkaIvanova/Projects/refs/heads/main/the-grammar-whisperer/helpers/clean_sentences.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting re

In [4]:
if not IS_GUEST:
    # Colab-only: mount Google Drive and use the project folder stored there.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    root_dir = "/content/drive/MyDrive/softuni/the-grammar-whisperer"
else:
    # Without this branch root_dir is undefined in guest mode and the path
    # definitions below raise NameError.
    # NOTE(review): assumes guests run the notebook from the project root — confirm.
    root_dir = "."

# Project sub-directories derived from the chosen root.
data_raw_dir = f"{root_dir}/data/raw"
data_clean_dir = f"{root_dir}/data/clean"
helpers_dir = f"{root_dir}/helpers"

# Make the project root and the helper scripts importable. Each directory is
# checked on its own so one being present does not skip the other, and
# re-running the cell never adds duplicates.
for import_dir in (root_dir, helpers_dir):
    if import_dir not in sys.path:
        sys.path.append(import_dir)

Mounted at /content/drive


In [5]:
from split_text_to_sentences import split_and_save_sentences, save_sentences_to_csv
from clean_sentences import clean_all_sentences

In [6]:
# Ignore pandas' SettingWithCopyWarning from here on.
# NOTE(review): presumably raised inside the imported cleaning helpers — confirm
# the warning is benign there rather than hiding a real chained-assignment bug.
warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning) # 🧡

## Clean raw data containing Bulgarian sentences from fiction literature

###### Load all text files with Bulgarian fiction (raw) and split them into sentences

In [7]:
# The corpus is stored as eight numbered parts; every processing stage
# (raw text, split sentences, cleaned sentences) reuses the same numbering.
part_numbers = range(1, 9)

# Raw fiction text files.
input_raw_texts = [f"bg_fiction_{part}.txt" for part in part_numbers]

# One TSV of split sentences per raw file.
output_split_texts = [f"bg_fiction_split_{part}.tsv" for part in part_numbers]

# One TSV of cleaned sentences per split file.
output_clean_sentences = [f"bg_fiction_clean_{part}.tsv" for part in part_numbers]

In [8]:
if not LOAD_SAVED_DATA:
    # Not read anywhere in the visible part of the notebook.
    bg_fiction_all_sentences = None

    # Walk the raw files and their split-sentence outputs in lockstep.
    for raw_name, split_name in zip(input_raw_texts, output_split_texts):
        # The helper takes the raw text path and returns the sentences
        # (presumably a list of strings — see helpers/split_text_to_sentences.py).
        sentences = split_and_save_sentences(f"{data_raw_dir}/{raw_name}")
        # Persist this file's sentences under the clean-data directory.
        save_sentences_to_csv(sentences, f"{data_clean_dir}/{split_name}")

###### Load all files with split sentences and clean them

In [15]:
if not LOAD_SAVED_DATA:
    # Walk the split-sentence files and their cleaned outputs in lockstep.
    for split_name, clean_name in zip(output_split_texts, output_clean_sentences):
        split_path = f"{data_clean_dir}/{split_name}"
        clean_path = f"{data_clean_dir}/{clean_name}"

        # Read one tab-separated file of split sentences, run it through the
        # cleaning helper, and write the result as a tab-separated file
        # without the index column.
        df_clean = clean_all_sentences(pd.read_csv(split_path, sep="\t"))
        df_clean.to_csv(clean_path, sep="\t", index=False)

###### Load all files with clean sentences and combine them

In [16]:
if not LOAD_SAVED_DATA:
    # Read every cleaned TSV first and concatenate once. Calling pd.concat
    # inside the loop re-copies the accumulated frame on every iteration.
    clean_frames = [
        pd.read_csv(f"{data_clean_dir}/{clean_name}", sep="\t")
        for clean_name in output_clean_sentences
    ]

    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    df_all = pd.concat(clean_frames, ignore_index=True)

###### Do some tidying on the combined dataframe

In [43]:
# Single location of the combined dataset, used for both writing and reading.
output_file = f"{data_clean_dir}/bg_fiction_all.tsv"

# Sentences are kept only if their character count is within this inclusive range.
MIN_SENTENCE_LEN = 20
MAX_SENTENCE_LEN = 100

if not LOAD_SAVED_DATA:
    # Remove duplicate rows.
    df_all = df_all.drop_duplicates()

    # Keep rows whose "text" length is between the two bounds (inclusive).
    # The length is computed once; rows with a missing text are dropped
    # because their length is NaN.
    text_len = df_all["text"].str.len()
    df_all = df_all[text_len.between(MIN_SENTENCE_LEN, MAX_SENTENCE_LEN)]

    # Sort alphabetically by the "text" column.
    df_all = df_all.sort_values(by="text")

    # Remove rows where "text" starts with "А -".
    df_all = df_all[~df_all["text"].str.startswith("А -")]

    # Renumber only after the last filter so the index has no gaps.
    df_all = df_all.reset_index(drop=True)

    df_all.to_csv(output_file, sep="\t", index=False)

# Always load from disk so the frame is the same whether or not it was just rebuilt.
df_all = pd.read_csv(output_file, sep="\t")
df_all.shape
df_all.shape

(6690845, 1)

In [44]:
# Spot-check a run of consecutive sentences from deep inside the sorted frame.
sample_start = 6000000
sample_size = 20

text_column = df_all['text']
for offset in range(sample_size):
    print(text_column.iloc[sample_start + offset])

Той си мисли, че съм го предала и нищо не може да го убеди в противното.
Той си мисли, че съм гушнал онези двайсет милиона долара в злато и скъпоценности.
Той си мисли, че съм заспал.
Той си мисли, че съм мъртъв, и от моя гледна точка това е доста здравословно.
Той си мисли, че съм пияна глупачка - подсмръкна Доръти.
Той си мисли, че тя го е прелъстила първия път и затова е трябвало да се ожени за нея.
Той си мисли, че тя се занимава с продажби на козметика по домовете.
Той си мисли, че ще се видите в сладкарницата.
Той си мисли, че щом една жена е отхвърлила мъжа си, всяка може да го направи, и така ме попита.
Той си мисли, че я е откраднал твърде лесно и че може да я използва без последствия.
Той си мисли: Е, отървах я от още един проблем!
Той си мисли: Е, тоя проблем е решен!
Той си мисли: Защо сега не млъкне и не ме остави на мира?
Той си мръдна главата - и с това се издаде.
Той си мърмореше нещо под нос, докато подтичваше по коридорите.
Той си мърмореше нещо под нос.
Той си мърмор