v0.90

> TODO:
- bg_pos_raw_part1: load from folder, not from github
- run locally
- extract functions to .py files
- try to load with polars
- add descriptions and fix headers
- describe where this dataset came from
- run with IS_GUEST=True and LOAD_SAVED_DATA=True

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import warnings

import re
import time

from copy import copy, deepcopy

In [2]:
# IS_GUEST: when False, the setup cell mounts Google Drive and works inside the project folder there.
IS_GUEST = False
# LOAD_SAVED_DATA: when True, later cells read previously saved CSV files instead of recomputing and re-saving them.
LOAD_SAVED_DATA = True

In [3]:
# The raw CSV parts are hosted publicly on GitHub, so they must be defined
# regardless of IS_GUEST (previously they only existed in the non-guest branch,
# which made the load cell fail with NameError for guests).
bg_pos_raw_part1 = 'https://raw.githubusercontent.com/MirkaIvanova/datasets/refs/heads/main/bg/bg_pos_raw_part1.csv'
bg_pos_raw_part2 = 'https://raw.githubusercontent.com/MirkaIvanova/datasets/refs/heads/main/bg/bg_pos_raw_part2.csv'

if not IS_GUEST:
    # Project owner: mount Google Drive and work inside the project folder.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    root_dir = "/content/drive/MyDrive/softuni/the-grammar-whisperer"
else:
    # Guest: no Drive access, so resolve the data folders relative to the
    # current working directory.
    # NOTE(review): "." is a fallback chosen so the notebook does not crash;
    # point it at wherever the guest copy of the data/ folder actually lives.
    root_dir = "."

# Derived data locations (raw input, cleaned data, processed output).
data_raw_dir = f"{root_dir}/data/raw"
data_clean_dir = f"{root_dir}/data/clean"
data_processed_dir = f"{root_dir}/data/processed"

bg_pos_raw_wiki1000_csv = f"{data_raw_dir}/wiki1000_plus_words.tsv"

Mounted at /content/drive


In [4]:
# Silence pandas' SettingWithCopyWarning for the rest of the notebook.
warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning) # 🧡

> Files in the Kaggle dataset: `Bulgarian Part Of Speech Dataset_alt` and `Bulgarian Part Of Speech Dataset_raw.csv`.
This notebook uses `Bulgarian Part Of Speech Dataset_raw.csv`. I split it into two parts, each smaller than 100 MB, and renamed them to `bg_pos_raw_part1.csv` and `bg_pos_raw_part2.csv`.
Source: https://www.kaggle.com/datasets/auhide/bulgarian-part-of-speech-dataset


## Load raw data containing Bulgarian words and their part of speech

##### Load 2 CSVs and combine them



In [5]:
# Read both tab-separated parts and report the wall-clock time the load took.
start_time = time.time()
df1, df2 = (
    pd.read_csv(csv_part, sep='\t')
    for csv_part in (bg_pos_raw_part1, bg_pos_raw_part2)
)
print(f"Execution time: {time.time() - start_time} seconds")

Execution time: 6.033625364303589 seconds


> **Combine the two dataframes** vertically. The original CSV file was split
into parts smaller than 100 MB to fit GitHub's file-size limit.

In [6]:
# Show (rows, columns) of each part; the column counts must match before stacking.
df1.shape, df2.shape

((734614, 4), (613488, 4))

In [7]:
# Stack the two parts row-wise; ignore_index=True rebuilds a fresh 0..n-1 index.
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)
# Remove the part frames so later cells cannot reference them by accident.
del df1, df2

In [8]:
# Sanity check: the row count should equal the sum of both parts (734614 + 613488).
df.shape

(1348102, 4)

##### Remove duplicate rows

In [9]:
# Preview the first rows; rows 0 and 1 in the output are identical, which is why duplicates are dropped next.
df.head(10)

Unnamed: 0,word,lemma,form,pos
0,–∞,–∞,–æ—Å–Ω–æ–≤–Ω–∞ —Ñ–æ—Ä–º–∞,—Å—ä—é–∑
1,–∞,–∞,–æ—Å–Ω–æ–≤–Ω–∞ —Ñ–æ—Ä–º–∞,—Å—ä—é–∑
2,–∞–±–∞,–∞–±–∞,–æ—Å–Ω–æ–≤–Ω–∞ —Ñ–æ—Ä–º–∞,—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
3,–∞–±–∞,–∞–±–∞,–µ–¥.—á.,—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
4,–∞–±–∞—Ç–∞,–∞–±–∞,"–µ–¥.—á., —á–ª–µ–Ω—É–≤–∞–Ω–æ",—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
5,–∞–±–∏,–∞–±–∞,–º–Ω.—á.,—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
6,–∞–±–∏—Ç–µ,–∞–±–∞,"–º–Ω.—á., —á–ª–µ–Ω—É–≤–∞–Ω–æ",—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
7,–ê–±–∞–¥–∂–∏–µ–≤,–ê–±–∞–¥–∂–∏–µ–≤,–æ—Å–Ω–æ–≤–Ω–∞ —Ñ–æ—Ä–º–∞,—Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ
8,–ê–±–∞–¥–∂–∏–µ–≤,–ê–±–∞–¥–∂–∏–µ–≤,–º—ä–∂–∫–æ,—Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ
9,–ê–±–∞–¥–∂–∏–µ–≤–∞,–ê–±–∞–¥–∂–∏–µ–≤,–∂–µ–Ω—Å–∫–æ,—Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ


In [10]:
# Remember the size before de-duplication so the number of removed rows can be reported.
rows_before = len(df)
# Keep the first occurrence of every fully identical row (original index labels are retained).
df = df.drop_duplicates(keep='first')
print(f"Removed {rows_before - df.shape[0]} rows from dataframe")

Removed 123607 rows from dataframe


> ##### Split column "form" into multiple columns

In [11]:
# Preview the data to inspect the comma-separated values stored in the 'form' column.
df.head()

Unnamed: 0,word,lemma,form,pos
0,–∞,–∞,–æ—Å–Ω–æ–≤–Ω–∞ —Ñ–æ—Ä–º–∞,—Å—ä—é–∑
2,–∞–±–∞,–∞–±–∞,–æ—Å–Ω–æ–≤–Ω–∞ —Ñ–æ—Ä–º–∞,—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
3,–∞–±–∞,–∞–±–∞,–µ–¥.—á.,—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
4,–∞–±–∞—Ç–∞,–∞–±–∞,"–µ–¥.—á., —á–ª–µ–Ω—É–≤–∞–Ω–æ",—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ
5,–∞–±–∏,–∞–±–∞,–º–Ω.—á.,—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ


In [12]:
# List every distinct 'form' value in sorted order.
sorted(df['form'].unique())

['–±—Ä–æ–π–Ω–∞ —Ñ–æ—Ä–º–∞',
 '–≤–∏–Ω–∏—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂',
 '–≤–∏–Ω–∏—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂, –∫—Ä–∞—Ç–∫–∞ —Ñ–æ—Ä–º–∞',
 '–¥–∞—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂',
 '–¥–∞—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂, –∫—Ä–∞—Ç–∫–∞ —Ñ–æ—Ä–º–∞',
 '–¥–µ–µ–ø—Ä–∏—á–∞—Å—Ç–∏–µ',
 '–µ–¥.—á.',
 '–µ–¥.—á., –Ω–µ–ø—ä–ª–µ–Ω —á–ª–µ–Ω',
 '–µ–¥.—á., –ø—ä–ª–µ–Ω —á–ª–µ–Ω',
 '–µ–¥.—á., —á–ª–µ–Ω—É–≤–∞–Ω–æ',
 '–∂.—Ä., –µ–¥.—á.',
 '–∂.—Ä., –µ–¥.—á., —á–ª–µ–Ω—É–≤–∞–Ω–æ',
 '–∂.—Ä., –º–Ω.—á.',
 '–∂.—Ä., –º–Ω.—á., —á–ª–µ–Ω—É–≤–∞–Ω–æ',
 '–∂–µ–Ω—Å–∫–æ',
 '–∑–≤–∞—Ç–µ–ª–Ω–∞ —Ñ–æ—Ä–º–∞',
 '–∏–º–µ–Ω–∏—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂',
 '–∫—Ä–∞—Ç–∫–∞ —Ñ–æ—Ä–º–∞',
 '–º.—Ä., –µ–¥.—á.',
 '–º.—Ä., –µ–¥.—á., –Ω–µ–ø—ä–ª–µ–Ω —á–ª–µ–Ω',
 '–º.—Ä., –µ–¥.—á., –ø—ä–ª–µ–Ω —á–ª–µ–Ω',
 '–º.—Ä., –º–Ω.—á.',
 '–º.—Ä., –º–Ω.—á., —á–ª–µ–Ω—É–≤–∞–Ω–æ',
 '–º–∏–Ω.–¥–µ—è—Ç.–Ω–µ—Å–≤.–ø—Ä–∏—á., –µ–¥.—á., –∂.—Ä.',
 '–º–∏–Ω.–¥–µ—è—Ç.–Ω–µ—Å–≤.–ø—Ä–∏—á., –µ–¥.—á., –∂.—Ä., —á–ª–µ–Ω—É–≤–∞–Ω–æ',
 '–º–∏–Ω.–¥–µ—è—Ç.–Ω–µ—Å–≤.–ø—Ä–∏—á., –µ–¥.—á., –º.—Ä.',
 '–º–∏–Ω.–¥–µ—è—Ç.–Ω–µ—Å–≤.–ø—Ä–∏—á., –µ–¥.—á., 

In [13]:
# Keep an independent deep copy of the de-duplicated frame, because the
# expansion step below modifies `df` in place.
df_original = df.copy(deep=True)

In [14]:
# 🩷 TODO: move to .py file
# Maps each token that can appear in the comma-separated 'form' column to a
# (target column, integer code) pair, e.g. a value like
# '–º–∏–Ω.–¥–µ—è—Ç.–Ω–µ—Å–≤.–ø—Ä–∏—á., –µ–¥.—á., –º.—Ä., –Ω–µ–ø—ä–ª–µ–Ω —á–ª–µ–Ω'
# is spread over the new columns 'participle', 'number', 'gender' and 'definite_article'.
# Code 0 is never used here: expand_column_to_columns initialises every target
# column with 0, so 0 means "token not present".
# Entries whose token contains another entry's token as a substring are listed
# before the shorter entry.
form_mapping = {
    # Flag-like columns: a single code (1) marks that the form is present.
    "–æ—Å–Ω–æ–≤–Ω–∞ —Ñ–æ—Ä–º–∞":          ("base", 1),
    "–±—Ä–æ–π–Ω–∞ —Ñ–æ—Ä–º–∞":           ("count form", 1),
    "–∫—Ä–∞—Ç–∫–∞ —Ñ–æ—Ä–º–∞":           ("short form", 1),
    "–º—ä–∂–∫–æ–ª–∏—á–Ω–∞ —Ñ–æ—Ä–º–∞":       ("masculine_personal_form", 1), # masculine personal form
    "–ø—Ä–∏–±–ª–∏–∑–∏—Ç–µ–ª–µ–Ω –±—Ä–æ–π":     ("approximate_number", 1), # approximate number

    # Grammatical number — presumably 1 = singular, 2 = plural (TODO confirm).
    "–µ–¥.—á.":         ("number", 1),
    "–º–Ω.—á.":         ("number", 2),

    # Gender: two different spellings share each of codes 1 and 2.
    # Presumably 1 = masculine, 2 = feminine, 3 = neuter (TODO confirm).
    "–º—ä–∂–∫–æ":         ("gender", 1),
    "–º.—Ä.":          ("gender", 1),
    "–∂–µ–Ω—Å–∫–æ":        ("gender", 2),
    "–∂.—Ä.":          ("gender", 2),
    "—Å—Ä.—Ä.":         ("gender", 3),

    "—Å–µ–≥.–≤—Ä.":       ("tense", 1), # present tense
    "–º–∏–Ω.–Ω–µ—Å–≤.–≤—Ä.":  ("tense", 2), # past imperfective tense
    "–º–∏–Ω.—Å–≤.–≤—Ä.":    ("tense", 3), # past perfective tense

    # Grammatical person (1st, 2nd, 3rd).
    "1–ª.":           ("person", 1),
    "2–ª.":           ("person", 2),
    "3–ª.":           ("person", 3),

    # Definite article variants — presumably 1 = short article, 2 = full article,
    # 3 = generic "definite" marker (TODO confirm).
    "–Ω–µ–ø—ä–ª–µ–Ω —á–ª–µ–Ω":  ("definite_article", 1),
    "–ø—ä–ª–µ–Ω —á–ª–µ–Ω":    ("definite_article", 2),
    "—á–ª–µ–Ω—É–≤–∞–Ω–æ":     ("definite_article", 3),

    "—Å–µ–≥.–¥–µ—è—Ç.–ø—Ä–∏—á.":      ("participle", 1), # present active participle
    "–º–∏–Ω.–¥–µ—è—Ç.—Å–≤.–ø—Ä–∏—á.":   ("participle", 2), # past active perfective participle
    "–º–∏–Ω.–¥–µ—è—Ç.–Ω–µ—Å–≤.–ø—Ä–∏—á.": ("participle", 3), # past active imperfective participle
    "–º–∏–Ω.—Å—Ç—Ä–∞–¥.–ø—Ä–∏—á.":     ("participle", 4), # past passive participle
    "–¥–µ–µ–ø—Ä–∏—á–∞—Å—Ç–∏–µ":        ("participle", 5), # gerund/adverbial participle

    "–∏–º–µ–Ω–∏—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂":       ("case", 1), # nominative case
    "–≤–∏–Ω–∏—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂":        ("case", 2), # accusative case
    "–¥–∞—Ç–µ–ª–µ–Ω –ø–∞–¥–µ–∂":          ("case", 3), # dative case
    "–∑–≤–∞—Ç–µ–ª–Ω–∞ —Ñ–æ—Ä–º–∞":         ("case", 4), # vocative form
    "–ø–æ–≤–µ–ª–∏—Ç–µ–ª–Ω–æ –Ω–∞–∫–ª–æ–Ω–µ–Ω–∏–µ": ("case", 5), # imperative (presumably a mood rather than a case; stored in the 'case' column)
}

def _parse_tags(text, rules):
    """Strip every known token from *text*; return ``(leftover, {column: code})``."""
    codes = {}
    for pattern, column, code in rules:
        text, hits = pattern.subn('', text)
        if hits:
            # A later rule for the same column overrides an earlier one.
            codes[column] = code
    # Collapse runs of commas left behind by the removals, then trim the ends.
    return re.sub(r',+', ',', text).strip(' ,'), codes


def expand_column_to_columns(df, original_column, value_to_column_mapping):
    """Split a delimited tag column into integer-coded feature columns.

    Each key of ``value_to_column_mapping`` is a token that may occur in
    ``df[original_column]``; its value is a ``(column, code)`` pair. Wherever
    the token occurs as a whole token (delimited by commas, whitespace or the
    string boundaries), ``df[column]`` is set to ``code`` and the token is
    removed from the original column. Rows without the token keep 0.

    Args:
        df: DataFrame to extend. It is modified in place and also returned.
        original_column: Name of the column holding the delimited tokens.
        value_to_column_mapping: Dict ``{token: (column, code)}``. Rules are
            applied in dict order; if several tokens of one row target the
            same column, the last matching rule wins.

    Returns:
        The same ``df`` with one integer column per distinct target column
        (added in mapping order) and ``original_column`` reduced to whatever
        text no rule consumed (missing / non-string values become NaN).

    Raises:
        KeyError: If ``original_column`` is not a column of ``df``.
    """
    # Whole-token match: the token must not be glued to other non-delimiter
    # characters, so a short token cannot fire inside a longer word and the
    # result no longer depends on the order of the mapping.
    rules = [
        (re.compile(rf'\s*(?<![^\s,]){re.escape(token)}(?![^\s,])\s*'), column, code)
        for token, (column, code) in value_to_column_mapping.items()
    ]
    # dict.fromkeys de-duplicates while preserving order, which makes the
    # column order reproducible (iterating a set of strings is not).
    new_columns = list(dict.fromkeys(column for _, column, _ in rules))

    source = df[original_column]

    # Parse every distinct string once instead of running each rule over
    # every row; the column typically holds few distinct combinations.
    parsed = {
        value: _parse_tags(value, rules)
        for value in source.unique()
        if isinstance(value, str)
    }

    for new_column in new_columns:
        codes_by_value = {
            value: found.get(new_column, 0) for value, (_, found) in parsed.items()
        }
        # Values absent from the lookup (missing / non-string) map to NaN -> 0.
        # to_numpy() assigns positionally, independent of the index labels.
        df[new_column] = source.map(codes_by_value).fillna(0).astype(int).to_numpy()

    leftover_by_value = {value: rest for value, (rest, _) in parsed.items()}
    df[original_column] = source.map(leftover_by_value).to_numpy()

    return df

In [15]:
# Intermediate result: the vocabulary with 'form' expanded into feature columns.
output_file = f'{data_processed_dir}/bg_vocabulary_v1.csv'
if LOAD_SAVED_DATA:
    # Reuse the previously saved result instead of recomputing it.
    df = pd.read_csv(output_file)
else:
    # Slow path (about 100 seconds): expand 'form', drop it, and cache the result.
    df = expand_column_to_columns(df, 'form', form_mapping).drop(columns=['form'])
    df.to_csv(output_file, index=False)

In [16]:
# Guard against loading or producing an unexpected dataset: expect 1224495 rows and 15 columns.
assert df.shape == (1224495, 15) # 💙 TODO: print with color on success

In [17]:
# IPython shell escape: list the processed-data folder with human-readable file sizes.
!ls -lh {data_processed_dir}

total 9.8G
-rw------- 1 root root  76M Jan 21 19:07 bg_vocabulary_final.csv
-rw------- 1 root root  96M Jan 21 14:05 bg_vocabulary_v1.csv
-rw------- 1 root root 663M Jan 21 15:54 sent_wikipedia_nlp_features_final.csv
-rw------- 1 root root 100M Jan 20 09:50 sent_wikipedia_nlp_features.part01.rar
-rw------- 1 root root  56M Jan 20 09:50 sent_wikipedia_nlp_features.part02.rar
-rw------- 1 root root 8.4M Jan 23 19:23 sent_wikipedia_nlp_features_stanza_final_10000_tmp.csv
-rw------- 1 root root 649M Jan 23 19:22 sent_wikipedia_nlp_features_stanza_final.csv
-rw------- 1 root root 1.1G Jan 23 08:34 sent_wikipedia_nlp_features_stanza_v1_checkpoint80.csv
-rw------- 1 root root 1.1G Jan 23 08:35 sent_wikipedia_nlp_features_stanza_v1.csv
-rw------- 1 root root 1.2G Jan 23 15:50 sent_wikipedia_nlp_features_stanza_v2.csv
-rw------- 1 root root 1.3G Jan 21 11:11 sent_wikipedia_nlp_features_v1.csv
-rw------- 1 root root 1.4G Jan 21 15:48 sent_wikipedia_nlp_features_v2.csv
-rw------- 1 root root 1.5G

###### Encode column 'pos'

In [18]:
# List every distinct part-of-speech label in sorted order.
sorted(df['pos'].unique())

['–±—ä–ª–≥–∞—Ä—Å–∫–æ –≥–µ–æ–≥—Ä–∞—Ñ—Å–∫–æ –ø–æ–Ω—è—Ç–∏–µ',
 '–±—ä–ª–≥–∞—Ä—Å–∫–æ –Ω–∞—Å–µ–ª–µ–Ω–æ –º—è—Å—Ç–æ',
 '–≤—ä–ø—Ä–æ—Å–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '–≥–ª–∞–≥–æ–ª',
 '–¥—ä—Ä–∂–∞–≤–∞',
 '–ª–∏—á–Ω–æ –∏–º–µ',
 '–ª–∏—á–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '–º–µ–∂–¥—É–º–µ—Ç–∏–µ',
 '–º–µ—Å–µ—Ü',
 '–Ω–∞—Ä–µ—á–∏–µ',
 '–Ω–µ–æ–ø—Ä–µ–¥–µ–ª–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '–æ–±–æ–±—â–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '–æ—Ç–Ω–æ—Å–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '–æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '–ø–æ–∫–∞–∑–∞—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '–ø—Ä–µ–¥–ª–æ–≥',
 '–ø—Ä–∏–ª–∞–≥–∞—Ç–µ–ª–Ω–æ –∏–º–µ',
 '–ø—Ä–∏—Ç–µ–∂–∞—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ',
 '—Å–≤–µ—Ç–æ–≤–Ω–æ–∏–∑–≤–µ—Å—Ç–µ–Ω –≥—Ä–∞–¥',
 '—Å–≤–µ—Ç–æ–≤–Ω–æ–∏–∑–≤–µ—Å—Ç–Ω–æ –≥–µ–æ–≥—Ä–∞—Ñ—Å–∫–æ –ø–æ–Ω—è—Ç–∏–µ',
 '—Å—Ç–æ–ª–∏—Ü–∞',
 '—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ',
 '—Å—ä—é–∑',
 '—Ç—ä—Ä–≥–æ–≤—Å–∫–∞ –º–∞—Ä–∫–∞',
 '—Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ',
 '—á–∞—Å—Ç–∏—Ü–∞',
 '—á–∏—Å–ª–∏—Ç–µ–ª–Ω–æ –±—Ä–

In [19]:
# Hand-written integer code for every part-of-speech label found in the 'pos' column.
# NOTE(review): Series.map yields NaN for any label missing from this dict, which
# would also make 'pos_encoded' a float column — verify that all labels are covered.
pos_mapping = {
    '–≥–ª–∞–≥–æ–ª':1,
    '—Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ':2,
    '–ø—Ä–∏–ª–∞–≥–∞—Ç–µ–ª–Ω–æ –∏–º–µ':3,
    '–Ω–∞—Ä–µ—á–∏–µ':4,
    '–±—ä–ª–≥–∞—Ä—Å–∫–æ –≥–µ–æ–≥—Ä–∞—Ñ—Å–∫–æ –ø–æ–Ω—è—Ç–∏–µ':5,
    '–±—ä–ª–≥–∞—Ä—Å–∫–æ –Ω–∞—Å–µ–ª–µ–Ω–æ –º—è—Å—Ç–æ':6,
    '–≤—ä–ø—Ä–æ—Å–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':7,
    '–¥—ä—Ä–∂–∞–≤–∞':8,
    '–ª–∏—á–Ω–æ –∏–º–µ':9,
    '–ª–∏—á–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':10,
    '–º–µ–∂–¥—É–º–µ—Ç–∏–µ':11,
    '–º–µ—Å–µ—Ü':12,
    '–Ω–µ–æ–ø—Ä–µ–¥–µ–ª–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':13,
    '–æ–±–æ–±—â–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':14,
    '–æ—Ç–Ω–æ—Å–∏—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':15,
    '–æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':16,
    '–ø–æ–∫–∞–∑–∞—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':17,
    '–ø—Ä–µ–¥–ª–æ–≥':18,
    '–ø—Ä–∏—Ç–µ–∂–∞—Ç–µ–ª–Ω–æ –º–µ—Å—Ç–æ–∏–º–µ–Ω–∏–µ':19,
    '—Å–≤–µ—Ç–æ–≤–Ω–æ–∏–∑–≤–µ—Å—Ç–µ–Ω –≥—Ä–∞–¥':20,
    '—Å–≤–µ—Ç–æ–≤–Ω–æ–∏–∑–≤–µ—Å—Ç–Ω–æ –≥–µ–æ–≥—Ä–∞—Ñ—Å–∫–æ –ø–æ–Ω—è—Ç–∏–µ':21,
    '—Å—Ç–æ–ª–∏—Ü–∞':22,
    '—Å—ä—é–∑':23,
    '—Ç—ä—Ä–≥–æ–≤—Å–∫–∞ –º–∞—Ä–∫–∞':24,
    '—Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ':25,
    '—á–∞—Å—Ç–∏—Ü–∞':26,
    '—á–∏—Å–ª–∏—Ç–µ–ª–Ω–æ –±—Ä–æ–π–Ω–æ –∏–º–µ':27,
    '—á–∏—Å–ª–∏—Ç–µ–ª–Ω–æ —Ä–µ–¥–Ω–æ –∏–º–µ':28
 }

# Add the numeric code next to the textual label and print both columns for a visual check.
df['pos_encoded'] = df['pos'].map(pos_mapping)
print(df[['pos', 'pos_encoded']])

                             pos  pos_encoded
0                           —Å—ä—é–∑           23
1              —Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ            2
2              —Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ            2
3              —Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ            2
4              —Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ            2
...                          ...          ...
1224490        —Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ –∏–º–µ            2
1224491  —Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ           25
1224492  —Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ           25
1224493  —Ñ–∞–º–∏–ª–Ω–æ –∏–º–µ –∏–ª–∏ –ø—Ä–µ–∑–∏–º–µ           25
1224494          —Ç—ä—Ä–≥–æ–≤—Å–∫–∞ –º–∞—Ä–∫–∞           24

[1224495 rows x 2 columns]


###### Convert to lowercase

In [33]:
# Normalise the text columns to lowercase, one column at a time.
columns_to_lower = ['word', 'lemma']
for _column in columns_to_lower:
    df[_column] = df[_column].str.lower()

###### Drop columns with repeated data

In [28]:
# The textual label is redundant once 'pos_encoded' exists, so remove it.
df = df.drop(columns='pos')

In [21]:
# Final vocabulary file. The path is set before branching so that BOTH branches
# use it: previously it was only assigned in the save branch, so with
# LOAD_SAVED_DATA=True this cell re-read the earlier bg_vocabulary_v1.csv path
# and silently discarded the encoding/lowercasing/column-drop steps above.
output_file = f'{data_processed_dir}/bg_vocabulary_final.csv'
if not LOAD_SAVED_DATA:
    # Persist the fully processed vocabulary.
    df.to_csv(output_file, index=False)
else:
    # Reuse the previously saved final vocabulary.
    df = pd.read_csv(output_file)

###### Experiments...

In [40]:
# Experiment: select the rows of one specific lemma whose definite_article code is 1
# (the captured output below shows only the header row, i.e. no matching rows).
df[(df.lemma=='—Å–æ—Ü–∏–∞–ª–¥–µ–º–æ–∫—Ä–∞—Ç–∏—á–µ—Å–∫–∏') & (df.definite_article==1)]

Unnamed: 0,word,lemma,definite_article,number,gender,count form,approximate_number,base,short form,participle,masculine_personal_form,tense,case,person,pos_encoded


In [38]:
# Experiment: list the distinct codes present in the 'case' column (0 = no case token found).
df.case.unique()

array([0, 4, 5, 1, 2, 3])