In [1]:
!pip install ufal.udpipe

Collecting ufal.udpipe
  Downloading ufal.udpipe-1.3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading ufal.udpipe-1.3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (936 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/936.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/936.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m931.8/936.8 kB[0m [31m16.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m936.8/936.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ufal.udpipe
Successfully installed ufal.udpipe-1.3.1.1


In [2]:
import numpy as np
import os
import pandas as pd
import urllib.request
import warnings

import re
import time

from copy import copy, deepcopy
from ufal.udpipe import Model, Pipeline, ProcessingError

In [3]:
# Notebook-level switches.
# IS_GUEST: when True the project root is the current directory; otherwise Google Drive is mounted.
IS_GUEST = False
# LOAD_SAVED_DATA: when True the processed CSV is read from disk instead of being recomputed.
LOAD_SAVED_DATA = True # If False, running this notebook takes ~ 3 hours (Google Colab/CPU).

In [4]:
# Choose the project root: guests work relative to the current directory,
# everyone else mounts Google Drive and uses the project folder stored there.
if IS_GUEST:
    root_dir = '.'
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    root_dir = "/content/drive/MyDrive/Colab Notebooks/bg"

# Data directories derived from the project root.
data_raw_dir = f"{root_dir}/data/raw"
data_clean_dir = f"{root_dir}/data/clean"
data_processed_dir = f"{root_dir}/data/processed"
# data_processed_dir = f"./data/processed"

# The two CSV parts with sentences that are read and concatenated further down.
wl_wiki_2021_1M_part1 = f'{data_clean_dir}/wl_bul_wikipedia_2021_1M-sentences_part1.csv'
wl_wiki_2021_1M_part2 = f'{data_clean_dir}/wl_bul_wikipedia_2021_1M-sentences_part2.csv'

Mounted at /content/drive


In [5]:
# Ignore pandas' SettingWithCopyWarning from here on.
# NOTE(review): this also hides genuine chained-assignment problems — confirm it is still needed.
warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning) # 🧡

###### Functions for saving and loading a checkpoint (TODO: move to a .py module)


In [6]:
import os
import glob
import re

def save_checkpoint(base_filename, data, N=5):
    """
    Save a checkpoint file while maintaining only the last N versions.

    Checkpoints are written next to ``base_filename`` as
    ``<name>_checkpoint<k><ext>`` where ``k`` is one more than the highest
    existing checkpoint number (or 1 when there is none).

    Args:
        base_filename (str): Base name of the file (e.g., 'model.pt')
        data: The data to save; must provide ``to_csv(path, index=False)``
            (e.g. a pandas DataFrame).
        N (int): Maximum number of checkpoint files to keep, including the
            one written by this call. Must be at least 1.

    Returns:
        str: Path of the checkpoint file that was written.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")

    name, ext = os.path.splitext(base_filename)

    # Escape name/ext: they are literal text, not glob or regex syntax
    # (e.g. the "." of the extension must not match an arbitrary character).
    pattern = f"{glob.escape(name)}_checkpoint[0-9]*{glob.escape(ext)}"
    checkpoint_re = re.compile(
        rf"{re.escape(os.path.basename(name))}_checkpoint(\d+){re.escape(ext)}"
    )

    # The glob "[0-9]*" means "one digit, then anything", so it can also match
    # unrelated files such as "<name>_checkpoint1_backup<ext>". Keep only
    # files whose basename is exactly a numbered checkpoint.
    numbered = []
    for path in glob.glob(pattern):
        match = checkpoint_re.fullmatch(os.path.basename(path))
        if match:
            numbered.append((int(match.group(1)), path))

    # Determine next checkpoint number
    next_num = max(number for number, _ in numbered) + 1 if numbered else 1

    # Save new checkpoint
    checkpoint_name = f"{name}_checkpoint{next_num}{ext}"
    data.to_csv(checkpoint_name, index=False)

    # Remove the oldest checkpoints so that at most N - 1 old ones remain
    # (plus the new one we just added).
    numbered.sort()
    excess = len(numbered) - (N - 1)
    for _, stale_path in numbered[:max(excess, 0)]:
        os.remove(stale_path)

    return checkpoint_name

def load_checkpoint(base_filename, sep=','):
    """
    Load the latest checkpoint file.

    The latest checkpoint is the ``<name>_checkpoint<k><ext>`` file with the
    highest number ``k``. When no numbered checkpoint exists, the base file
    itself is loaded if it exists.

    Args:
        base_filename (str): Base name of the file (e.g., 'model.pt')
        sep (str): Field separator passed to ``pd.read_csv``.

    Returns:
        tuple: ``(data, filename)`` with the loaded DataFrame and the path it
        was read from, or ``(None, None)`` when neither a checkpoint nor the
        base file exists.
    """
    name, ext = os.path.splitext(base_filename)

    # Escape name/ext: they are literal text, not glob or regex syntax.
    pattern = f"{glob.escape(name)}_checkpoint[0-9]*{glob.escape(ext)}"
    checkpoint_re = re.compile(
        rf"{re.escape(os.path.basename(name))}_checkpoint(\d+){re.escape(ext)}"
    )

    # The glob can also match files such as "<name>_checkpoint1_backup<ext>";
    # keep only files whose basename is exactly a numbered checkpoint.
    checkpoint_files = []
    for path in glob.glob(pattern):
        match = checkpoint_re.fullmatch(os.path.basename(path))
        if match:
            checkpoint_files.append((path, int(match.group(1))))

    if not checkpoint_files:
        if os.path.exists(base_filename):
            print(f"~~~ loading base file: {base_filename}")
            return pd.read_csv(base_filename, sep=sep), base_filename
        return None, None

    print(f"~~~ existing_files: {[path for path, _ in checkpoint_files]}")

    # Pick the highest checkpoint number (numeric comparison, so 10 > 9).
    latest_file = max(checkpoint_files, key=lambda item: item[1])[0]

    print(f"~~~ loading latest checkpoint: {latest_file}")
    return pd.read_csv(latest_file, sep=sep), latest_file

## Add lemmas and pos tagging to sentences

In [7]:
# Timing pattern reused by later cells: record a start time, then print the elapsed seconds.
start_time = time.time()
print(f"Execution time: {time.time() - start_time} seconds")

Execution time: 8.940696716308594e-05 seconds


###### Load clean csv with sentences

In [8]:
# Read both parts of the sentence CSV; the files use '|' as the field separator.
df1, df2 = pd.read_csv(wl_wiki_2021_1M_part1, sep='|'), pd.read_csv(wl_wiki_2021_1M_part2, sep='|')

In [9]:
# Stack the two parts into one frame with a fresh 0..n-1 index.
df_clean = pd.concat([df1, df2], ignore_index=True)  # Combine and regenerate a new index
del df1 # delete so we don't accidentally refer to it in subsequent code
del df2

In [10]:
# Print the first ten sentences in full as a quick look at the loaded text.
for x in df_clean.iloc[0:10]['sentence']:
    print(x)

Става световен шампион в тежка категория през 2007 г.
Лечението е трудно и продължително.
Въпросните тестови случаи се сформират на база спецификациите и изискванията, т.е. въз основа на това за какво е предназначена дадено приложение.
През този сезон 2009 клубът носи името "Гаосюн Яоти".
От всички дъгови бои при колумбийските географското разпространение е най-северно.
Официално днес носи името "Свети Модест" .
Достъпът до двигателя се осъществява през люк в десния борд.
Между българите има повече от 1 млн. мохамедани.
Млекопроизводството и отглеждането на едър рогат добитък и свине също става важно да селскостопанската икономика на района.
Възможни са проблеми, когато изследването на психичния статус се прилага, когато клиницистът и пациентът са от различен културен произход.


In [11]:
df_clean.shape

(787244, 1)

##### ➡️Split to words and lemmas using UDPipe

In [12]:
# Download the UDPipe Bulgarian model (skipped when the file is already present locally)
model_url_udpipe = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/bulgarian-btb-ud-2.5-191206.udpipe?sequence=6&isAllowed=y"
model_path = "bulgarian-btb-ud-2.5-191206.udpipe"
if not os.path.exists(model_path):
    urllib.request.urlretrieve(model_url_udpipe, model_path)

# Model.load returns None instead of raising when the file cannot be loaded,
# so fail here with a clear message rather than later inside Pipeline.
model = Model.load(model_path)
if model is None:
    raise RuntimeError(
        f"Cannot load UDPipe model from '{model_path}'. "
        "Delete the file and re-run this cell to download it again."
    )

# Tokenize raw text, use the default tagger and parser, and emit CoNLL-U output.
pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

In [13]:
def extract_gender_from_tag_unused(tag):
    """Map a morphological tag to a gender label.

    Only tags starting with "Nc" (common noun), "Np" (proper noun) or "A"
    (adjective) are inspected. The letters "m", "f" and "n" are looked for
    anywhere in the tag, in that order, and the first one found decides the
    label. A single space is returned when no gender applies.
    """
    if not tag.startswith(("Nc", "Np", "A")):
        return " "

    # Order matters: "m" wins over "f", which wins over "n".
    for marker, label in (("m", "Masculine"), ("f", "Feminine"), ("n", "Neutral")):
        if marker in tag:
            return label

    return " "

# TODO: need to exclude triple character patterns like "p1s": "Past tense, 1st person sng",
def extract_number_from_tag_unused(tag):
    """Map a morphological tag to a grammatical-number label.

    Returns "Singular" when the tag contains "s" anywhere, otherwise "Plural"
    when it contains "p", otherwise None. "s" takes precedence over "p".
    """
    if "s" in tag:
        return "Singular"
    return "Plural" if "p" in tag else None


###### Function to process one sentence into words, lemmas, POS tags, morphological tags, features and dependency relations

In [14]:
def udpipe_to_spacy(text):
    """Run the UDPipe pipeline on ``text`` and collect per-word annotations.

    Despite its name, this function does not build a spaCy ``Doc``: it sends
    ``text`` through the module-level ``pipeline`` and parses the CoNLL-U
    string it returns into plain Python lists.

    Args:
        text (str): Raw text to analyse.

    Returns:
        tuple: ``(words, lemmas, pos_tags, morph_tags, features, dep_rels,
        n_words, n_lemmas)``. The first six items are parallel lists with one
        entry per word, taken from the CoNLL-U columns FORM, LEMMA, UPOS,
        XPOS, FEATS and DEPREL. ``n_words`` and ``n_lemmas`` are the lengths
        of ``words`` and ``lemmas``.
    """
    doc_str = pipeline.process(text)

    words = []
    lemmas = []
    pos_tags = []
    morph_tags = []
    features = []
    dep_rels = []

    for line in doc_str.strip().splitlines():
        # Skip sentence-level comments and the blank lines between sentences.
        if line.startswith("#") or not line.strip():
            continue

        # CoNLL-U columns: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
        token_id, word, lemma, pos, tag, feats, _head, dep_rel, _deps, _misc = line.split("\t")

        # Multiword-token ranges ("1-2") and empty nodes ("1.1") carry no
        # annotation of their own; only plain integer IDs are real words.
        if not token_id.isdigit():
            continue

        words.append(word)
        lemmas.append(lemma)
        pos_tags.append(pos)  # POS tag (simpler POS category)
        morph_tags.append(tag)  # Detailed morphological tag
        features.append(feats)
        dep_rels.append(dep_rel)

    # TODO: decide whether n_words and n_lemmas need to be returned here.
    return words, lemmas, pos_tags, morph_tags, features, dep_rels, len(words), len(lemmas)

###### Process 1 chunk of the clean dataframe

In [15]:
start_time = time.time()
# Take an explicit copy of the first 1000 rows so the new columns are added to an
# independent frame rather than to a slice of df_clean (avoids chained assignment).
df10_u = df_clean.iloc[0:1000].copy()
# Each sentence yields an 8-item tuple; .apply(pd.Series) spreads it over eight columns.
df10_u[["words", "lemmas", "pos", "morph", "features", "dep", "n_words", "n_lemmas"]] = df10_u['sentence'].apply(udpipe_to_spacy).apply(pd.Series)
print(f"Execution time: {time.time() - start_time} seconds")

Execution time: 17.806679010391235 seconds


In [16]:
# Show the annotations stored for the first sentence of the sample frame.
print(df10_u.iloc[0]['words'])
print(df10_u.iloc[0]['lemmas'])
print(df10_u.iloc[0]['morph'])
print(df10_u.iloc[0]['features'])

['Става', 'световен', 'шампион', 'в', 'тежка', 'категория', 'през', '2007', 'г.']
['ставам', 'световен', 'шампион', 'в', 'тежък', 'категория', 'през', '2007', 'година']
['Vniif-r3s', 'Amsi', 'Ncmsi', 'R', 'Afsi', 'Ncfsi', 'R', 'Mofsi', 'Ncfsi']
['Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act', 'Definite=Ind|Degree=Pos|Gender=Masc|Number=Sing', 'Definite=Ind|Gender=Masc|Number=Sing', '_', 'Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing', 'Definite=Ind|Gender=Fem|Number=Sing', '_', 'Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing|NumType=Ord', 'Definite=Ind|Gender=Fem|Number=Sing']


###### Process the entire clean dataframe in chunks using checkpoints

In [17]:
def find_unprocessed_indices(df, col):
    """Return the index labels of rows that still need processing.

    A row counts as unprocessed when its value in ``col`` is NaN. If ``col``
    is not a column of ``df`` at all, every row is unprocessed.

    Args:
        df: DataFrame to inspect.
        col: Name of the column whose missing values mark unprocessed rows.

    Returns:
        list: Index labels of the unprocessed rows, in frame order.
    """
    if col in df.columns:
        # Keep only the rows whose marker column is still empty.
        pending_rows = df[df[col].isna()]
        return pending_rows.index.tolist()

    # Column missing entirely: nothing has been processed yet.
    return df.index.tolist()

def process_dataframe_with_checkpoints(_df, process_func, chunk_size=1000, output_path='final_output.csv'):
    """
    Process a DataFrame in chunks, saving a checkpoint after every chunk.

    Work is resumed from whatever ``load_checkpoint(output_path)`` returns
    (the latest checkpoint, or the output file itself); when nothing can be
    loaded, a copy of ``_df`` is processed from scratch. Rows whose 'words'
    value is NaN are treated as not yet processed.

    Args:
        _df: Input DataFrame; must contain a 'sentence' column.
        process_func: Function applied to each sentence. It must return a
            sequence of eight items, stored in the columns "words", "lemmas",
            "pos", "morph", "features", "dep", "n_words", "n_lemmas".
        chunk_size: Number of rows to process in each chunk (positive integer).
        output_path: Path to save final output; checkpoint file names are
            derived from it by ``save_checkpoint``.

    Returns:
        The processed DataFrame. After a KeyboardInterrupt the partially
        processed DataFrame is returned once a checkpoint has been saved.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        Exception: Any error raised while processing a chunk is re-raised
            after a checkpoint has been saved.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    print(f"~~~ {output_path}")

    df, checkpoint_filename = load_checkpoint(output_path)
    if df is None:
        df = _df.copy()
        print("Starting fresh processing...")
    else:
        print(f"Loaded checkpoint {checkpoint_filename} with {len(df)} rows")

    # Find which rows are not processed yet
    # Assuming if 'words' column is NaN, the row hasn't been processed
    unprocessed_indices = find_unprocessed_indices(df, 'words')

    if len(unprocessed_indices) == 0:
        print("All rows already processed")
        df.to_csv(output_path, index=False)
        return df

    result_columns = ["words", "lemmas", "pos", "morph", "features", "dep", "n_words", "n_lemmas"]

    # Number of chunks needed for the remaining rows (ceiling division).
    current_chunks = (len(unprocessed_indices) + chunk_size - 1) // chunk_size

    for current_chunk_num in range(current_chunks):
        try:
            chunk_start = current_chunk_num * chunk_size
            chunk_end = min((current_chunk_num + 1) * chunk_size, len(unprocessed_indices))
            current_indices = unprocessed_indices[chunk_start:chunk_end]
            start_idx = current_indices[0]
            end_idx = current_indices[-1]

            print(f"Processing chunk {current_chunk_num + 1}/{current_chunks} (indices [{start_idx}-{end_idx}]/{df.shape[0]})")

            # Apply process_func per sentence and spread each result over columns
            chunk_sentences = df.loc[current_indices, 'sentence']
            results = chunk_sentences.apply(process_func).apply(pd.Series)

            # Update the dataframe with processed results
            results.columns = result_columns
            df.loc[current_indices, results.columns] = results

            # Save checkpoint after each chunk
            latest_filename = save_checkpoint(output_path, df)
            print(f"Saved checkpoint {latest_filename}")

        except KeyboardInterrupt:
            # Manual stop: keep what has been done so far and hand it back.
            print("\nProcessing interrupted. Progress saved in checkpoint file.")
            save_checkpoint(output_path, df)
            return df
        except Exception as e:
            # Preserve progress, then let the caller see the original error.
            print(f"Error processing chunk: {e}")
            save_checkpoint(output_path, df)
            raise

    # All processing complete, save final output
    unprocessed_indices = find_unprocessed_indices(df, 'words')
    if len(unprocessed_indices) == 0:
        print("Processing complete")
        df.to_csv(output_path, index=False)

    return df

In [18]:
# Rows processed between two checkpoints.
chunk_size = 10000
# Final CSV with the per-sentence NLP columns; checkpoint file names are derived from this path.
output_path = f'{data_processed_dir}/sent_wikipedia_nlp_features.csv'

In [19]:
# Either recompute the NLP columns for every sentence (checkpointed, resumable)
# or load the previously saved result from output_path.
# this runs for ~ 3 hours on Google Colab/CPU
if not LOAD_SAVED_DATA:
    df_processed = process_dataframe_with_checkpoints(
        df_clean,
        udpipe_to_spacy,
        chunk_size=chunk_size,
        output_path=output_path
    )
else:
    # NOTE(review): after a CSV round trip the list-valued columns come back as
    # strings (the text form of the lists) — confirm downstream code parses them.
    df_processed = pd.read_csv(output_path)

In [21]:
df_clean.shape, df_processed.shape, df_processed.columns

((787244, 1),
 (787244, 9),
 Index(['sentence', 'words', 'lemmas', 'pos', 'morph', 'features', 'dep',
        'n_words', 'n_lemmas'],
       dtype='object'))

In [22]:
df_processed.head()

Unnamed: 0,sentence,words,lemmas,pos,morph,features,dep,n_words,n_lemmas
0,Става световен шампион в тежка категория през ...,"['Става', 'световен', 'шампион', 'в', 'тежка',...","['ставам', 'световен', 'шампион', 'в', 'тежък'...","['VERB', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'NOUN', ...","['Vniif-r3s', 'Amsi', 'Ncmsi', 'R', 'Afsi', 'N...",['Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Ten...,"['root', 'amod', 'obj', 'case', 'amod', 'obl',...",9.0,9.0
1,Лечението е трудно и продължително.,"['Лечението', 'е', 'трудно', 'и', 'продължител...","['Лечение', 'съм', 'трудно', 'и', 'продължител...","['NOUN', 'AUX', 'ADJ', 'CCONJ', 'ADJ', 'PUNCT']","['Ncnsd', 'Vxitf-r3s', 'Ansi', 'Cp', 'Ansi', '...","['Definite=Def|Gender=Neut|Number=Sing', 'Aspe...","['nsubj', 'cop', 'root', 'cc', 'conj', 'punct']",6.0,6.0
2,Въпросните тестови случаи се сформират на база...,"['Въпросните', 'тестови', 'случаи', 'се', 'сфо...","['Въпросен', 'тестов', 'случай', 'се', 'сформи...","['ADJ', 'ADJ', 'NOUN', 'PRON', 'VERB', 'ADP', ...","['A-pd', 'A-pi', 'Ncmpi', 'Ppxta', 'Vpitf-r3p'...","['Definite=Def|Degree=Pos|Number=Plur', 'Defin...","['amod', 'amod', 'nsubj', 'expl', 'ccomp', 'ca...",23.0,23.0
3,"През този сезон 2009 клубът носи името ""Гаосюн...","['През', 'този', 'сезон', '2009', 'клубът', 'н...","['през', 'този', 'сезон', '2009', 'клуб', 'нос...","['ADP', 'DET', 'NOUN', 'NUM', 'NOUN', 'VERB', ...","['R', 'Pde-os-m', 'Ncmsi', 'Mc-pi', 'Ncmsf', '...","['_', 'Gender=Masc|Number=Sing|PronType=Dem', ...","['case', 'det', 'obl', 'nummod', 'nsubj', 'roo...",12.0,12.0
4,От всички дъгови бои при колумбийските географ...,"['От', 'всички', 'дъгови', 'бои', 'при', 'колу...","['от', 'всеки', 'дъгов', 'бо', 'при', 'колумби...","['ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'A...","['R', 'Pce-op', 'A-pi', 'Ncmpi', 'R', 'A-pd', ...","['_', 'Case=Nom|Number=Plur|PronType=Tot', 'De...","['case', 'det', 'amod', 'nsubj', 'case', 'amod...",11.0,11.0
