v0.1

> TODO:
- input:
  - f"{data_clean_dir}/bg_fiction_all.tsv"
- output:
  - f'{data_processed_dir}/sent_fiction_nlp_features_part1_v1_checkpoint.tsv'
  - f'{data_processed_dir}/sent_fiction_nlp_features_part1_v1.tsv'
  - f'{data_processed_dir}/sent_fiction_nlp_features_part1_v2.tsv'
  - f'{data_processed_dir}/sent_fiction_nlp_features_part1_v3_final.csv' (tab-separated)
- df.copy(deep=True)
- remove the warning, start_time
- add descriptions + fix headings
- remove yellow
- extract all functions to py files
- run locally
- check typos

In [2]:
!pip install ufal.udpipe
# !pip install stanza
# !pip install spacy-stanza



In [3]:
import ast
import os
import pandas as pd
import polars as pl
import sys
import time
import urllib.request

from ufal.udpipe import Model, Pipeline, ProcessingError

In [4]:
!if [ ! -f "/content/helpers/save_load_checkpoint_files.py" ]; then wget -P helpers/ https://raw.githubusercontent.com/MirkaIvanova/Projects/refs/heads/main/the-grammar-whisperer/helpers/save_load_checkpoint_files.py; fi
!if [ ! -f "/content/helpers/parse_nlp_features.py" ]; then wget -P helpers/ https://raw.githubusercontent.com/MirkaIvanova/Projects/refs/heads/main/the-grammar-whisperer/helpers/parse_nlp_features.py; fi
!if [ ! -f "/content/helpers/parse_nlp_morphtags.py" ]; then wget -P helpers/ https://raw.githubusercontent.com/MirkaIvanova/Projects/refs/heads/main/the-grammar-whisperer/helpers/parse_nlp_morphtags.py; fi


In [5]:
# IS_GUEST: when True the notebook works relative to the current directory;
# when False it mounts Google Drive and uses the project folder stored there.
IS_GUEST = False
# LOAD_SAVED_DATA: when True, every step guarded by `if not LOAD_SAVED_DATA`
# is skipped instead of being recomputed.
LOAD_SAVED_DATA = True

In [6]:
# Pick the project root: the mounted Google Drive folder for the owner,
# or the current working directory for guests.
if not IS_GUEST:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    root_dir = "/content/drive/MyDrive/softuni/the-grammar-whisperer"
else:
    root_dir = '.'

# Directories derived from the project root.
data_clean_dir = f"{root_dir}/data/clean"
data_processed_dir = f"{root_dir}/data/processed"
helpers_dir = f"{root_dir}/helpers"

# Make the project root and its helpers importable; skipped when the root
# is already on sys.path (e.g. when the cell is re-run).
if root_dir not in sys.path:
    sys.path.extend([root_dir, helpers_dir])

bg_fiction_clean_csv =  f"{data_clean_dir}/bg_fiction_all.tsv"

Mounted at /content/drive


In [7]:
from save_load_checkpoint_files import save_checkpoint, load_checkpoint
from parse_nlp_morphtags import parse_nlp_morphtags
from parse_nlp_features import process_chunk

# Add lemmas and POS tags to fiction sentences

#### Load the clean TSV file with sentences

In [6]:
# Read the TSV with polars, convert to pandas for the rest of the notebook,
# and rename the text column to "sentence".
df = (
    pl.read_csv(bg_fiction_clean_csv, separator='\t')
    .to_pandas()
    .rename(columns={"text": "sentence"})
)
df.shape

(6690845, 1)

#### Initialize UDPipe and download Bulgarian model

In [None]:
# Download the UDPipe Bulgarian model (only if it is not present yet) and
# build the tokenize/tag/parse pipeline that emits CoNLL-U text.
model_path = "bulgarian-btb-ud-2.5-191206.udpipe"

if not os.path.exists(model_path):
    model_url_udpipe = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/bulgarian-btb-ud-2.5-191206.udpipe?sequence=6&isAllowed=y"
    # Download under a temporary name and rename on success, so an interrupted
    # download cannot leave a truncated file that passes the exists() check.
    partial_model_path = f"{model_path}.part"
    urllib.request.urlretrieve(model_url_udpipe, partial_model_path)
    os.replace(partial_model_path, model_path)

model = Model.load(model_path)
# Model.load signals failure by returning an empty value instead of raising.
if not model:
    raise RuntimeError(f"Cannot load UDPipe model from file '{model_path}'")
pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

###### Function to process one sentence into words, lemmas, POS tags, features, etc.

In [None]:
# 💙mii move to py
def convert_udpipe_to_spacy(text):
    """Run `text` through the module-level UDPipe `pipeline` and split the result into per-token lists.

    The pipeline output is read line by line. Lines starting with "#" and blank
    lines are skipped; every remaining line must consist of exactly ten
    tab-separated fields (otherwise the unpacking raises ValueError).

    Args:
        text: The text handed to `pipeline.process`.

    Returns:
        A 7-tuple `(words, lemmas, pos_tags, morph_tags, features, dep_rels, n_words)`.
        The six lists hold fields 2, 3, 4, 5, 6 and 8 of each token line, and
        `n_words` is the number of token lines.
    """
    conllu_output = pipeline.process(text)

    token_rows = []
    for raw_line in conllu_output.strip().splitlines():
        is_comment = raw_line.startswith("#")
        if is_comment or not raw_line.strip():
            continue

        # Ten fields per token line; only six of them are kept.
        _, form, lemma, pos, tag, feats, _, dep_rel, _, _ = raw_line.split("\t")
        token_rows.append((form, lemma, pos, tag, feats, dep_rel))

    # Transpose the rows into one list per kept field (works for zero rows too).
    words, lemmas, pos_tags, morph_tags, features, dep_rels = (
        [row[position] for row in token_rows] for position in range(6)
    )

    return words, lemmas, pos_tags, morph_tags, features, dep_rels, len(token_rows)

###### Process the entire clean dataframe in chunks using checkpoints

In [None]:
# 💙mii move to .py file
def find_unprocessed_indices(df, col):
    """Return the index labels of the rows that still need processing.

    A row counts as unprocessed when its value in `col` is NaN. If `col` is
    not a column of `df` at all, every row is unprocessed.

    Args:
        df: DataFrame to inspect.
        col: Name of the column whose NaN values mark unprocessed rows.

    Returns:
        A list of index labels, in the order they appear in `df`.
    """
    if col in df.columns:
        is_missing = df[col].isna().to_numpy()
        return df.index[is_missing].tolist()

    # Column absent: nothing has been processed yet.
    return df.index.tolist()

def process_dataframe_with_checkpoints(_df, process_func, chunk_size=1000, output_path='final_output.csv', sep='.'):
    """
    Process a DataFrame in chunks, saving a checkpoint after every chunk.

    Processing resumes from the frame returned by `load_checkpoint(output_path, sep=sep)`;
    when that returns no frame, a copy of `_df` is processed from scratch. Rows
    whose 'words' value is NaN (or all rows, when the column does not exist yet)
    are treated as unprocessed.

    Args:
        _df: Input DataFrame with a 'sentence' column. It is copied, not modified.
        process_func: Function applied to each sentence. Its result is expanded into
            the columns words, lemmas, pos, morph, features, dep and n_words.
        chunk_size: Number of unprocessed rows handled per chunk.
        output_path: Path of the final output file; also passed to the
            checkpoint helpers.
        sep: Field separator used for checkpoints and for the final output.
            NOTE(review): the default '.' looks unintended; the callers in this
            notebook pass a tab explicitly. Kept unchanged for compatibility.

    Returns:
        The DataFrame with the result columns filled in. After a
        KeyboardInterrupt the partially processed frame is returned.

    Raises:
        Exception: any error raised while processing a chunk is re-raised
            after a checkpoint has been saved.
    """

    df, checkpoint_filename = load_checkpoint(output_path, sep=sep)
    if df is None:
        df = _df.copy()
        print("Starting fresh processing...")
    else:
        print(f"Loaded checkpoint {checkpoint_filename} with {len(df)} rows")

    # Rows whose 'words' value is NaN have not been processed yet.
    unprocessed_indices = find_unprocessed_indices(df, 'words')

    if len(unprocessed_indices) == 0:
        print("All rows already processed")
        df.to_csv(output_path, index=False, sep=sep)
        return df

    # Number of chunks needed for the remaining rows (ceiling division).
    current_chunks = (len(unprocessed_indices) + chunk_size - 1) // chunk_size

    for current_chunk_num in range(current_chunks):
        try:
            chunk_start = current_chunk_num * chunk_size
            chunk_end = min((current_chunk_num + 1) * chunk_size, len(unprocessed_indices))
            current_indices = unprocessed_indices[chunk_start:chunk_end]
            start_idx = current_indices[0]
            end_idx = current_indices[-1]

            print(f"Processing chunk {current_chunk_num + 1}/{current_chunks} (indices [{start_idx}-{end_idx}]/{df.shape[0]})")

            # Apply process_func to every sentence of the chunk and expand
            # each result into one column per returned value.
            chunk_sentences = df.loc[current_indices, 'sentence']
            results = chunk_sentences.apply(process_func).apply(pd.Series)

            # Update the dataframe with processed results
            results.columns = ["words", "lemmas", "pos", "morph", "features", "dep", "n_words"]
            df.loc[current_indices, results.columns] = results

            # Save checkpoint after each chunk
            latest_filename = save_checkpoint(output_path, df, sep=sep)
            print(f"Saved checkpoint {latest_filename}")

        except KeyboardInterrupt:
            # Manual interruption: keep what has been done so far and stop.
            print("\nProcessing interrupted. Progress saved in checkpoint file.")
            save_checkpoint(output_path, df, sep=sep)
            return df
        except Exception as e:
            # Preserve progress before propagating the error to the caller.
            print(f"Error processing chunk: {e}")
            save_checkpoint(output_path, df, sep=sep)
            raise

    # Write the final output only when no unprocessed rows remain.
    unprocessed_indices = find_unprocessed_indices(df, 'words')
    if len(unprocessed_indices) == 0:
        print("Processing complete")
        df.to_csv(output_path, index=False, sep=sep)

    return df

#### Split into two parts to be processed in parallel

In [7]:
# Row and column count of the full frame before it is split in two.
df.shape

(6690845, 1)

In [8]:
# Cut the frame at its midpoint so the two halves can be processed in parallel.
midpoint = len(df) // 2
df1, df2 = df.iloc[:midpoint], df.iloc[midpoint:]

print(f"df1 shape: {df1.shape}")
print(f"df2 shape: {df2.shape}")

df1 shape: (3345422, 1)
df2 shape: (3345423, 1)


##### Part 1: Add pos and morph tagging using UDPipe

In [None]:
# Destination of the UDPipe tagging result for the first half of the data.
output_path = f'{data_processed_dir}/sent_fiction_nlp_features_part1_v1.tsv'

# The tagging runs only when saved data is not being reused; progress is
# checkpointed per chunk by process_dataframe_with_checkpoints.
if not LOAD_SAVED_DATA:
    chunk_size = 10000
    df_pos_udpipe_part1 = process_dataframe_with_checkpoints(
        df1,
        convert_udpipe_to_spacy,
        chunk_size=chunk_size,
        output_path=output_path,
        sep = '\t'
    )

##### Part 1: Add gender, number, person

> explain that this extracts gender, number and person

In [None]:
if not LOAD_SAVED_DATA:
    input_path = f'{data_processed_dir}/sent_fiction_nlp_features_part1_v1.tsv'
    output_path = f'{data_processed_dir}/sent_fiction_nlp_features_part1_v2.tsv'
    chunk_size = 100_000

    # Stream the input in chunks so the whole file never has to fit in memory.
    # The context manager closes the underlying file even if process_chunk raises.
    with pd.read_csv(input_path, chunksize=chunk_size, low_memory=True, sep="\t") as reader:
        # Process and write chunks incrementally
        for i, chunk in enumerate(reader):
            processed_chunk = process_chunk(chunk)
            # First chunk creates the file and writes the header; later chunks append.
            mode = "w" if i == 0 else "a"
            header = i == 0
            processed_chunk.to_csv(output_path, mode=mode, header=header, index=False, sep='\t')
            print(f"Processed chunk {i+1} with {len(processed_chunk)} rows")

##### Part 1: Add case

In [None]:
# Input: the part 1 file written by the gender/number/person step.
input_path = f'{data_processed_dir}/sent_fiction_nlp_features_part1_v2.tsv'

if not LOAD_SAVED_DATA:
    df_pos_udpipe_part1 = pd.read_csv(input_path, low_memory=True, sep="\t")
    # Parse the text of each "morph" cell back into a Python literal
    # (presumably the list of tags written by the tagging step — TODO confirm).
    df_pos_udpipe_part1["morph"] = df_pos_udpipe_part1["morph"].apply(ast.literal_eval)
    # Apply parse_nlp_morphtags per row and expand its result into a frame
    # whose single column is named "case".
    results = df_pos_udpipe_part1["morph"].apply(parse_nlp_morphtags).apply(pd.Series)
    results.columns = ["case"]
    df_pos_udpipe_part1[results.columns] = results

##### Part 1: Convert to lowercase

In [None]:
if not LOAD_SAVED_DATA:
    # Lowercase the text columns; each one is handled through its .str accessor.
    columns_to_lower = ['sentence', 'words', 'lemmas']
    lowered_columns = df_pos_udpipe_part1[columns_to_lower].apply(lambda column: column.str.lower())
    df_pos_udpipe_part1[columns_to_lower] = lowered_columns

##### Part 1: Remove columns with repeated information and save the final version

In [None]:
# NOTE(review): the file is written tab-separated but carries a .csv extension;
# confirm what downstream readers expect before renaming it to .tsv.
output_path = f'{data_processed_dir}/sent_fiction_nlp_features_part1_v3_final.csv'

if not LOAD_SAVED_DATA:
    # Drop the 'morph', 'features' and 'dep' columns before saving the final version.
    df_pos_udpipe_part1_no_repeat = df_pos_udpipe_part1.drop(columns=['morph', 'features', 'dep'])

    df_pos_udpipe_part1_no_repeat.to_csv(output_path, index=False, sep='\t')

##### Part 2: Add pos and morph tagging using UDPipe

In [None]:
# Destination of the UDPipe tagging result for the second half of the data.
output_path = f'{data_processed_dir}/sent_fiction_nlp_features_part2_v1.tsv'

# The tagging runs only when saved data is not being reused; progress is
# checkpointed per chunk by process_dataframe_with_checkpoints.
if not LOAD_SAVED_DATA:
    chunk_size = 10000
    df_pos_udpipe_part2 = process_dataframe_with_checkpoints(
        df2,
        convert_udpipe_to_spacy,
        chunk_size=chunk_size,
        output_path=output_path,
        sep = '\t'
    )

##### Part 2: Add gender, number, person

> explain that this extracts gender, number and person
> 💙mii rename process_chunk to process_features


In [None]:
if not LOAD_SAVED_DATA:
    input_path = f'{data_processed_dir}/sent_fiction_nlp_features_part2_v1.tsv'
    output_path = f'{data_processed_dir}/sent_fiction_nlp_features_part2_v2.tsv'
    chunk_size = 100_000

    # Stream the input in chunks so the whole file never has to fit in memory.
    # The context manager closes the underlying file even if process_chunk raises.
    with pd.read_csv(input_path, chunksize=chunk_size, low_memory=True, sep="\t") as reader:
        # Process and write chunks incrementally
        for i, chunk in enumerate(reader):
            processed_chunk = process_chunk(chunk)
            # First chunk creates the file and writes the header; later chunks append.
            mode = "w" if i == 0 else "a"
            header = i == 0
            processed_chunk.to_csv(output_path, mode=mode, header=header, index=False, sep='\t')
            print(f"Processed chunk {i+1} with {len(processed_chunk)} rows")

##### Part 2: Add case

In [8]:
# Input: the part 2 file written by the gender/number/person step.
input_path = f'{data_processed_dir}/sent_fiction_nlp_features_part2_v2.tsv'

if not LOAD_SAVED_DATA:
    df_pos_udpipe_part2 = pd.read_csv(input_path, low_memory=True, sep="\t")
    # Parse the text of each "morph" cell back into a Python literal
    # (presumably the list of tags written by the tagging step — TODO confirm).
    df_pos_udpipe_part2["morph"] = df_pos_udpipe_part2["morph"].apply(ast.literal_eval)
    # Apply parse_nlp_morphtags per row and expand its result into a frame
    # whose single column is named "case".
    results = df_pos_udpipe_part2["morph"].apply(parse_nlp_morphtags).apply(pd.Series)
    results.columns = ["case"]
    df_pos_udpipe_part2[results.columns] = results

In [9]:
# Load the saved v2 features only when the processing steps are skipped.
# Reloading unconditionally would overwrite the frame that has just received
# its "case" column when LOAD_SAVED_DATA is False.
if LOAD_SAVED_DATA:
    df_pos_udpipe_part2 = pd.read_csv(input_path, low_memory=True, sep="\t")

##### Part 2: Convert to lowercase

In [10]:
if not LOAD_SAVED_DATA:
    # Lowercase the text columns; each one is handled through its .str accessor.
    columns_to_lower = ['sentence', 'words', 'lemmas']
    lowered_columns = df_pos_udpipe_part2[columns_to_lower].apply(lambda column: column.str.lower())
    df_pos_udpipe_part2[columns_to_lower] = lowered_columns

##### Part 2: Remove columns with repeated information and save the final version

In [11]:
# NOTE(review): the file is written tab-separated but carries a .csv extension;
# confirm what downstream readers expect before renaming it to .tsv.
output_path = f'{data_processed_dir}/sent_fiction_nlp_features_part2_v3_final.csv'

if not LOAD_SAVED_DATA:
    # Drop the 'morph', 'features' and 'dep' columns before saving the final version.
    df_pos_udpipe_part2_no_repeat = df_pos_udpipe_part2.drop(columns=['morph', 'features', 'dep'])

    df_pos_udpipe_part2_no_repeat.to_csv(output_path, index=False, sep='\t')