# Notebook for preprocessing Online (Urdu) dataset

In [26]:
import yaml
import os
import pandas as pd
from phonemize import phonemize
import phonemizer
import spacy
from tqdm import tqdm
import pickle

In [27]:
# Load the YAML configuration and initialize the phonemizer
config_path = "Configs/config.yml"
# Open the config in a context manager so the file handle is closed as soon as
# parsing finishes, and decode it explicitly as UTF-8 rather than relying on
# the platform's default encoding.
with open(config_path, encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Initialize phonemizer (espeak backend, Urdu voice); punctuation and stress
# marks are kept in the phonemized output.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='ur',
    preserve_punctuation=True,
    with_stress=True
)

In [28]:
# Initialize spacy tokenizer
# Build a blank Urdu pipeline; it is kept at module level as `nlp`.
nlp = spacy.blank('ur')

# Use the tokenizer attribute of the nlp object
# `tokenizer` is a module-level name that process_token_mapping reads directly.
tokenizer = nlp.tokenizer

In [39]:
def process_dataset(path):
    """
    Read every supported file in a directory and tokenize its text with spaCy.

    Despite the progress message printed below, no phonemization happens
    here: each text is only run through a blank Urdu spaCy pipeline.

    Supported inputs (selected by file extension, other files are ignored):
      * ``.txt``  - the whole file is treated as one text.
      * ``.csv``  - read as a tab-separated table (see the NOTE in the body).
      * ``.xlsx`` - the ``Text`` column, with emoji characters removed.

    Args:
        path: Directory containing the input files.

    Returns:
        A list with one entry per string text. A text of at most 1000
        characters yields a single spaCy ``Doc``; a longer text yields a list
        of ``Doc`` objects, one per non-empty chunk obtained by splitting on
        sentence-ending punctuation and newlines.
    """
    import re

    # Emoji / pictograph code points only. The previous pattern removed every
    # non-ASCII character, which also deleted the Urdu text itself. Written as
    # a normal (non-raw) string so the pattern holds the literal characters.
    emoji_pattern = '[\U0001F000-\U0001FAFF\u2600-\u27BF\uFE0F]+'
    # Chunk boundaries: Latin full stop, Urdu full stop (U+06D4), Arabic
    # question mark and newline. Compiled once, outside the per-text loop.
    chunk_boundary = re.compile(r'[.۔؟\n]')
    # Texts longer than this many characters are split into chunks.
    max_chars = 1000

    # Sort the listing so the order of the output does not depend on the
    # arbitrary order in which the OS returns directory entries.
    files = sorted(os.listdir(path))

    data_collected = {}

    print("Reading input files...")
    for file in tqdm(files):
        file_path = os.path.join(path, file)
        # Plain text: keep the whole file content as a single string
        if file.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                data_collected[file] = f.read()
        # CSV: keep the whole tab-separated table
        elif file.endswith('.csv'):
            data_collected[file] = pd.read_csv(file_path, delimiter='\t', encoding='utf-8')
        # Excel: keep only the 'Text' column, stripped of emoji
        elif file.endswith('.xlsx'):
            df = pd.read_excel(file_path)
            data_collected[file] = df['Text'].str.replace(emoji_pattern, '', regex=True)

    # Combining all to one list
    dataset = []
    for collected in data_collected.values():
        if isinstance(collected, pd.DataFrame):
            # NOTE(review): each DataFrame row becomes a list, and lists are
            # skipped by the isinstance(text, str) filter below, so CSV input
            # currently contributes no processed texts. Confirm which column
            # holds the text before changing this.
            dataset.extend(collected.values.tolist())
        elif isinstance(collected, pd.Series):
            dataset.extend(collected.tolist())
        else:
            dataset.append(collected)

    print(f"Total number of texts to process: {len(dataset)}")

    # Build the blank Urdu pipeline once instead of once per text.
    urdu_nlp = spacy.blank('ur')

    # Process the entire dataset
    processed_data = []
    print("Processing texts with phonemizer...")
    for text in tqdm(dataset):
        # Only strings are processed; NaN cells and row lists are skipped.
        if not isinstance(text, str):
            continue

        if len(text) > max_chars:
            # Long text: one Doc per non-empty chunk
            processed = [
                urdu_nlp(chunk)
                for chunk in chunk_boundary.split(text)
                if chunk
            ]
        else:
            processed = urdu_nlp(text)

        # Append exactly once per text (long texts used to be appended twice).
        processed_data.append(processed)

    return processed_data

In [None]:
# Process dataset
# Runs process_dataset over every supported file found in `input_path`; the
# result is kept in `processed_data` for the saving and token-mapping cells.
input_path = 'Data/To Use'
processed_data = process_dataset(input_path)

Reading input files...


 67%|██████▋   | 2/3 [00:01<00:00,  1.84it/s]

In [30]:
def save_processed_dataset(processed_data, output_path):
    """
    Pickle the processed dataset to ``output_path``.

    Missing parent directories are created first. A bare filename (no
    directory component) is written to the current working directory.

    Args:
        processed_data: Any picklable object to store.
        output_path: Destination file path of the pickle.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'wb') as f:
        pickle.dump(processed_data, f)

    print(f'Dataset saved to {output_path}')

In [31]:
def process_token_mapping(processed_data, config):
    """
    Build a lower-cased token map for the dataset and pickle it.

    Every token found under the ``'tokens'`` key of the dict items in
    ``processed_data`` (plus the configured word separator) is decoded,
    lower-cased and re-encoded with the module-level ``tokenizer``. The
    resulting ``{token: {'word': ..., 'token': ...}}`` mapping is written to
    the path stored in ``config['dataset_params']['token_maps']``.

    Args:
        processed_data: Iterable of items; only ``dict`` items are inspected.
        config: Loaded configuration providing ``dataset_params``.
    """
    # NOTE(review): this relies on `tokenizer.decode` / `tokenizer.encode`;
    # the module-level `tokenizer` is a spaCy tokenizer - confirm it actually
    # provides these methods. Also confirm that `processed_data` contains
    # dict items, otherwise only the separator token is mapped.
    dataset_params = config['dataset_params']
    separator = dataset_params['word_separator']

    # Gather the distinct tokens, seeded with the word separator
    print("Collecting unique tokens...")
    seen = {separator}
    for entry in tqdm(processed_data):
        if not isinstance(entry, dict):
            continue
        seen.update(entry.get('tokens', []))
    token_ids = list(seen)

    # Map each token to its lower-cased word and that word's first token
    print("Processing token cases...")
    token_maps = {}
    for token_id in tqdm(token_ids):
        lowered = tokenizer.decode([token_id]).lower()
        lowered_id = tokenizer.encode(lowered)[0]
        token_maps[token_id] = {'word': lowered, 'token': lowered_id}

    # Persist the mapping
    with open(dataset_params['token_maps'], 'wb') as handle:
        pickle.dump(token_maps, handle)
    print(f'Token mapper saved to {dataset_params["token_maps"]}')

In [None]:
# Save processed dataset
# The pickle is written as 'processed_dataset.pkl' inside the directory named
# by the `data_folder` entry of the loaded config.
output_path = os.path.join(config['data_folder'], 'processed_dataset.pkl')
save_processed_dataset(processed_data, output_path)

In [None]:
# Process token mapping
# Builds the token map and pickles it to config['dataset_params']['token_maps'].
# NOTE(review): process_token_mapping calls tokenizer.decode/encode on the
# module-level spaCy `tokenizer` - confirm those methods exist before running.
process_token_mapping(processed_data, config)

In [None]:
# Final status message, printed once the cells above have finished
print("Processing complete!")