<a href="https://colab.research.google.com/github/MuhammadHelmyOmar/ArabicPIIRedaction/blob/main/data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

## Training data

In [1]:
from google.colab import drive
import random
import pandas as pd

### Loading Data

In [None]:
# Mount Google Drive; the CSV files read later in this notebook live under /content/drive/MyDrive.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Read the complete dataset CSV from Drive into `train`.
data_path = "/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/ALL_DATA.csv"
train = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Row count first, then a preview of the first rows as the cell output.
n_rows = train.shape[0]
print(n_rows)
train.head()

In [None]:
# Which dialect labels occur, and how many rows carry each of them?
dialect_labels = train['dialect']
print(dialect_labels.unique())
print(dialect_labels.value_counts())

In [None]:
# Rows whose dialect is the string '0' are relabelled as 'eg'; the counts are printed again to confirm.
is_zero_label = train['dialect'] == '0'
train.loc[is_zero_label, 'dialect'] = 'eg'
print(train['dialect'].value_counts())



---



In [None]:
# Load the rows that were augmented with Arabic names and locations.
names_loc_path = "/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/names_locations_augmented_data.csv"
names_loc_data = pd.read_csv(names_loc_path)

# Size of the augmented set, then a preview of its first rows.
print(names_loc_data.shape[0])
names_loc_data.head()

In [None]:
# Stack the augmented rows under the main data and renumber the index from 0.
# NOTE: re-running this cell appends the augmented rows again, because `train` is reassigned.
frames_to_stack = [train, names_loc_data]
train = pd.concat(frames_to_stack, ignore_index=True)
print(len(train))
train['dialect'].value_counts()

In [None]:
# Shuffle the data
# A fixed random_state makes the shuffled row order identical on every
# "Restart & Run All", so later inspection cells and saved outputs are reproducible.
SHUFFLE_SEED = 42

train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

### Cleaning and preprocessing

In [None]:
import random
import re
from IPython.display import display, HTML
import re
import ast

In [None]:
# Pick one random row to inspect in the following cells.
# random.randint includes both endpoints, so randint(0, len(train)) could return
# len(train), one past the last row label; randrange excludes the upper bound.
idx = random.randrange(len(train))
idx

In [None]:
# Show the sampled row's tokens and tags cells with their Python types,
# followed by the length of each cell.
for column in ('tokens', 'tags'):
    cell = train[column][idx]
    print(cell)
    print(type(cell))
for column in ('tokens', 'tags'):
    print(len(train[column][idx]))

In [None]:
# Parse the sampled row's tokens cell with ast.literal_eval and report the result.
raw_tokens_cell = train['tokens'][idx]
tokens_list = ast.literal_eval(raw_tokens_cell)
print(tokens_list)
print(type(tokens_list), len(tokens_list))

# Same for the tags cell.
raw_tags_cell = train['tags'][idx]
tags_list = ast.literal_eval(raw_tags_cell)
print(tags_list)
print(type(tags_list), len(tags_list))

In [None]:
# Convert tokens and tags columns to lists

def _parse_list_cell(cell):
    """Return `cell` unchanged when it is already a list, otherwise parse it with ast.literal_eval.

    The list pass-through makes this cell safe to re-run: ast.literal_eval raises
    ValueError when handed a list object, which is what a second run would feed it.
    Any other value is still passed to ast.literal_eval and fails as before.
    """
    return cell if isinstance(cell, list) else ast.literal_eval(cell)


train['tokens'] = train["tokens"].apply(_parse_list_cell)
train['tags'] = train['tags'].apply(_parse_list_cell)

In [None]:
# Same inspection as before the conversion: value and type of each cell, then both lengths.
row_tokens = train['tokens'][idx]
print(row_tokens, type(row_tokens), sep='\n')
row_tags = train['tags'][idx]
print(row_tags, type(row_tags), sep='\n')
print(len(row_tokens), len(row_tags), sep='\n')

In [None]:
# Checking if there is mismatching lengths between tokens and tags
# The comparison must use each row's own lists. Comparing len(train.tokens) with
# len(train.tags) compares the two column lengths, which are always equal, so a
# mismatch could never be reported.
length_comparison_result = train.apply(
    lambda row: len(row['tokens']) == len(row['tags']),
    axis=1,
)
print(f"\nNumber of rows with mismatching lengths: {sum(~length_comparison_result)}")

In [None]:
# Entity types whose tokens are replaced by '[MASK]' in masker().
# NOTE(review): 'CREDITCARDINUMBER' differs from 'CREDITCARDNUMBER' by one letter —
# confirm it matches a tag that really occurs in the data.
tags_to_mask = [
    'PHONEIMEI', 'VEHICLEVRM', 'LITECOINADDRESS', 'CREDITCARDNUMBER',
    'DATE', 'NEARBYGPSCOORDINATE', 'BITCOINADDRESS', 'GENDER',
    'PERSONNAME', 'JOBTITLE', 'TIME', 'CURRENCY',
    'BIC', 'MASKEDNUMBER', 'STREET', 'MAC',
    'DOB', 'SECONDARYADDRESS', 'CREDITCARDISSUER', 'ZIPCODE',
    'USERAGENT', 'CURRENCYSYMBOL', 'JOBTYPE', 'BUILDINGNUMBER',
    'AGE', 'MIDDLENAME', 'CREDITCARDINUMBER', 'ACCOUNTNUMBER',
    'PIN', 'FIRSTNAME', 'ORDINALDIRECTION', 'PASSWORD',
    'PHONENUMBER', 'IPV4', 'CREDITCARDCVV', 'USERNAME',
    'HEIGHT', 'CURRENCYCODE', 'ACCOUNTNAME', 'IBAN',
    'AMOUNT', 'PREFIX', 'VEHICLEVIN', 'SEX',
    'EMAIL', 'ETHEREUMADDRESS', 'IPV6', 'SSN',
    'URL', 'LASTNAME', 'CURRENCYNAME', 'IP',
]

In [None]:
def masker(row, tags_to_mask, discrepancy_list):
    """
    Rebuild a row's sentence with sensitive tokens replaced by '[MASK]'.

    Tokens are looked up one after another in the row's 'clean_source' text with a
    left-to-right search that resumes where the previous token ended, so the text
    lying between two tokens is copied through unchanged.

    Args:
    row: Row (or any mapping) providing 'tokens', 'tags' and 'clean_source'.
    tags_to_mask: Entity types to mask. A tag is compared after dropping its first two characters (presumably a 'B-'/'I-' style prefix — confirm against the data).
    discrepancy_list: List mutated in place; one dict is appended for every token that cannot be found.

    Returns:
    The rebuilt sentence as a string. A token that cannot be found is appended unmasked.
    """
    tokens = row['tokens']
    tags = row['tags']
    source_text = row['clean_source']
    pieces = []
    cursor = 0  # index in source_text just past the last token that was located

    for position, token in enumerate(tokens):
        tag = tags[position]

        # Search only from the cursor onwards so repeated tokens resolve in order.
        found_at = source_text.find(token, cursor)

        if found_at == -1:
            # Token is absent at or after the cursor: warn and log it.
            print(f"Warning: Token '{token}' not found in clean_source at or after position {cursor}\n")
            discrepancy_list.append({
                'clean_source': source_text,
                'tokens': tokens,
                'tags': tags,
                'discrepancy_token': token,
                'discrepancy_position': cursor
            })
            # Keep going with the raw token; the cursor stays where it was.
            pieces.append(token)
            continue

        # Copy the text between the previous token and this one, then the token or its mask.
        pieces.append(source_text[cursor:found_at])
        pieces.append('[MASK]' if tag[2:] in tags_to_mask else token)
        cursor = found_at + len(token)

    # Whatever follows the last located token is kept verbatim.
    pieces.append(source_text[cursor:])
    return "".join(pieces)

In [None]:
# Initialize a list to store discrepancy information; masker() appends one dict
# per token it fails to locate.
discrepancy_data = []

# Creating a new masked sentence
train['masked_source'] = train.apply(lambda row: masker(row, tags_to_mask, discrepancy_data), axis=1)

# Convert the list of discrepancy data into a DataFrame.
# The columns are listed explicitly so the frame still has a 'clean_source' column
# when no discrepancy was recorded; an empty list alone yields a frame with no
# columns, and the later cells that read discrepancy_df['clean_source'] would fail.
discrepancy_df = pd.DataFrame(
    discrepancy_data,
    columns=['clean_source', 'tokens', 'tags', 'discrepancy_token', 'discrepancy_position'],
)

In [None]:
# One row of discrepancy_df corresponds to one token that could not be located.
n_discrepancies = len(discrepancy_df)
print(f"{n_discrepancies} discrepancies are found")

In [None]:
# Count the distinct sentences that produced at least one discrepancy.
# When nothing was recorded the frame may have no 'clean_source' column at all,
# so report 0 instead of failing on the attribute lookup.
if 'clean_source' in discrepancy_df.columns:
    n_discrepant_sentences = len(discrepancy_df['clean_source'].unique())
else:
    n_discrepant_sentences = 0
print(f"{n_discrepant_sentences} sentences are detected as discrepancies.")

In [None]:
# Drop discrepancies

print(f"Original number of rows: {len(train)}")

# Remove every row whose sentence appears among the recorded discrepancies.
# If no discrepancy was recorded the frame may lack a 'clean_source' column;
# in that case there is nothing to drop and `train` is left as it is.
if 'clean_source' in discrepancy_df.columns:
    has_discrepancy = train['clean_source'].isin(discrepancy_df['clean_source'])
    train = train[~has_discrepancy]

print(f"Number of rows after dropping discrepancies: {len(train)}")

In [None]:
# Original sentence next to its masked version.
train.loc[:, ['clean_source', 'masked_source']]

In [None]:
# Keep only the sentence pair and expose it under the names source/target.
train_final = (
    train[['clean_source', 'masked_source']]
    .rename(columns={'clean_source': 'source', 'masked_source': 'target'})
)

# train_final.head()

# NOTE: the save below is disabled, so `train_final` is not written to Drive by this cell.
# # Save the train data after the updates
# train_final.to_csv('/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv', index=False)

In [None]:
# Load the previously saved source/target pairs from Drive.
masked_train_path = "/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv"
train_data = pd.read_csv(masked_train_path)

In [None]:
def merge_consecutive_masks(text):
    """Collapse a run of two or more '[MASK]' markers (optionally separated by whitespace) into one.

    The whitespace that followed the last marker of the run is kept after the
    single remaining '[MASK]'.
    """
    repeated_masks = re.compile(r'(\[MASK\](\s*)){2,}')
    return repeated_masks.sub(r'[MASK]\2', text)

def display_ar_eng(text):
    """Display Arabic and English text in a readable format.

    The text is wrapped in a right-to-left <div> with a larger font and rendered
    through IPython's display(HTML(...)).

    Args:
        text: Content to show. It is inserted into the markup as-is (no HTML escaping).
    """
    # The font name is left unquoted: double quotes around it inside the
    # double-quoted style attribute ended the attribute early and broke the markup.
    markup = (
        f'<div dir="rtl" style="font-size:18px; line-height:1.8; '
        f'font-family: Arial, sans-serif;">{text}</div>'
    )
    display(HTML(markup))

In [None]:
# Collapse runs of '[MASK]' in every target sentence.
merged_targets = train_data['target'].apply(merge_consecutive_masks)
train_data['target'] = merged_targets

# Show the first source sentence and its masked target.
for column in ('source', 'target'):
    display_ar_eng(train_data[column][0])

In [None]:
# Save the training data after merging consecutive masks (uncomment the next line to overwrite the CSV loaded above)
# train_data.to_csv('/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv', index=False)