In [None]:
! pip install datasets emoji fasttext

In [2]:
from datasets import load_dataset

# Load only the "train" split of the Algerian-Darija dataset
ds = load_dataset(
    "ayoubkirouane/Algerian-Darija",
    split="train",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2324 [00:00<?, ? examples/s]

Generating v1 split:   0%|          | 0/168655 [00:00<?, ? examples/s]

In [6]:
# Rename the "Text" column to "text".
# The guard keeps this cell re-runnable: once the column has been renamed,
# a second rename_column("Text", ...) would fail because "Text" is gone.
if "Text" in ds.column_names:
    ds = ds.rename_column("Text", "text")

In [7]:
ds

Dataset({
    features: ['text'],
    num_rows: 2324
})

# Remove Emojis

In [None]:
import emoji

# Function to remove emojis
def remove_emojis(example):
    """Strip every emoji from the example's 'text' field.

    Args:
        example: mapping with a string under the 'text' key.

    Returns:
        The same mapping, with 'text' overwritten by the emoji-free string.
    """
    without_emojis = emoji.replace_emoji(example['text'], replace='')
    example['text'] = without_emojis
    return example

# Run remove_emojis over the dataset and rebind ds to the mapped result
ds = ds.map(remove_emojis)

Map:   0%|          | 0/2324 [00:00<?, ? examples/s]

# Remove URLs / Emails / Phone Numbers / special characters / English words / non-Arabic words

In [None]:
import re
def clean(example):
    """Reduce an example's text to whitespace-separated Arabic-script words.

    Steps, in order: drop URLs and e-mail addresses, strip punctuation and
    symbols, drop Latin-alphabet words and any other token made only of
    non-Arabic characters, drop 10-15 digit runs (phone numbers), then
    collapse every run of whitespace (newlines included) to one space.

    Args:
        example: mapping with a string under the 'text' key.

    Returns:
        The same mapping, with 'text' replaced by the cleaned string.
    """
    text = example['text']

    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove e-mail addresses. This has to run BEFORE punctuation is stripped:
    # once '@' and '.' are gone the pattern can no longer match, and an address
    # glued to punctuation (e.g. "label:user@mail.com") would leave residue.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)
    # Remove special characters (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove English words
    text = re.sub(r'\b[A-Za-z]+\b', '', text)
    # Remove non-Arabic words (tokens with no character in U+0600-U+06FF)
    text = re.sub(r'\b[^\u0600-\u06FF\s]+\b', '', text)
    # Remove phone numbers (10 to 15 digits)
    text = re.sub(r'\b\d{10,15}\b', '', text)
    # Collapse all whitespace, newlines included, to a single space. Newlines
    # must become a space (not ''), otherwise the last word of one line is
    # fused with the first word of the next.
    text = re.sub(r'\s+', ' ', text).strip()

    example['text'] = text
    return example

# Run the clean() text normalisation over the dataset and rebind ds to the mapped result
ds = ds.map(clean)

Map:   0%|          | 0/2324 [00:00<?, ? examples/s]

# Remove examples that are too short

In [None]:
import heapq

def paragraph_length_filter(x):
    """Decide whether an example's text is long enough to keep.

    The text is split on newlines and the (up to) three longest lines are
    considered; the example is rejected when the shortest of those has
    fewer than 3 characters.

    Args:
        x: mapping with a string under the 'text' key.

    Returns:
        True to keep the example, False to drop it.
    """
    line_lengths = [len(line) for line in x['text'].split('\n')]
    if not line_lengths:
        return False
    # Descending sort, so the last of the first three is the smallest of the top three
    longest_three = sorted(line_lengths, reverse=True)[:3]
    return longest_three[-1] >= 3
# Keep only the examples accepted by paragraph_length_filter (cache lookup disabled),
# then display the remaining row count
dataset = ds.filter(paragraph_length_filter, load_from_cache_file=False)
dataset.num_rows

Filter:   0%|          | 0/2324 [00:00<?, ? examples/s]

2295

# Remove repeated text within training examples

In [None]:
import re

def find_duplicates(paragraphs):
    """Count repeated paragraphs and the characters they account for.

    The first occurrence of a paragraph is not counted; every later
    occurrence of the same paragraph is.

    Args:
        paragraphs: iterable of hashable, sized items (strings here).

    Returns:
        A (duplicate_elements, duplicate_chars) tuple: how many items were
        repeats, and the summed length of those repeats.
    """
    seen = set()
    repeats = []
    for paragraph in paragraphs:
        if paragraph in seen:
            repeats.append(paragraph)
        else:
            seen.add(paragraph)
    return len(repeats), sum(len(paragraph) for paragraph in repeats)
def paragraph_repetition_filter(x):
    """Decide whether an example is free of excessive paragraph repetition.

    The stripped text is split into paragraphs on runs of two or more
    newlines. The example is rejected when more than 30% of its paragraphs
    are repeats, or when repeated paragraphs account for more than 20% of
    the text's characters.

    Args:
        x: mapping with a string under the 'text' key.

    Returns:
        False when the example has too many repetitions, True otherwise.
        Empty text contains no repetition and returns True.
    """
    text = x['text']
    if not text:
        # Nothing to measure; also avoids dividing by len(text) == 0 below
        return True

    # Split by paragraphs (2 or more newlines)
    paragraphs = re.split(r"\n{2,}", text.strip())
    # Number of repeated paragraphs and the characters they cover
    paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)

    # re.split always yields at least one item, so len(paragraphs) >= 1
    if paragraphs_duplicates / len(paragraphs) > 0.3:
        return False
    if char_duplicates / len(text) > 0.2:
        return False
    return True

# Keep only the examples accepted by paragraph_repetition_filter (cache lookup disabled),
# then display the remaining row count
dataset = dataset.filter(paragraph_repetition_filter, load_from_cache_file=False)
dataset.num_rows

Filter:   0%|          | 0/2295 [00:00<?, ? examples/s]

2295

# Deduplication (remove duplicate examples)

In [None]:
def deduplication(ds):
    """Drop every example whose 'text' was already seen earlier in `ds`.

    The first example carrying a given text is kept; later exact matches
    are removed. The filter is called with num_proc=1 and shares one
    in-memory set of seen texts across all calls of the predicate.

    Args:
        ds: object exposing a datasets-style ``filter`` method.

    Returns:
        Whatever ``ds.filter`` returns for the de-duplicating predicate.
    """
    seen_texts = set()

    def dedup_func(x):
        """Return True the first time a text is seen, False afterwards."""
        text = x['text']
        first_time = text not in seen_texts
        if first_time:
            seen_texts.add(text)
        return first_time

    return ds.filter(dedup_func, load_from_cache_file=False, num_proc=1)

# Drop examples whose text repeats an earlier one, then display the remaining row count
dataset = deduplication(dataset)
dataset.num_rows

Filter:   0%|          | 0/2295 [00:00<?, ? examples/s]

2280

# Quality filter - Language

In [None]:
!wget -q https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

In [None]:
import urllib
from fasttext.FastText import _FastText

def language_filter(ds, lang="ar", min_score=0.4, model_path='lid.176.bin'):
    """Keep only the examples that the fastText language-ID model assigns to `lang`.

    Args:
        ds: dataset exposing a datasets-style ``filter`` method, with a
            string 'text' field per example.
        lang: language code to keep, compared with the code parsed from the
            model's top label (default "ar").
        min_score: the top prediction's probability must be strictly greater
            than this for the example to be kept (default 0.4).
        model_path: path to the fastText language-identification model file.

    Returns:
        The filtered dataset returned by ``ds.filter``.
    """
    # load language detection model
    model = _FastText(model_path)

    def is_darija(x):
        """Return True when the top predicted language is `lang` with enough confidence."""
        # The original stripped newlines before predicting; replacing them
        # with a space does the same without fusing the words around the break.
        labels, scores = model.predict(x['text'].replace("\n", " "))
        if len(labels) == 0:
            # No prediction available (e.g. empty text): nothing to accept
            return False

        # Labels look like "__label__ar"; the language code is the third "__"-separated piece
        language = labels[0].split("__")[2]
        # Compare the top-1 probability explicitly and return a plain bool
        # rather than the truth value of an array expression.
        return bool(scores[0] > min_score) and language == lang

    ds = ds.filter(is_darija, load_from_cache_file=False, num_proc=1)
    return ds

# Keep only the examples accepted by the language filter and rebind dataset to the result
dataset = language_filter(dataset)

Filter:   0%|          | 0/2288 [00:00<?, ? examples/s]