Hello and welcome to another file we're working on together! 😀

In this one, I'm sharing some handy (and sometimes life-saving) functions for data cleaning in NLP tasks.
Ready to clean up that messy text? Let's dive in!

In [3]:
pip install fasttext-langdetect

Collecting fasttext-langdetect
  Downloading fasttext-langdetect-1.0.5.tar.gz (6.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fasttext>=0.9.1 (from fasttext-langdetect)
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext>=0.9.1->fasttext-langdetect)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext-langdetect, fasttext
  Building wheel for fasttext-langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext-langdetect: filename=fasttext_langdetect-1.0.5-py3-none-any.whl size=7504 sha256=3b9da1b8cce7e1aa3982b7

In [24]:
class TextCleaner:
    """
    A collection of text cleaning functions for NLP tasks, including removing non-UTF8 characters,
    dates, times, URLs, and more. This class also initializes a language prediction model for
    checking if text is in English.
    """

    def __init__(self, model=None):
        """
        Initializes the TextCleaner class, importing necessary modules and setting up
        a language model for identifying English text.

        Args:
            model: A language prediction model for detecting if text is in English.
                   If no model is provided, the English detection functionality will be unavailable.
        """
        # Import necessary modules
        global re
        import re  # Importing regex module for text cleaning

        #self.model = model  # Setting up the language model for English identification

    def is_english(self, text: str) -> bool:
        """
        Checks if the given text is in English using a language prediction model.

        Args:
            text (str): The text to be analyzed.
            model: A language prediction model that returns a language code for the text.

        Returns:
            bool: True if the text is in English, False otherwise.
        """
        # Predicting the language code using the model and joining the result into a single string
        from ftlangdetect import detect

        result = detect(text, low_memory=False)

        # Checking if the predicted language is English ('en')
        return result['lang'] == 'en'

    def remove_hashtags(self, text: str) -> str:
        """
        Removes hashtags and mentions (starting with # or @) from the given text.

        Args:
            text (str): The input text from which hashtags and mentions should be removed.

        Returns:
            str: The cleaned text with hashtags and mentions removed.
        """
        # Using regex to remove any word that starts with # or @, including any leading spaces
        cleaned_text = re.sub(r"(?:^|\s)[＃#@]{1}(\w+)", '', text)

        return cleaned_text

    def remove_non_utf8(self, text: str) -> str:
        """
        Removes non-UTF8 and non-ASCII characters from the input text.
        Also removes apostrophes and any unwanted symbols, leaving only ASCII characters.

        Args:
            text (str): The input text from which non-UTF8 and non-ASCII characters should be removed.

        Returns:
            str: The cleaned text with only valid ASCII characters.
        """
        # Remove a specific UTF-8 encoded apostrophe pattern
        text = re.sub(r'\xe2\x80\x99', '', text)

        # Remove non-ASCII characters (those outside the range of \x00 to \x7f)
        text = re.sub(r'[^\x00-\x7f]+', '', text)

        return text

    def remove_anything_between(self, text: str) -> str:
        """
        Removes any text between parentheses and also removes HTML tags from the input text.

        Args:
            text (str): The input text from which content between parentheses and HTML tags should be removed.

        Returns:
            str: The cleaned text without content between parentheses and without HTML tags.
        """
        # Remove anything inside parentheses
        text = re.sub(r'\((.*?)\)', '', text)

        # Remove HTML tags by finding anything between angle brackets and replacing it with a space
        text = re.sub(r'<.*?>', ' ', text)

        return text

    def remove_date(self, text: str) -> str:
        """
        Replaces different date formats in the text with the word 'date'.

        Args:
            text (str): The input text from which dates should be removed.

        Returns:
            str: The text with dates replaced by the word 'date'.
        """
        # Replace ISO format dates like "2007-05-20" with 'date'
        text = re.sub(r'\d{4}-\d{2}-\d{2}', 'date', text)

        # Replace dates in the format 20-05-2007 or 05-20-07 with 'date'
        text = re.sub(r'\d{2}-\d{2}-\d{2,4}', 'date', text)

        # Replace dates in the format 20/05/2007 with 'date'
        text = re.sub(r'\d{2}/\d{2}/\d{2,4}', 'date', text)

        # Replace dates in the format 20-May-2007 with 'date'
        text = re.sub(r'\d{2}-[A-Za-z]{3,}-\d{2,4}', 'date', text)

        # Replace dates with ordinal numbers like "20th May 2007", "1st", "2nd", "3rd" with 'date'
        text = re.sub(r'\d{1,2}(st|nd|rd|th)? [A-Za-z]+ \d{4}', 'date', text)

        # Replace dates in the format "Sunday, May 20, 2007" or "20 May 2007" with 'date'
        text = re.sub(r'([A-Za-z]+,)?(\s*[A-Za-z]+ \d{1,2},?\s*\d{4})', 'date', text)

        return text

    def remove_hour(self, text: str) -> str:
        """
        Replaces time expressions in the text with the word 'hour'.

        Args:
            text (str): The input text from which hours (with AM/PM) should be removed.

        Returns:
            str: The text with time expressions replaced by the word 'hour'.
        """
        # Replace hours like "02:30 PM", "11 am", "11 AM", or "11 p.m." with 'hour'
        text = re.sub(r'((([0-9]{2}:[0-9]{2})|(\d{2}))\s*(am|pm|AM|PM|p.m.|a.m.))', 'hour', text)

        return text

    def remove_url(self, text: str) -> str:
        """
        Removes URLs from the input text.

        Args:
            text (str): The input text from which URLs should be removed.

        Returns:
            str: The text with URLs replaced by a space.
        """
        # Simplified regex to match most common URL formats
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)

        return text


    def remove_emails(self, text: str) -> str:
        """
        Removes email addresses from the input text.

        Args:
            text (str): The input text from which email addresses should be removed.

        Returns:
            str: The text with email addresses replaced by a space.
        """
        text = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', ' ', text)
        return text

**Brief explanations for regex expressions:**

1. From the *remove_hashtags* function:

`r"(?:^|\s)[＃#@]{1}(\w+)"`

- `(?:^|\s)` : This looks for either the start of the line (^) or any space (\s) before the hashtag or mention. We're not capturing this part, just checking.
- `[＃#@]{1}` : Matches exactly one # or @ symbol (including the wide version ＃).
- `(\w+)` : Captures the word (letters, numbers, or underscores) right after the hashtag or mention.



`r'[^\x00-\x7f]+'`

- This strips out any characters that aren’t part of basic ASCII, like emojis 🐠, accented letters (é), or special symbols. Only standard English characters, numbers, and common symbols are kept.


2. From the *remove_non_utf8* function:

`r'\xe2\x80\x99'`

- This removes the fancy apostrophe (’) that often shows up in text copied from the web or word processors. It's the UTF-8 version of a regular apostrophe.


3. From the *remove_anything_between* function:

`r'\((.*?)\)'`

Finds and removes anything inside parentheses (). The `.*?` is like saying "grab just the stuff inside the closest pair of parentheses without being greedy."

`r'<.*?>'`


This one looks for HTML tags (anything between < >) and removes them. The `.*?` ensures it only grabs one tag at a time, so it doesn't go wild and remove too much!

4. From the remove_date function:

`r'([A-Za-z]+,)?(\s*[A-Za-z]+ \d{1,2},?\s*\d{4})'`

- `([A-Za-z]+,)?` : This part optionally matches a day of the week (like "Monday") followed by a comma, but it’s not mandatory.
- `\s*[A-Za-z]+` : Matches the full month name (like "February") with optional leading spaces (\s*).
- `\d{1,2}` : Matches the day of the month, either one or two digits (e.g., "5" or "05").
- `,?\s*` : Matches an optional comma and any extra spaces after the day.
- `\d{4}` : Matches the four-digit year (e.g., "2008").

`r'\d{1,2}(st|nd|rd|th)? [A-Za-z]+ \d{4}'`


- `\d{1,2}` : Matches the day of the month, which can be 1 or 2 digits (e.g., 1, 23).
- `(st|nd|rd|th)?` : Matches the ordinal suffix (st, nd, rd, th) if present (e.g., 1st, 2nd, 3rd, 4th), but it's optional (?).
- `[A-Za-z]+` : Matches the month name (e.g., February).
- `\d{4}` : Matches the four-digit year (e.g., 2008)

5. From the remove_hour function:

`r'((([0-9]{2}:[0-9]{2})|(\d{2}))\s*(am|pm|AM|PM|p.m.|a.m.))'`

- `([0-9]{2}:[0-9]{2})` : Matches time in the format 02:30, with two digits before and after the colon.
- `(\d{2})` : Matches time written with only two digits (like 11 for 11 o'clock).
- `\s*` : Matches any optional space between the time and the AM/PM marker.
- `(am|pm|AM|PM|p.m.|a.m.)` : Matches AM/PM in different formats, such as lowercase (am), uppercase (PM), or even with periods (a.m., p.m.).

6. From the remove_url function:

`r'https?://\S+|www\.\S+'`

- `https?://` : Matches http:// or https:// at the beginning of the URL.
- `\S+` : Matches any sequence of non-whitespace characters after the http:// or https:// (the rest of the URL).
- `|` : The OR operator, to match different patterns.
www\.\S+ : Matches URLs starting with www. followed by any sequence of non-whitespace characters.

In [25]:
# Define the TextCleaner class (as provided earlier)
cleaner = TextCleaner()

# Example paragraph for testing
text = "Hey everyone! I just wanted to share my experience. I visited https://example.com at 02:30 PM on 1st February 2023,  and it was fantastic! #BestExperienceEver Also, I attended an event on 12-11-2018 (don't ask about that one, though).   You can contact me at john.doe@example.com or check my website www.johndoe.com. My meeting tomorrow is at 09:00 am.    See you at 3rd January 2022 or maybe on 5th March, 2021. Cheers!"

#text = "Bugün hava çok güzel"

#text = "Check out https://example.com at 02:30 PM, and on 1st February 2023!"

# Run all cleaning methods step by step
cleaned_text = cleaner.remove_url(text)            # Remove URLs
cleaned_text = cleaner.remove_hour(cleaned_text)   # Remove time expressions
cleaned_text = cleaner.remove_date(cleaned_text)   # Remove date expressions
cleaned_text = cleaner.remove_hashtags(cleaned_text)  # Remove hashtags
cleaned_text = cleaner.remove_anything_between(cleaned_text)  # Remove text in parentheses and HTML tags (if any)
cleaned_text = cleaner.remove_emails(cleaned_text)  # Remove emails
cleaned_text = cleaner.remove_non_utf8(cleaned_text)  # Remove non-UTF8 characters

# Check if the cleaned text is in English
is_english = cleaner.is_english(cleaned_text)

# Output the cleaned text and language detection result
print("Cleaned Text:\n", cleaned_text)
print("\nIs the text in English?:", is_english)


Cleaned Text:
 Hey everyone! I just wanted to share my experience. I visited   at hour on date,  and it was fantastic! Also, I attended an event on date .   You can contact me at   or check my website   My meeting tomorrow is at hour.    See you at date or maybe on 5th March, 2021. Cheers!

Is the text in English?: True
