In [85]:
import re

Feel free to check the sourcecode at: 

In [86]:
class RegularExpressionProcessor:
    def __init__(self):
        self.expressions = []

    def add_expression(self, regex, operation_type, replacement=None):
        self.expressions.append((regex, operation_type, replacement))

    def replace(self, regex, string, value):
        return re.sub(regex, value, string)

    def find(self, regex, string):
        return re.findall(regex, string)

    def delete(self, regex, string):
        return re.sub(regex, '', string)

    def process(self, data):
        processed_data = []
        for text in data:
            processed_text = text
            for regex, operation_type, replacement in self.expressions:
                if operation_type == 'replace':
                    processed_text = self.replace(regex, processed_text, replacement)
                elif operation_type == 'find':
                    matches = self.find(regex, processed_text)
                    # You can do something with matches here if needed
                elif operation_type == 'delete':
                    processed_text = self.delete(regex, processed_text)
            processed_data.append(processed_text)
        return processed_data


In [87]:
# Example usage:
processor = RegularExpressionProcessor()

# Add regular expressions for processing toxic comments dataset
processor.add_expression(r'@\w+', 'delete')  # Delete media usernames
processor.add_expression(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 'delete')  # Delete emails
processor.add_expression(r'\b\d{10}\b', 'delete')  # Delete phone numbers
processor.add_expression(r'\b[A-Z][a-z]*\b', 'delete')  # Delete company names

# Normalize words and delete noisy expressions
processor.add_expression(r'(.)\1{2,}', 'replace', r'\1')  # Normalize repeated characters within words
processor.add_expression(r'[^\w\s]+', 'delete')  # Delete non-word characters
processor.add_expression(r'\s+', 'replace', ' ')  # Normalize whitespace


In [88]:
# Process data
data = [
    "a comment with @username and an email user@example.com. 1234567890",
    "another toxic comment with mispelled words like imbecile imbeeeeeeeecile.",
    "noisy expressions like \n newline \t tab and non-ascii characters.",
    "um that is your opinion i belive DBZ tenchaichi 3 and the last of the original bodikai for the PS2 bodikai 3 is just as good as virtua fighter,",
    "tekken, DOA and other well known fighting games. So come into reality and notice that everything you think is fact that is just base on what you think is highly unimpaortant and is mediocre ",
    "you used to get beaten on and you belive what you say goes without any research (more clearly without say) Get your facts straight LOL ......,  ."
    "HE IS A GHAY ASS FUCKER@@!!",
    ":::::::::You can keep reverting me to push your WP:Undue viewpoints til I take it to WP:BLPN if you want. I won't discuss it any longer. '' - ðŸ—½",
    "Ð Ð°Ð´Ñ‡ÐµÐ½ÐºÐ¾ Ð¸Ñ Ð¼Ñ‹ Ð´Ð¸Ñ€Ñ‚Ñ‹ Ð³Ð°Ñ‹ Ð°Ð½Ð°Ð» ÑÐµÑ… ÑÐ»Ð°Ð²Ðµ!!",
    "Ø­Ø§ÙˆÙ„ Ù‚Ø±Ø§Ø¡Ø© ØµÙØ­Ø§Øª Ø§Ù„Ù…Ø³Ø§Ø¹Ø¯Ø© ÙˆØ§Ù„ØªØ¹Ù„ÙŠÙ…Ø§Øª", # These are ASCII characters, "Ùˆ" (Hex: C3 99, Dec: 195 153), "Øª" (Hex: C3 98, Dec: 195 152) etc.
    "â€” Preceding unsigned comment added by" # In this examples "€" is not an ASCII character
]
processed_data = processor.process(data)

# Print processed data
for text in processed_data:
    print(text)

a comment with and an email usercom 
another toxic comment with mispelled words like imbecile imbecile
noisy expressions like newline tab and nonascii characters
 that is your opinion i belive DBZ tenchaichi 3 and the last of the original bodikai for the PS2 bodikai 3 is just as good as virtua fighter
tekken DOA and other well known fighting games come into reality and notice that everything you think is fact that is just base on what you think is highly unimpaortant and is mediocre 
you used to get beaten on and you belive what you say goes without any research more clearly without say your facts straight LOL HE IS GHAY ASS FUCKER
 can keep reverting me to push your WP viewpoints til take it to WPBLPN if you want wont discuss it any longer ðŸ½
Ð ÐÐÑÐµÐ½ÐºÐ¾ ÐÑ Ð¼Ñ ÐÐÑÑÑ Ð³ÐÑ ÐÐ½ÐÐ ÑÐµÑ ÑÐÐÐ²Ðµ
ØØÙˆÙ ÙØØØØ ØµÙØØØª ØÙÙØ³ØØ¹ØØ ÙˆØÙØªØ¹ÙÙŠÙØØª
â unsigned comment added by
