# Regex processor class

### References for normalization
* https://towardsdatascience.com/text-normalization-7ecc8e084e31

In [1]:
import re 
import pandas as pd
from nltk.corpus import words
from fuzzywuzzy import fuzz
from word_forms.lemmatizer import lemmatize

In [2]:
class RegularExpressionProcessor:
    """
    Class to procces a regular expression and perform find, replace and delete operations given a tuple containing that information
    
    """
    
    
    def __init__(self, expressions):
        """
        Class initializer
        
        input:
        expressions: list of tuples containing the information for the operations, the first variable of the tuple is a regular expression
                     the second is one of the defined operations and in case the operation is replace it has to have a third variable that 
                     is the value to replace
                     
        attributes: 
        self.expressions: list of tuples containing the information of the input expressions
        """
        
        self.__valid_expression(expressions)
            
        self.expressions = expressions
        
        self.string_expressions = dict()
        
        
    def __valid_expression(self, expressions):
        """
        Private Method to check whether the list of tuples for the expressions is valid or not
        
        input: 
        expressions: list of tuples containing the information for the operations, the first variable of the tuple is a regular expression
                     the second is one of the defined operations and in case the operation is replace it has to have a third variable that 
                     is the value to replace
                
        output:
        return nothing if the list is valid, however, if is unvalid it gives the warning of where the error is.
        """
        for expression in expressions:
            if len(expression) < 2:
                raise Exception("Each tuple has to have at least two values")
                
            if expression[1] != "replace" and expression[1] != "find" and expression[1] != "delete" and expression[1] != "lower":
                raise Exception("The operation in the tuple is not defined")
                
            if expression[1] == "replace" and len(expression) < 3:
                raise Exception("The operation replace needs the value to replace")
                
            if type(expression[0]) != str:
                raise Exception("The first value of the tuple is not a string")
         
    def replace(self, regular_expression, string, value):
        """
        Method to process the replace function using a regular_expression
        
        input:
        regular_expressions: regular expression to mach the desired strings
        string: string of all the words where replace is going to be applied
        value: value in which the matched strings are going to be replaced for
        
        output:
        returns the new_tring with the function applied
        """
        
        new_string = re.sub(regular_expression,value,string)
        return new_string
    
    def find(self, regular_expression, string):
        """
        Method to process the find function using a regular_expression
        
        input:
        regular_expressions: regular expression to mach the desired strings
        string: string of all the words where find is going to be applied
        
        output:
        returns the string matching the regular expression
        """
        
        all_matches = []
        
        for match in re.finditer(regular_expression, string):
            all_matches.append(match.group())
            
        return all_matches     
      
    def delete(self, regular_expression, string):
        """
        Method to process the delete function using a regular_expression
        
        input:
        regular_expressions: regular expression to mach the desired strings
        string: string of all the words where delete is going to be applied
        
        output:
        returns the new_tring with the function applied
        """
        new_string = re.sub(regular_expression,'', string)
        return new_string
    
    def process(self, data):
        """
        Method to proccess an iterable set of strings applying to each string the function defined in the attribute self.expressions
        
        Input:
        data: iterable set of strings
        
        output:
        return the iterable set of string with the function already applied
        """
        
        self.string_expressions = dict()
        
        new_data = list()
        
        for each_string in data:
            self.string_expressions[each_string] = []
            
            aux = each_string
            
            for expression in self.expressions:
                if expression[1] == "lower":
                    aux = aux.lower()
                    
                elif expression[1] == "replace":
                    aux = self.replace(expression[0], aux, expression[2])
                    
                elif expression[1] == "find":
                    aux = self.find(expression[0], aux)
                    aux = ''.join(aux)
                    
                elif expression[1] == "delete":
                    aux = self.delete(expression[0], aux)
                        
                
                else:
                    raise Error["The valid function did not work correctly, you have a function not define"]
                    
                
            new_data.append(aux)
            
        return new_data

# Checking the class using the practice regex seen in class

In [3]:
list_tuples = [("\+[0-9]{12}", "replace", "a phone number"), ("((\w)*\.?)+@((\w)*\.?)+\.(mx|com)", "replace", "an email"), ("C.V.", "delete")]

In [4]:
proccesor = RegularExpressionProcessor(list_tuples)

In [5]:
proccesor.expressions

[('\\+[0-9]{12}', 'replace', 'a phone number'),
 ('((\\w)*\\.?)+@((\\w)*\\.?)+\\.(mx|com)', 'replace', 'an email'),
 ('C.V.', 'delete')]

In [6]:
pruebas = set(["Soy mario mi numero es +529994263828 y mi email es mario.campos@upy.edu.mx y mario.campos.soberanis@gmail.com y trabajo en soldai sapi de C.V."])

In [7]:
pruebas

{'Soy mario mi numero es +529994263828 y mi email es mario.campos@upy.edu.mx y mario.campos.soberanis@gmail.com y trabajo en soldai sapi de C.V.'}

In [8]:
proccesor.process(pruebas)

['Soy mario mi numero es a phone number y mi email es an email y an email y trabajo en soldai sapi de ']

# Test in DataSet

In [9]:
data = pd.read_csv("data/train_automata.csv")
data = data[["comment_text"]]

In [10]:
data

Unnamed: 0,comment_text
0,Explanation\nWhy the edits made under my usern...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,"""\nMore\nI can't make any real suggestions on ..."
4,"You, sir, are my hero. Any chance you remember..."
...,...
29995,"Thermal and chemical aren't half bad, to be ho..."
29996,This person has vanadalized a page about Joe L...
29997,"""::::You've deemed those of us who disagree wi..."
29998,I am making case for its inclusion. Some of t...


### Using the class to parse the stopwords, copied from a website, and making the regex for eliminating them

In [11]:
expression_tuples = [("(\n|\t|\xa0)", "replace", " "),
                     (r"[^\x00-\x7F]", "delete"),
                     ("[\w]+| ", "find"),
                     (" +", "replace", " ")
                     ]

In [12]:
processor_data = RegularExpressionProcessor(expression_tuples)

In [13]:
x = ["""
a
about
above
after
again
against
all
am
an
and
any
are
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can't
cannot
could
couldn't
did
didn't
do
does
doesn't
doing
don't
down
during
each
few
for
from
further
had
hadn't
has
hasn't
have
haven't
having
he
he'd
he'll
he's
her
here
here's
hers
herself
him
himself
his
how
how's
i
i'd
i'll
i'm
i've
if
in
into
is
isn't
it
it's
its
itself
let's
me
more
most
mustn't
my
myself
no
nor
not
of
off
on
once
only
or
other
ought
our
ours
ourselves
out
over
own
same
shan't
she
she'd
she'll
she's
should
shouldn't
so
some
such
than
that
that's
the
their
theirs
them
themselves
then
there
there's
these
they
they'd
they'll
they're
they've
this
those
through
to
too
under
until
up
very
was
wasn't
we
we'd
we'll
we're
we've
were
weren't
what
what's
when
when's
where
where's
which
while
who
who's
whom
why
why's
with
won't
would
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
"""]


In [14]:
x

["\na\nabout\nabove\nafter\nagain\nagainst\nall\nam\nan\nand\nany\nare\naren't\nas\nat\nbe\nbecause\nbeen\nbefore\nbeing\nbelow\nbetween\nboth\nbut\nby\ncan't\ncannot\ncould\ncouldn't\ndid\ndidn't\ndo\ndoes\ndoesn't\ndoing\ndon't\ndown\nduring\neach\nfew\nfor\nfrom\nfurther\nhad\nhadn't\nhas\nhasn't\nhave\nhaven't\nhaving\nhe\nhe'd\nhe'll\nhe's\nher\nhere\nhere's\nhers\nherself\nhim\nhimself\nhis\nhow\nhow's\ni\ni'd\ni'll\ni'm\ni've\nif\nin\ninto\nis\nisn't\nit\nit's\nits\nitself\nlet's\nme\nmore\nmost\nmustn't\nmy\nmyself\nno\nnor\nnot\nof\noff\non\nonce\nonly\nor\nother\nought\nour\nours\nourselves\nout\nover\nown\nsame\nshan't\nshe\nshe'd\nshe'll\nshe's\nshould\nshouldn't\nso\nsome\nsuch\nthan\nthat\nthat's\nthe\ntheir\ntheirs\nthem\nthemselves\nthen\nthere\nthere's\nthese\nthey\nthey'd\nthey'll\nthey're\nthey've\nthis\nthose\nthrough\nto\ntoo\nunder\nuntil\nup\nvery\nwas\nwasn't\nwe\nwe'd\nwe'll\nwe're\nwe've\nwere\nweren't\nwhat\nwhat's\nwhen\nwhen's\nwhere\nwhere's\nwhich\nwhile\

In [15]:
stopwords = processor_data.process(x)
stopwords = stopwords[0].split(" ")
stopwords.pop(0)
stopwords[0:10]

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and']

In [16]:
#Creating the regex for deleting the stopwords
string = ''
for i in stopwords:
    i = r'\b'+i+r'\b'
    if string != "":
        string = string + "|" + i
        
    else:
        string = i

stopwords_tuple = [(string, "delete")]

# Final class to make the pre pro for toxic comments

In [17]:
expression_tuples = [("\+[0-9]{12}|[0-9]{10}", "replace", "a phone number"),
                     ("https?:\/\/[\w\/\.~#]*", "replace", "a hyperlink"),
                     ("[A-Za-z0-9]+ ?[A-Za-z0-9]+?, (Inc|CO|FAC).", "replace", " a company "),
                     ("(@[a-zA-Z0-9_.]+|[a-zA-Z0-9_.]+#[0-9]+)", "replace", "a username"),
                      ("^[a-z0-9]+[._]?[a-z0-9]+[@]\w+[.]\w{2,3}$", "replace", "an email"),
                     ("(\n|\t|\xa0)", "replace", " "),
                     (r"[^\x00-\x7F]", "replace", " "),
                     ("[\w]+| ", "find"),
                     (r'([a-z])\1+', 'replace', r'\1\1'),
                     ("","lower")
                     ]
expression_tuples.extend(stopwords_tuple)
expression_tuples.extend([(" +", "replace", " ")])

* (\n|\t|\xa0): regex to eliminate enter, tab, and xa0 for non-breaking space
* r\[^\x00-\x7F\]: regex to eliminate non-ascii chracters, based on stackoverflow
* [\w]+| : regex to eliminate non-alpha numeric characters, read on the article when suggest about just using alpha numeric characters
* ([a-z])\1+: regex to eliminate same letter consicutively more than 2 times because in english there are rules that do not allow the same letter consicutively more than 2 times, for example, if it finds imbeeeecile, it will transform it into imbeecile, something that will help in the mispelling functions

#### Disclaimer: the username it might not detect all because it will only detect usernames that starts with @, same with company too many cases to think for companies name terminations

In [18]:
processor_data = RegularExpressionProcessor(expression_tuples)

In [19]:
union = processor_data.process(data.comment_text.tolist()[:100])
union

['explanation edits made username hardcore metallica fan reverted vandalisms just closure gas voted new york dolls fac please remove template talk page since retired now892053827',
 'daww matches background colour seemingly stuck thanks talk 2151 january 11 2016 utc',
 'hey man really trying edit war just guy constantly removing relevant information talking edits instead talk page seems care formatting actual info',
 ' make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc can later noone else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagood_article_nominationstransport ',
 ' sir hero chance remember page ',
 ' congratulations use tools talk ',
 'cocksucker piss around work',
 ' vandalism matt shirvington article reverted please will banned',
 'sorry word nonsense 

In [20]:
def incorrect_words_searching(strings):
    """
    Function to find words with possible mispellings
    
    input:
    string: list containing strings where to look for the mispellings
    
    """
    #making the list into a whole string
    strings = ' '.join(strings)
    strings = strings.lower()
    #getting the words list from nlkt
    words_list = words.words()
    words_list = ' '.join(words_list)
    words_list = words_list.lower()
    words_list = words_list.split(' ')
    incorrect = []
    for i in set(strings.split(' ')):
        if i != "" and i != " ":
            if i in words_list:
                pass
            #if the word is not in the word list
            else:
                try:
                    #try to get it to the rout form, if possible the word is correct
                    lemmatize(i)
                except:
                    try:
                        #try to check if the word could be actually a number
                        float(i)
                    except:
                        #if not the word has mispelling
                        incorrect.append(i)

        else:
            pass
        
    return incorrect

In [21]:
incorrects = incorrect_words_searching(union)
incorrects[:5]

['antonin', 'deadend', 'now892053827', 'wikipediafair', 'arabs']

In [22]:
def mispellings(incorrect_words):
    """
    Function to find the canonical word for a mispelling using the levenshtein distance, 
    
    input:
    incorrect_words: list containing the incorret words
    
    output:
    return a list of tuples, containing the mispelling word, the operation replace and the canonical word
    
    disclaimer: the canonical word could not be the actual canonical because some words it might not be mispelling, but the dictionary does 
    not have it
    """
    correction = []
    for i in incorrect_words:
        #temporal variables
        temp = 0
        temp_word = ''
        words_list = words.words()
        words_list = ' '.join(words_list)
        words_list = words_list.lower()
        words_list = words_list.split(' ')
        for word in words_list:
            aux = fuzz.ratio(i, word)
            if aux > 80 and aux > temp:
                temp = aux
                temp_word = word

        if temp_word != '':
            correction.append((i,"replace",temp_word))
        
    return correction

In [23]:
expression_tuples = mispellings(incorrects)
expression_tuples 

[('antonin', 'replace', 'anthonin'),
 ('deadend', 'replace', 'deaden'),
 ('arabs', 'replace', 'arabis'),
 ('harlan', 'replace', 'harleian'),
 ('possibillity', 'replace', 'possibility'),
 ('everybodys', 'replace', 'everybody'),
 ('etc', 'replace', 'etch'),
 ('syrthiss', 'replace', 'syrtis'),
 ('engineeringly', 'replace', 'engineering'),
 ('appearence', 'replace', 'apparence'),
 ('shwain', 'replace', 'swain'),
 ('itselfmaking', 'replace', 'tasselmaking'),
 ('dept', 'replace', 'adept'),
 ('unsubscribe', 'replace', 'unsubscribed'),
 ('mainspace', 'replace', 'midspace'),
 ('barnstar', 'replace', 'barnstorm'),
 ('cardozo', 'replace', 'cardoon'),
 ('expresident', 'replace', 'president'),
 ('kasuga', 'replace', 'kashga'),
 ('sternhell', 'replace', 'stenchel'),
 ('sityush', 'replace', 'situs'),
 ('inc', 'replace', 'inca'),
 ('homie', 'replace', 'home'),
 ('mcdonald', 'replace', 'donald'),
 ('asia', 'replace', 'asian'),
 ('ammended', 'replace', 'amende'),
 ('petras', 'replace', 'petrea'),
 ('wpl

In [24]:
processor_data2 = RegularExpressionProcessor(expression_tuples)

In [25]:
processor_data2.process(union)

['explanation edits made undername hardcore metallical fan reverted vandalisms just closure gas voted new york dolls face please remove template talk page sincae retired now892053827',
 'daw matches background colour seemingly stuck thanks talk 2151 january 11 2016 utch',
 'hey man really trying edit war just guy constantly removing relevant information talking edits instead talk page seems care formatting actual info',
 ' make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etch can later none else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagood_article_nominationstransport ',
 ' sir hero chance remember page ',
 ' congratulations use tools talk ',
 'cocksucker piss around work',
 ' vandalism matt shirvington article reverted please will banned',
 'sorry word nonse

# Another example

In [26]:
practice = ["You're a fuckinh imbeecileee, imbecileee, imBEsiLe you idiott dumb asssssss"]

In [27]:
practice_proce = processor_data.process(practice)
practice_proce

[' fuckinh imbeecilee imbecilee imbesile idiott dumb ass']

In [28]:
mispellings(incorrect_words_searching(practice_proce))

[('imbeecilee', 'replace', 'imbecile'),
 ('imbecilee', 'replace', 'imbecile'),
 ('idiott', 'replace', 'idiot'),
 ('imbesile', 'replace', 'imbecile')]

### Disclaimer: we consider that we did not do well in this assignment since we have not learned about NLP, so how to do the preprocessing was hard and personally we do not liket how we did it, besides, the problem with the regex to find companies and usernames is hard because it depends on how well people writes, and there are also all the possibles ways to write them, since in this dataset is not normalized how usernames are called neither companies. And also, for companies there are a lot of termination for companies and we do not know all of them.

### Disclaimer 2: We think that making the mispelling words to their canonical form, was quite a challenge and we did not did well because after reading we found that the autocorrector how they work is they have a self dictionary that is constantly learning more and more, and we could not do that :(, for two reasons we do not have time and sounds hard.