In [98]:
from pprint import pprint

import numpy as np
import pandas as pd
#import sax
import re
from datetime import datetime
import locale
import pickle

#E23W3-26JM4-GGBDB

## Matching Regex and Formats
### Intro

The basic task is 

1) to match several regex on a document and insure the order of the matches and 

2) depending on which regex is matched to process the result differently. 

Think about it like having  a several documents from which you want to retrieve the first and last date and subsequently convert it to a datetime object. So on the one hand the order of the matches is important, and on the other hand we want to keep the mapping to their format in tact. Assuming the dates appear in chronological order in the document, we can determine the first and last date after transforming them, but I would also like to solve the if there is no natural order on the transformed objects or we want to avoid transforming all of them. 

### A very simple idea

The easiest thing to do would be to create a bunch of patterns and if one matches we  translate it to a datetime object and store it:


In [5]:

ds1 = '16:45, Jan 03, 2007 (UTC) blabla'
ds2 = '16:45, 03 Jan 2007 (UTC)'
ds3 = '16:45, 03 January 2007 (UTC)'

p1 = '([\d]{2}:[\d]{2}, [a-zA-Z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)'
p2 = '([\d]{2}:[\d]{2}, [\d]{1,2} [a-zA-Z]{3} [\d]{4}) \(UTC\)'
p3 = '([\d]{2}:[\d]{2}, [\d]{1,2} [A-Za-z]{3,8} [\d]{4}) \(UTC\)' 
p4 = "[\d]{2}:[\d]{2}, [A-Z][a-z]{3,8} [\d]{1,2}, [\d]{4} \(UTC\)"
p5 = '[A-Z][a-z]{3,8} [\\d]{1,2}, [\d]{4} [\d]{2}:[\d]{2} \(UTC\)'

f1 = ('%H:%M, %b %d, %Y')
f2 = ('%H:%M, %d %b %Y')

In [6]:
data = [
'16:45, Jan 03, 2007 (UTC) blabla',
'16:45, 03 Jan 2007 (UTC)'
]

d = {
    '([\d]{2}:[\d]{2}, [a-zA-Z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)' : '%H:%M, %b %d, %Y',
    '([\d]{2}:[\d]{2}, [\d]{1,2} [a-zA-Z]{3} [\d]{4}) \(UTC\)' : '%H:%M, %d %b %Y'
}

In [7]:
content = 'lore ipsum 16:45, Jan 03, 2007 (UTC) ipsum blabla' \
        '16:45, Feb 05, 2007 (UTC) blabla'

    def simple_match_and_format(content):

        p1 = '([\d]{2}:[\d]{2}, [a-zA-Z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)'
        p2 = '[A-Z][a-z]{3,8} [\\d]{1,2}, [\d]{4} [\d]{2}:[\d]{2} \(UTC\)'

        m1 = re.findall(p1, content)
        m2 = re.findall(p2, content)

        dates = []
        for match in m1:
            dates.append(datetime.strptime(m1[0], '%H:%M, %b %d, %Y'))
        for match in m2:
            dates.append( datetime.strptime(m1[-1], '%H:%M, %b %d, %Y'))
        
        return dates

print(simple_match_and_format(content))

['16:45, Jan 03, 2007', '16:45, Feb 05, 2007']


### using a dictionary

Now this certainly does not look very elegant. Furthermore we then have to determine the first and last date of all maches. But this should be avoidable and does not solve the problem if the processed result have no natural order. 

So I remember my favorite Python rule of thumb: If you find yourself writing a lot of if statements, it’s most likely a dictionary is the solution. And I think its intuitive, since a this is what we are looking for a mapping from a regex pattern to a date format. Now we can iterate over the dictionary keys and if we get a match we can use it to retrieve the corresponding date format. While this looks much more elegant, but we are still left with the problem which was the first date and which was the last one, since the regex are checked in the order in which the are returned from the dictionary.

In [9]:
def match_and_format_join(content, format_dict):
    res = []
    regex_comb = '|'.join(format_dict.keys())
    matches=re.findall(regex_comb, content)
    for match in  matches:
        res.append(datetime.strptime(matches))
    # map matching part of regex_comb to its format

Intead of findall() we can also you finditer() to a match object. Which allow for a more flexible handling of the groups. Additionaly if I understand correctly the match object only store a possitional match and return the acctual matched strings not until, gorups() is called. This might be good if only some of the results are processed. Since we strored a possitional match, we can also use this to sort our iterators if we want to ensure the order of exectution. 

This is the base loop to get the matches:

In [9]:
matches = []

def match_and_format_loop(cotent, format_dict):
    for regex in format_dict.keys():
        for match in re.finThere are 3 additional filesditer(regex, content):
            matches.append((match, regex))

    for match_i, regex_i in matches:
        print(match_i.groups(), regex_i)
        print(format_dict[regex_i], '\n')

match_and_format_loop(content, date_formats)


('16:45, Jan 03, 2007',) ([\d]{2}:[\d]{2}, [a-zA-Z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)
%H:%M, %b %d, %Y 

('16:45, Feb 05, 2007',) ([\d]{2}:[\d]{2}, [a-zA-Z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)
%H:%M, %b %d, %Y 



We can write  this as an list comprehetion. Now we have to process our matches acording to the format recieved from the format_dict. Wo can also make use of our positional matches and sort our iterators before we process the result.

In [1]:
content = 'bla bla bla 16:45, Jan 03, 2007 (UTC) blabla' \
          '16:45, 13 May 2018 (UTC)' \
          '16:45, Feb 05, 1997 (UTC) blabla'\
          '16:45, Jan 05, 2007 (UTC) blabla'

date_formats = {
    '([\d]{2}:[\d]{2}, [a-zA-Z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)': '%H:%M, %b %d, %Y',
    '([A-Z][a-z]{3,8} [\\d]{1,2}, [\d]{4} [\d]{2}:[\d]{2}) \(UTC\)': '%B %d, %Y %H:%M'
}

def match_and_format(content, format_dict, ensure_order=True, verbose=True):

    matches = [(match.start(), match, regex) for regex in format_dict.keys() 
               for match in re.finditer(regex, content)]
    print(matches)
    if ensure_order:
        matches.sort(key= lambda x: x[0])
    
    print(len(matches), ' matches:')
    res = []
    for _, match_i, regex_i in matches:
        res.append(datetime.strptime(match_i.group(1), format_dict[regex_i]))
        
        if verbose:
            print(match_i.group(1), regex_i)
            print(format_dict[regex_], '\n')

    return res

match_and_format(content, date_formats)


NameError: name 're' is not defined

So far I have explixitly written the fuction I used for formating our match in our processing loop, but what we can also use the dictionary to supply arbitrary functions. So I defined a couple of simple mathematical function blow. Inside the code of a predifined function we put the content of format_dict inside the mapping variable. 

But now we are left with a new problem, what if we have predifined method parameters, like the C-Date Format for the strptime function? The way we have built the it we would have to write a function for each different date format. That's not what I meant with I want do make  this nice and beautiful. So some more tinkering. 

In [44]:
from math import exp
def my_add(x, y):
    return int(x) + int(y)

def my_mult(x,y):
    return int(x) * int(y)

def my_exp(x):
    return exp(int(x))

content = '1+2 15+10 e(1)'\
    'bla bla bla 16:45, Jan 03, 2007 (UTC) blabla'

math_dict = {'([\d]{1,3})\+([\d]{1,3})' : my_add,
              '([\d]{1,3})\*([\d]{1,3})' : my_mult,
              'e{1}\(([\d]{1,3})\)': my_exp}


def match_and_format_adv(content, format_dict, ensure_order=True, verbose=True):

    matches = [(match.start(), match, regex) for regex in format_dict.keys() for match in re.finditer(regex, content)]
    if ensure_order:
        matches.sort(key= lambda x: x[0])
        
    print(len(matches), ' matches:')
    res = []
    for _, match_i, regex_i in matches:
        mapping = format_dict[regex_i]
        res.append(mapping(*match_i.groups()))
        
        if verbose:
            print(match_i.groups(), regex_i)       
            print(format_dict[regex_i])
            print(res, '\n')
    return res

print(match_and_format_adv(content, math_dict))


3  matches:
('1', '2') ([\d]{1,3})\+([\d]{1,3})
<function my_add at 0x7f832bf3f510>
[3] 

('15', '10') ([\d]{1,3})\+([\d]{1,3})
<function my_add at 0x7f832bf3f510>
[3, 25] 

('1',) e{1}\(([\d]{1,3})\)
<function my_exp at 0x7f832b694d08>
[3, 25, 2.718281828459045] 

[3, 25, 2.718281828459045]


In [45]:
def match_and_format_sortable(content, format_dict, ensure_order=True ,verbose=True):

    matches = [(match.start(), match, regex) for regex in format_dict.keys() 
               for match in re.finditer(regex, content)]
    if ensure_order:
        matches.sort(key= lambda x: x[0])
    
    print(len(matches), ' matches:')
    res = []
    for _, match_i, regex_i in matches:
        mapping = format_dict[regex_i]
        res.append(mapping(*match_i.groups()))
        
        if verbose:
            print(match_i.groups(), regex_i)       
            print(format_dict[regex_i])
            print(res, '\n')            
    return res

print(match_and_format_adv(content, math_dict))

3  matches:
('1', '2') ([\d]{1,3})\+([\d]{1,3})
<function my_add at 0x7f832bf3f510>
[3] 

('15', '10') ([\d]{1,3})\+([\d]{1,3})
<function my_add at 0x7f832bf3f510>
[3, 25] 

('1',) e{1}\(([\d]{1,3})\)
<function my_exp at 0x7f832b694d08>
[3, 25, 2.718281828459045] 

[3, 25, 2.718281828459045]







So if we want supply parameters to our function we will have to store them with the function. Now there are basically 3 cases that we have to be able to handle. 

1) we only want to return the content of our match

2) we want to supply the content of our match to a function 

3) we want to supply the content of our match + some predefined parameters to a function

How we archive 2) we have already figured out. 1) we have ignored so far. So let's look a 3). 

So I figured what I need is a general wrapper for my functions. You can see that it is called with the matched strings first and then with the predefined parameters second. So as long as we can ensure, that's the order our function expects its inputs, we can ensure it to be exectued correctly.


Now what else could I optimize. I wonder if there is a better way of handeling the type conversions. Because there is still a case that my solution can't handle nicely. Everytime a function needs input differnet that string, I have to write a new function. Anybody could think about how to get around this?
The second thing I thought would be nice is to be able to select what data is actually handled. 

In [10]:
content = '1+2 15+10 15**2' \
   'bla bla bla 16:45, Jan 03, 2007 (UTC) blabla' \
          '16:45, 13 May 2018 (UTC)' \
          '16:45, Feb 05, 1997 (UTC) blabla'\
          '16:45, Jan 05, 2007 (UTC) blabla'\
        'START'


def my_wrapper(func, groups, param):
    print(f'Function: {func} \n Groups: {groups} \n Parameters: {param}')
    if func:
        return func(*groups, *param)
    else:
        return groups

def my_add(x,y):
    return int(x) + int(y)

wrapper_formats = {
    '([\d]{2}:[\d]{2}, [a-zA-Z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)': 
    (datetime.strptime, '%H:%M, %b %d, %Y'),
    '([A-Z][a-z]{3,8} [\\d]{1,2}, [\d]{4} [\d]{2}:[\d]{2}) \(UTC\)': 
    (datetime.strptime, '%B %d, %Y %H:%M'),
    '(\d{1,3})\+(\d{1,3})' : 
    (my_add, ),
    '(START)' : (None,)
}

def match_and_format_func(content, format_dict, ensure_order=True ,verbose=True):
    
    matches = [(match.start, match, regex) for regex in format_dict.keys() 
               for match in re.finditer(regex, content)]
    
    
    res = []
    for _, match_i, regex_i in matches:
            func = format_dict[regex_i][0]
            groups = match_i.groups()
            param = format_dict[regex_i][1:]
            tmp = my_wrapper(func, groups, param )
            res.append(tmp)
    
    #res = [my_wrapper(format_dict[regex_i][0], match_i.groups() , format_dict[regex_i][1:]) 
           # for _, match_i, regex_i in matches]    
    re
    return res

res = match_and_format_func(content, wrapper_formats, False, False)
pprint(res)

Function: <built-in method strptime of type object at 0x7f6e749bf900> 
 Groups: ('16:45, Jan 03, 2007',) 
 Parameters: ('%H:%M, %b %d, %Y',)
Function: <built-in method strptime of type object at 0x7f6e749bf900> 
 Groups: ('16:45, Feb 05, 1997',) 
 Parameters: ('%H:%M, %b %d, %Y',)
Function: <built-in method strptime of type object at 0x7f6e749bf900> 
 Groups: ('16:45, Jan 05, 2007',) 
 Parameters: ('%H:%M, %b %d, %Y',)
Function: <function my_add at 0x7f6e422c9ae8> 
 Groups: ('1', '2') 
 Parameters: ()
Function: <function my_add at 0x7f6e422c9ae8> 
 Groups: ('15', '10') 
 Parameters: ()
Function: None 
 Groups: ('START',) 
 Parameters: ()
[datetime.datetime(2007, 1, 3, 16, 45),
 datetime.datetime(1997, 2, 5, 16, 45),
 datetime.datetime(2007, 1, 5, 16, 45),
 3,
 25,
 ('START',)]
