# Some Example Code to Work with Regular Expressions

First we build a mini-corpus to work with

Some helper functions from last week:

In [4]:
import os
import gzip
import json
from pathlib import Path

def save_data(texts, filename):
    """Write ``texts`` as gzip-compressed JSON to a new, not-yet-existing file.

    The target name is built from ``filename`` by appending ``_<n>.json.gz``,
    where ``<n>`` is the smallest non-negative integer for which no file of
    that name exists yet, so earlier saves are never overwritten.

    Args:
        texts: Any JSON-serializable object (the notebook passes a list of str).
        filename: Base path without counter and extension, as ``pathlib.Path``
            or ``str``.

    Returns:
        pathlib.Path: The path of the file that was written.
    """
    base = Path(filename)
    counter = 0
    while True:
        # Build the complete file name explicitly. Using ``with_suffix`` here
        # would replace everything after the last dot of the base name and
        # thereby drop the counter for names such as 'corpus.v1'.
        candidate = base.with_name(base.name + '_' + str(counter) + '.json.gz')
        if not os.path.isfile(candidate):
            break
        counter += 1
    with gzip.GzipFile(candidate, 'w') as data_file:
        data_file.write(json.dumps(texts).encode('utf-8'))
    return candidate


def load_data(filename):
    """Read a gzip-compressed, UTF-8 encoded JSON file and return its content.

    Args:
        filename: Path of the ``.json.gz`` file (``str`` or path-like).

    Returns:
        The deserialized JSON value.
    """
    # The context manager closes the file handle even if decoding fails.
    with gzip.GzipFile(filename) as data_file:
        return json.loads(data_file.read().decode('utf-8'))

You can skip the following cell and instead read your own corpus!

In [10]:
import requests
import re


def getWikiText(title):
    """Fetch the plain-text extract of one English Wikipedia page.

    Sends an ``action=query`` / ``prop=extracts`` request with ``explaintext``
    to the MediaWiki API and looks at the first page in the answer.

    Args:
        title: Page title to look up.

    Returns:
        str: The page's ``extract`` field, or ``''`` when the answer contains
        no page or the page has no extract.
    """
    response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            'explaintext': True
        }
    ).json()
    # Answers without a 'query'/'pages' section are treated like a page
    # without extract instead of raising KeyError/StopIteration.
    pages = response.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), None)
    if page is not None and 'extract' in page:
        return page['extract']
    return ''


def getWikiPages(category):
    """Collect the titles of all pages in an English Wikipedia category.

    Uses the MediaWiki ``list=categorymembers`` query restricted to
    ``cmtype=page`` and follows the API's continuation data until the
    answer no longer contains a ``continue`` section.

    Args:
        category: Category name without the ``Category:`` prefix.

    Returns:
        list[str]: Page titles in the order the API returned them.
    """
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': 'Category:' + category,
        'cmtype': 'page',
        'format': 'json'
    }
    titles = []
    while True:
        response = requests.get(
            'https://en.wikipedia.org/w/api.php',
            params=params
        ).json()
        if not response:
            break
        titles.extend(member['title'] for member in response['query']['categorymembers'])

        continuation = response.get('continue')
        if not continuation or not continuation.get('cmcontinue'):
            break
        # Echo back every continuation value (not just 'cmcontinue'),
        # which is how the API asks clients to request the next batch.
        params.update(continuation)

    return titles


textlist = []
h_mark = re.compile(r'==')

# Categories whose member pages form the mini-corpus. Further candidates:
# 'Statistical natural language processing', 'Machine learning algorithms',
# 'Artificial intelligence', 'Knowledge representation'.
pages = []
for category in ('Machine learning', 'Applied machine learning'):
    pages.extend(getWikiPages(category))

for title in pages:
    text = getWikiText(title)
    if len(text) <= 200:
        # Very short (or empty) extracts are not added to the corpus.
        continue
    # Every '==' becomes a blank line (presumably these are the wiki
    # heading markers, as the name h_mark suggests).
    text = h_mark.sub('\n\n', text)
    textlist.append(text)

output_data_file_path = Path('.') / 'mycorpus'
save_data(textlist, output_data_file_path)

WindowsPath('mycorpus_0.json.gz')

We read the data again:

In [11]:
 # NOTE(review): the file name is hard-coded. save_data appends the first unused
 # numeric suffix, so a repeated save writes mycorpus_1.json.gz (and so on) while
 # this line keeps reading mycorpus_0.json.gz.
 corpus_texts = load_data('mycorpus_0.json.gz')

# Try out some regex search

In order to search for a pattern in the corpus, we iterate over all texts and, within each text, over all lines. Instead of iterating over lines we could also do a multiline search; for this, some flags have to be set in the regular expression functions. We will stay with the simple example and search in texts without line breaks.

In [26]:
import re

# For every line of every text, report the FIRST occurrence of a word starting
# with 'overfit' or 'underfit': text index, line index, offset in line, word.
for text_nr, text in enumerate(corpus_texts):
    lines = re.split(r'[\n\r]+', text)
    for line_nr, line in enumerate(lines):
        # Raw string: '\w' in a normal string literal is an invalid escape sequence.
        match = re.search(r'(over|under)fit\w*', line)
        if match:
            print(text_nr, line_nr, match.start(), match.group(0), sep='\t')

0	28	411	overfitting
0	116	405	overfitting
25	1	182	underfitting
25	2	204	overfitting
25	5	388	overfitting
25	2057	347	overfitting
32	338	14	overfitting
45	1	264	overfitting
45	182	402	overfitting
45	319	153	overfitting
45	324	110	overfitting
45	328	80	overfitting
45	391	71	overfitting
45	409	26	overfitting
45	413	39	overfitting
45	415	30	overfitting
45	502	156	overfitting
48	0	781	overfitting
48	174	286	overfitted
48	749	660	overfitting
65	0	78	overfitting
65	8	125	overfitting
65	134	203	overfit
65	465	59	overfitting
65	637	217	overfitting
65	642	426	overfitting
65	644	62	overfitting
77	16	346	overfitting
81	38	319	overfitting
86	614	344	overfitting
88	34	388	overfitting
90	9	177	overfitting
98	4	90	overfitting
103	1564	49	overfitting
110	739	65	overfitting
127	51	722	overfit
128	4	89	overfitting
139	2	1050	overfitting
146	674	35	overfit
158	0	15	overfitting
158	1	16	underfitting
158	2	19	overfitting
158	4	18	overfitting
158	5	39	overfitting
158	7	192	overfitting
158	8	355	overfitted


We still might miss some matches, since we only find the first occurrence on each line! So if we have a match we have to continue searching!

In [24]:
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
pattern = re.compile(r'(over|under)fit\w*')

# Report EVERY occurrence per line: text index, line index, offset in line, word.
for text_nr, text in enumerate(corpus_texts):
    lines = re.split(r'[\n\r]+', text)
    for line_nr, line in enumerate(lines):
        # finditer yields all non-overlapping matches from left to right, so no
        # manual start offset (and no hand-computed length bound) is needed;
        # this also covers lines that consist of the bare word 'overfit'.
        for match in pattern.finditer(line):
            print(text_nr, line_nr, match.start(), match.group(0), sep='\t')

0	28	411	overfitting
0	116	405	overfitting
25	1	182	underfitting
25	2	204	overfitting
25	5	388	overfitting
25	5	535	overfit
25	5	551	underfit
25	2057	347	overfitting
25	2057	476	overfitting
32	338	14	overfitting
45	1	264	overfitting
45	182	402	overfitting
45	319	153	overfitting
45	324	110	overfitting
45	328	80	overfitting
45	328	114	overfitting
45	391	71	overfitting
45	409	26	overfitting
45	409	183	overfitting
45	413	39	overfitting
45	413	102	overfitting
45	415	30	overfitting
45	415	446	overfitting
45	502	156	overfitting
48	0	781	overfitting
48	174	286	overfitted
48	749	660	overfitting
65	0	78	overfitting
65	8	125	overfitting
65	134	203	overfit
65	465	59	overfitting
65	637	217	overfitting
65	642	426	overfitting
65	644	62	overfitting
77	16	346	overfitting
81	38	319	overfitting
81	38	392	overfitting
86	614	344	overfitting
88	34	388	overfitting
88	34	725	overfitting
90	9	177	overfitting
98	4	90	overfitting
103	1564	49	overfitting
110	739	65	overfitting
127	51	722	overfit
128	4	89	overfitt

Instead of printing we can also count the words:

In [19]:
from collections import Counter

# Compiled here again so that this cell gives the same result no matter which
# other cell last assigned the global name `pattern`.
pattern = re.compile(r'(over|under)fit\w*')

w_freq = Counter()      # complete matched word -> number of occurrences
pref_freq = Counter()   # prefix ('over' / 'under') -> number of occurrences

for text in corpus_texts:
    for line in re.split(r'[\n\r]+', text):
        # finditer visits every non-overlapping match in the line.
        for match in pattern.finditer(line):
            w_freq[match.group(0)] += 1
            pref_freq[match.group(1)] += 1

print(w_freq.most_common())
print(pref_freq.most_common())

[('overfitting', 72), ('overfit', 9), ('overfitted', 6), ('underfitting', 3), ('underfitted', 2), ('underfit', 1), ('overfits', 1)]
[('over', 88), ('under', 6)]


## A second example

In [23]:
# article + one of three adjectives + following word (captured as group 3).
# The leading \b keeps 'a', 'an' and 'the' from matching the tail of a longer
# word (e.g. the final 'a' of 'data' in 'data simple ...').
pattern = re.compile(r'\b(a|an|the) (difficult|easy|simple) (\w+)')

w_freq = Counter()   # word following the adjective -> number of occurrences

for text in corpus_texts:
    for line in re.split(r'[\n\r]+', text):
        # finditer visits every non-overlapping match in the line.
        for match in pattern.finditer(line):
            w_freq[match.group(3)] += 1

print(w_freq.most_common(5))

[('example', 5), ('problem', 2), ('algorithm', 2), ('function', 2), ('and', 2)]
