In [3]:
# regular expressions
# open the nytimes txt file (the relative path is resolved against the current
# working directory; open() raises FileNotFoundError if the file isn't there)
with open("nytimes.txt","r",encoding="utf8") as story:
    storyText = story.read() # reads everything in, newlines and all, into storyText
# review: after the with open ... block, the file closes automatically
print(storyText)
# what kind of datatype is storyText? it's a string.  one big string.

FileNotFoundError: [Errno 2] No such file or directory: 'nytimes.txt'

In [1]:
# how many characters are in the story?
# len() counts every character in the string, spaces and newlines included
len(storyText)

NameError: name 'storyText' is not defined

In [3]:
# approximately how many words are in storyText?
# split() with no arguments breaks the string apart on runs of whitespace
len(storyText.split())

1385

In [4]:
# what is the average length of each word?
# note: the character count on top includes whitespace, so this is a rough estimate
len(storyText)/len(storyText.split())

6.325631768953069

In [5]:
# a fraction of a word doesn't make sense, so let's round to the nearest whole number
round(len(storyText)/len(storyText.split()))

6

In [6]:
# // is floor division: divide, then round down to a whole number
len(storyText)//len(storyText.split())

6

In [7]:
# let's get into the wonderful, wonderful world of regular expressions
# regular expressions find and extract patterns in textual data 
# the word regex is sometimes used as an abbreviation for regular expression
# the thing we're looking for in text is often called a regex pattern
import re

In [8]:
re.findall("president",storyText)
# syntax: re.findall(regexpattern,stringYouAreLookingIn)
# re.findall(needle,haystack)
# findall returns a list with every non-overlapping match
# matching is case-sensitive: this finds "president" but not "President"

['president', 'president', 'president', 'president', 'president']

In [9]:
# regex really shines when you use regex patterns to match multiple things
# regex patterns are built with regex symbols
# \d matches any decimal digit (0-9)
# write patterns as raw strings (r"...") so python hands the backslash straight
# to the regex engine instead of trying to read it as a string escape

re.findall(r"\d",storyText)

['2',
 '0',
 '0',
 '1',
 '5',
 '7',
 '0',
 '5',
 '2',
 '0',
 '0',
 '1',
 '0',
 '2',
 '0',
 '0',
 '1',
 '8',
 '0',
 '0',
 '1',
 '9',
 '0',
 '2',
 '2',
 '0',
 '1',
 '1',
 '0',
 '1',
 '5',
 '2',
 '0',
 '5',
 '5',
 '1',
 '0',
 '2',
 '0',
 '0',
 '1',
 '5',
 '1',
 '2',
 '0',
 '1',
 '7']

In [10]:
# \D matches any non-digit character
# \s matches any whitespace character
# \S matches any non-whitespace character
# \w matches any word character: a letter, a digit, or the underscore (_)
# \W matches any character that is not a word character
# \t matches tab
# \n matches newline
# . matches any character except the newline character

In [11]:
# these are useful but not to match one character at a time
# what if we wanted to match two-digit numbers?
# \d\d means "a digit immediately followed by another digit"
re.findall(r"\d\d",storyText)

['20',
 '70',
 '20',
 '10',
 '20',
 '18',
 '00',
 '19',
 '02',
 '20',
 '11',
 '20',
 '55',
 '10',
 '20',
 '20',
 '17']

In [12]:
myString = "unicorns20bearslions44"
# both two-digit runs in myString match
re.findall(r"\d\d",myString)

['20', '44']

In [13]:
# three-digit number?
# matches don't overlap, so a four-digit number only contributes its first three digits
re.findall(r"\d\d\d",storyText)

['200', '200', '200', '180', '190', '110', '200', '201']

In [14]:
# myString never has three digits in a row, so this comes back as an empty list
re.findall(r"\d\d\d",myString)

[]

In [15]:
# what if we wanted to say any number of digits?
# the + symbol is a quantifier (a repetition operator)
# it matches one or more of the previous regex symbol
re.findall(r"\d+",storyText) # this gets numbers of any length

['200',
 '1',
 '5',
 '70',
 '5',
 '200',
 '10',
 '200',
 '1800',
 '1902',
 '20',
 '110',
 '1',
 '5',
 '20',
 '55',
 '10',
 '200',
 '1',
 '5',
 '1',
 '2017']

In [16]:
# what will this pattern do?
re.findall(r"\w+",storyText)
# this matches runs of any length made up only of word characters
# (letters, digits, and the underscore)

['President',
 'Trump',
 's',
 '200',
 'billion',
 'plan',
 'to',
 'rebuild',
 'America',
 'upends',
 'the',
 'criteria',
 'that',
 'have',
 'long',
 'been',
 'used',
 'to',
 'pick',
 'ambitious',
 'federal',
 'projects',
 'putting',
 'little',
 'emphasis',
 'on',
 'how',
 'much',
 'an',
 'infrastructure',
 'proposal',
 'benefits',
 'the',
 'public',
 'and',
 'more',
 'on',
 'finding',
 'private',
 'investors',
 'and',
 'other',
 'outside',
 'sources',
 'of',
 'money',
 'Unveiled',
 'on',
 'Monday',
 'the',
 'infrastructure',
 'program',
 'that',
 'Mr',
 'Trump',
 'has',
 'championed',
 'since',
 'the',
 'campaign',
 'is',
 'intended',
 'to',
 'attract',
 'a',
 'huge',
 'amount',
 'of',
 'additional',
 'money',
 'from',
 'states',
 'localities',
 'and',
 'private',
 'investors',
 'The',
 'goal',
 'is',
 'to',
 'generate',
 'a',
 'total',
 'pot',
 'of',
 '1',
 '5',
 'trillion',
 'to',
 'upgrade',
 'the',
 'country',
 's',
 'highways',
 'airports',
 'and',
 'railroads',
 'Those',
 'finan

In [17]:
# let's match only dollar amounts in the story
re.findall(r"\$",storyText) # we put a backslash in front of the $ because the $ sign means something
# to regular expressions. (in case you're wondering, it means the end of a string, or just
# before a newline at the very end of the string)
# the r prefix makes this a raw string, so python leaves the backslash alone

['$', '$', '$', '$', '$', '$', '$', '$', '$']

In [18]:
# now let's get all the digits
re.findall(r"\$\d+",storyText)
# you're essentially concatenating string "pieces" here. \$\d+ is saying "find dollar symbols
# followed by one or more digits"

['$200', '$1', '$200', '$200', '$1', '$20', '$55', '$200', '$1']

In [19]:
# let's get crazy
# match a dollar amount followed by the associated unit
# (e.g., million, trillion, et cetera)
# how do we get something like "$200 billion"
# first part: \$\d+
# after the first part: a single space
# after the space: alpha characters
# let's cheat a little and just say word characters (\w)
re.findall(r"\$\d+ \w+",storyText)

['$200 billion',
 '$200 billion',
 '$200 billion',
 '$20 billion',
 '$55 billion',
 '$200 billion']

In [20]:
# we're still missing numbers with decimal points
# \S matches any non-whitespace character
re.findall(r"\$\S+ \w+",storyText)
# we're using \S+ to get one or more non-whitespace characters after a $ sign,
# which lets the decimal point in amounts like $1.5 come along

['$200 billion',
 '$1.5 trillion',
 '$200 billion',
 '$200 billion',
 '$1.5 trillion',
 '$20 billion',
 '$55 billion',
 '$200 billion',
 '$1.5 trillion']

In [21]:
# what if we only want to hear about bigly amounts of money?
# there's a symbol called a negative lookahead. it is
# (?!)
# it means "only match here if the pattern after ?! does NOT come next"
# (?!myPattern) - rejects a match at any spot where myPattern would come next
# a lookahead only checks; it doesn't consume any characters itself
re.findall(r"\$\S+ (?!billion)\w+",storyText)

['$1.5 trillion', '$1.5 trillion', '$1.5 trillion']

In [22]:
# the direct approach: ask for "trillion" by name
re.findall(r"\$\S+ trillion",storyText)

['$1.5 trillion', '$1.5 trillion', '$1.5 trillion']

In [23]:
# you might be thinking: let's just write for loops!
allWords = storyText.split()
matches = []
for word in allWords:
    # skip anything that doesn't begin with a dollar sign
    if not word.startswith("$"):
        continue
    matches.append(word)
print(matches)

['$200', '$1.5', '$200', '$200', '$1.5', '$20', '$55', '$200', '$1.5']


In [24]:
# what about matching every word that starts with a vowel?
# what's a vowel? a e i o u A E I O U
# we need a way to say or in our regular expressions
# square brackets do this for us
# [aeiouAEIOU] will match one of those characters inside the []
# nothing here ties the match to the start of a word yet, so it can begin mid-word
re.findall(r"[aeiouAEIOU]\w+",storyText)

['esident',
 'ump',
 'illion',
 'an',
 'ebuild',
 'America',
 'upends',
 'iteria',
 'at',
 'ave',
 'ong',
 'een',
 'used',
 'ick',
 'ambitious',
 'ederal',
 'ojects',
 'utting',
 'ittle',
 'emphasis',
 'on',
 'ow',
 'uch',
 'an',
 'infrastructure',
 'oposal',
 'enefits',
 'ublic',
 'and',
 'ore',
 'on',
 'inding',
 'ivate',
 'investors',
 'and',
 'other',
 'outside',
 'ources',
 'of',
 'oney',
 'Unveiled',
 'on',
 'onday',
 'infrastructure',
 'ogram',
 'at',
 'ump',
 'as',
 'ampioned',
 'ince',
 'ampaign',
 'is',
 'intended',
 'attract',
 'uge',
 'amount',
 'of',
 'additional',
 'oney',
 'om',
 'ates',
 'ocalities',
 'and',
 'ivate',
 'investors',
 'oal',
 'is',
 'enerate',
 'otal',
 'ot',
 'of',
 'illion',
 'upgrade',
 'ountry',
 'ighways',
 'airports',
 'and',
 'ailroads',
 'ose',
 'inancial',
 'iorities',
 'are',
 'allized',
 'in',
 'ew',
 'uidelines',
 'established',
 'ite',
 'ouse',
 'ability',
 'ind',
 'ources',
 'of',
 'unding',
 'outside',
 'ederal',
 'overnment',
 'ill',
 'ost

In [25]:
# decent start.  how can we say "Beginning of a word?"
# \b says "word boundary"
# in a normal python string "\b" is the backspace character, so you'd have to write "\\b";
# a raw string (r"...") lets us write \b exactly the way the regex engine expects it
re.findall(r"\b[aeiouAEIOU]\w+",storyText)
# more specifically, \b is a word boundary, so you can use it at the beginning of a word,
# the end of a word, or both!

['America',
 'upends',
 'used',
 'ambitious',
 'emphasis',
 'on',
 'an',
 'infrastructure',
 'and',
 'on',
 'investors',
 'and',
 'other',
 'outside',
 'of',
 'Unveiled',
 'on',
 'infrastructure',
 'is',
 'intended',
 'attract',
 'amount',
 'of',
 'additional',
 'and',
 'investors',
 'is',
 'of',
 'upgrade',
 'airports',
 'and',
 'are',
 'in',
 'established',
 'ability',
 'of',
 'outside',
 'important',
 'accounting',
 'of',
 'infrastructure',
 'economic',
 'and',
 'on',
 'investment',
 'at',
 'at',
 'In',
 'access',
 'in',
 'investors',
 'of',
 'overhaul',
 'approved',
 'Instead',
 'of',
 'on',
 'and',
 'are',
 'attractive',
 'investors',
 'are',
 'ones',
 'of',
 'Elliott',
 'of',
 'urban',
 'and',
 'international',
 'affairs',
 'at',
 'University',
 'investors',
 'intended',
 'impoverished',
 'and',
 'including',
 'improving',
 'in',
 'investors',
 'in',
 'obligation',
 'is',
 'as',
 'in',
 'infrastructure',
 'of',
 'over',
 'used',
 'incentives',
 'even',
 'and',
 'also',
 'up',
 'a

In [26]:
# brackets can also hold ranges, which is really useful
# a-z and A-Z are ranges of letters
# [a-zA-Z] matches any single lowercase or uppercase ascii letter
re.findall(r"\b[a-zA-Z]+\b",storyText)

['President',
 'Trump',
 's',
 'billion',
 'plan',
 'to',
 'rebuild',
 'America',
 'upends',
 'the',
 'criteria',
 'that',
 'have',
 'long',
 'been',
 'used',
 'to',
 'pick',
 'ambitious',
 'federal',
 'projects',
 'putting',
 'little',
 'emphasis',
 'on',
 'how',
 'much',
 'an',
 'infrastructure',
 'proposal',
 'benefits',
 'the',
 'public',
 'and',
 'more',
 'on',
 'finding',
 'private',
 'investors',
 'and',
 'other',
 'outside',
 'sources',
 'of',
 'money',
 'Unveiled',
 'on',
 'Monday',
 'the',
 'infrastructure',
 'program',
 'that',
 'Mr',
 'Trump',
 'has',
 'championed',
 'since',
 'the',
 'campaign',
 'is',
 'intended',
 'to',
 'attract',
 'a',
 'huge',
 'amount',
 'of',
 'additional',
 'money',
 'from',
 'states',
 'localities',
 'and',
 'private',
 'investors',
 'The',
 'goal',
 'is',
 'to',
 'generate',
 'a',
 'total',
 'pot',
 'of',
 'trillion',
 'to',
 'upgrade',
 'the',
 'country',
 's',
 'highways',
 'airports',
 'and',
 'railroads',
 'Those',
 'financial',
 'priorities'

In [28]:
# pretty good at getting all the words, but still having issues with possession (')
# we'll get back to the possession/punctuation issues

# four-digit numbers: \d written four times in a row
re.findall(r"\d\d\d\d",storyText)

['1800', '1902', '2017']

In [29]:
# what about longer numbers, say 10 digits? typing \d ten times gets old
# instead of writing \d as many times as you need digits, use the following notation
# (shown here with 4: \d{4} is the same pattern as \d\d\d\d)
re.findall(r"\d{4}",storyText)
# symbol: {numTimesPrevSymbolOccurs}

['1800', '1902', '2017']

In [30]:
# we can specify a range with this notation
re.findall(r"\d{2,4}",storyText)
# this matches at least 2 and at most 4 digits in a row

['200',
 '70',
 '200',
 '10',
 '200',
 '1800',
 '1902',
 '20',
 '110',
 '20',
 '55',
 '10',
 '200',
 '2017']

In [35]:
# back to our problem with apostrophes
# the {} syntax we just learned can help with it
re.findall(r"\b[a-zA-Z]{2,}\b",storyText)
# giving the start of a range but not the end matches at least that many symbols,
# with no upper bound
# note: the range ONLY affects the preceding symbol
# here we're saying "two or more"

['President',
 'Trump',
 'billion',
 'plan',
 'to',
 'rebuild',
 'America',
 'upends',
 'the',
 'criteria',
 'that',
 'have',
 'long',
 'been',
 'used',
 'to',
 'pick',
 'ambitious',
 'federal',
 'projects',
 'putting',
 'little',
 'emphasis',
 'on',
 'how',
 'much',
 'an',
 'infrastructure',
 'proposal',
 'benefits',
 'the',
 'public',
 'and',
 'more',
 'on',
 'finding',
 'private',
 'investors',
 'and',
 'other',
 'outside',
 'sources',
 'of',
 'money',
 'Unveiled',
 'on',
 'Monday',
 'the',
 'infrastructure',
 'program',
 'that',
 'Mr',
 'Trump',
 'has',
 'championed',
 'since',
 'the',
 'campaign',
 'is',
 'intended',
 'to',
 'attract',
 'huge',
 'amount',
 'of',
 'additional',
 'money',
 'from',
 'states',
 'localities',
 'and',
 'private',
 'investors',
 'The',
 'goal',
 'is',
 'to',
 'generate',
 'total',
 'pot',
 'of',
 'trillion',
 'to',
 'upgrade',
 'the',
 'country',
 'highways',
 'airports',
 'and',
 'railroads',
 'Those',
 'financial',
 'priorities',
 'are',
 'crystallized

In [36]:
# the apostrophe is not a letter, so a contraction gets split into two matches
re.findall(r"\b[a-zA-Z]{2,}\b","we're")

['we', 're']

In [37]:
# how many times is the word 'Ms.' in the text?  what about 'Mr.'?
# a bare . matches any character, so escape it (\.) to match a literal period
re.findall(r"Ms\.",storyText)

['Ms.']

In [38]:
# \. matches a literal period; an unescaped . would match any character after "Mr"
re.findall(r"Mr\.",storyText)

['Mr.', 'Mr.', 'Mr.', 'Mr.', 'Mr.', 'Mr.', 'Mr.', 'Mr.']

In [44]:
# how do we find their full names?
# the title, a literal period (\.), a space, then the word that follows
re.findall(r"Ms\. \w+",storyText)

['Ms. Knopman']

In [45]:
# same idea for "Mr.": escape the period so it only matches a real "."
re.findall(r"Mr\. \w+",storyText)

['Mr. Trump',
 'Mr. Sclar',
 'Mr. Trump',
 'Mr. Trump',
 'Mr. Damschen',
 'Mr. Klepper',
 'Mr. Trump',
 'Mr. Klepper']

In [47]:
# the words surrounding and including the word 'federal'?
# one word, a space, "federal", a space, one more word
re.findall(r"\w+ federal \w+",storyText)

['ambitious federal projects',
 'the federal government',
 'for federal funds',
 'the federal government',
 'gives federal agencies',
 'in federal funding',
 'for federal involvement',
 'of federal support',
 'additional federal funding',
 'of federal funds',
 'the federal government',
 'reducing federal infrastructure',
 'using federal credit',
 'the federal government']

In [48]:
# the last word in a sentence?
# what comes at the end of a sentence? a period.
# (abbreviations like "Mr." and decimals like "1.5" also contain a period, so they show up too)
re.findall(r"\w+\.",storyText)

['money.',
 'Mr.',
 'investors.',
 '1.',
 'railroads.',
 'House.',
 'projects.',
 'percent.',
 'light.',
 'approved.',
 'University.',
 'returns.',
 'Mich.',
 'shrift.',
 'project.',
 'profit.',
 'Mr.',
 'said.',
 'projects.',
 'sector.',
 'Mr.',
 'process.',
 'value.',
 'owners.',
 'easy.',
 'cuts.',
 'Mr.',
 'plan.',
 'role.',
 'navigation.',
 'States.',
 'Corporation.',
 'point.',
 'Act.',
 'Ms.',
 'States.',
 'projects.',
 'unknowns.',
 'funds.',
 'need.',
 'ports.',
 'welcome.',
 'Officers.',
 'improvement.',
 'projects.',
 '1.',
 'year.',
 'limited.',
 'projects.',
 'laws.',
 'system.',
 'declining.',
 'Mr.',
 'state.',
 'said.',
 'hoped.',
 'country.',
 'supplying.',
 'plan.',
 'guarantees.',
 'reached.',
 '1.',
 'said.',
 'untested.',
 'new.',
 'standards.',
 'criteria.',
 'No.',
 'second.',
 'Bureau.',
 'issue.',
 'Mr.',
 'administration.',
 'Mr.',
 'plan.',
 '2017.',
 'Mr.',
 'said.',
 'government.']

In [49]:
# what about ! and ?
# inside square brackets . ? and ! are literal characters, so they need no backslash
re.findall(r"\w+[.?!]",storyText)

['money.',
 'Mr.',
 'investors.',
 '1.',
 'railroads.',
 'House.',
 'projects.',
 'percent.',
 'light.',
 'approved.',
 'University.',
 'returns.',
 'Mich.',
 'shrift.',
 'project.',
 'profit.',
 'Mr.',
 'said.',
 'projects.',
 'sector.',
 'Mr.',
 'process.',
 'value.',
 'owners.',
 'easy.',
 'cuts.',
 'Mr.',
 'plan.',
 'role.',
 'navigation.',
 'States.',
 'Corporation.',
 'point.',
 'Act.',
 'Ms.',
 'States.',
 'projects.',
 'unknowns.',
 'funds.',
 'need.',
 'ports.',
 'welcome.',
 'Officers.',
 'improvement.',
 'projects.',
 '1.',
 'year.',
 'limited.',
 'projects.',
 'laws.',
 'system.',
 'declining.',
 'Mr.',
 'state.',
 'said.',
 'hoped.',
 'country.',
 'supplying.',
 'plan.',
 'guarantees.',
 'reached.',
 '1.',
 'said.',
 'untested.',
 'new.',
 'standards.',
 'criteria.',
 'No.',
 'second.',
 'Bureau.',
 'issue.',
 'Mr.',
 'administration.',
 'Mr.',
 'plan.',
 '2017.',
 'Mr.',
 'said.',
 'government.']

In [51]:
# count the matches that end in a period
len(re.findall(r"\w+\.",storyText))

79

In [53]:
# count the matches that end in a period, question mark, or exclamation point
len(re.findall(r"\w+[.?!]",storyText))
# looks like every sentence in this story ends with a .

79

In [55]:
# + means "one or more of the previous symbol"
# * means "zero or more of the previous symbol"

# back to our apostrophes
# let's do possession but skip contractions
re.findall(r"\b[a-zA-Z]+'",storyText)
# this comes back empty: the straight single quote (') isn't the character used in the text file
# the actual character is the curly apostrophe ’
# you can copy/paste it from your source material
# on a mac it is option shift right square bracket: ’

[]

In [61]:
wordsList = re.findall(r"\b[a-zA-Z]+’*\w*",storyText)
# asterisk specifies 0 or more, so we're saying 0 or more apostrophes
# and then 0 or more word characters after the apostrophe
# findall returns a list of matches, which we store in the variable wordsList
# so we can use it below

In [62]:
# list.count tallies how many entries in wordsList are exactly equal to this string
wordsList.count("they’ll")

1

In [64]:
# letters, optional curly apostrophes, optional word characters,
# then a closing \b so each match has to end on a word boundary
re.findall(r"\b[a-zA-Z]+’*\w*\b",storyText)

['President',
 'Trump’s',
 'billion',
 'plan',
 'to',
 'rebuild',
 'America',
 'upends',
 'the',
 'criteria',
 'that',
 'have',
 'long',
 'been',
 'used',
 'to',
 'pick',
 'ambitious',
 'federal',
 'projects',
 'putting',
 'little',
 'emphasis',
 'on',
 'how',
 'much',
 'an',
 'infrastructure',
 'proposal',
 'benefits',
 'the',
 'public',
 'and',
 'more',
 'on',
 'finding',
 'private',
 'investors',
 'and',
 'other',
 'outside',
 'sources',
 'of',
 'money',
 'Unveiled',
 'on',
 'Monday',
 'the',
 'infrastructure',
 'program',
 'that',
 'Mr',
 'Trump',
 'has',
 'championed',
 'since',
 'the',
 'campaign',
 'is',
 'intended',
 'to',
 'attract',
 'a',
 'huge',
 'amount',
 'of',
 'additional',
 'money',
 'from',
 'states',
 'localities',
 'and',
 'private',
 'investors',
 'The',
 'goal',
 'is',
 'to',
 'generate',
 'a',
 'total',
 'pot',
 'of',
 'trillion',
 'to',
 'upgrade',
 'the',
 'country’s',
 'highways',
 'airports',
 'and',
 'railroads',
 'Those',
 'financial',
 'priorities',
 'are'