In [2]:
# String Object Methods

#  In many string munging and scripting applications, built-in string methods are suffi
# cient. As an example, a comma-separated string can be broken into pieces with split

import pandas as pd
import numpy as np

val='hero,villain ,  side'
val.split(',')

['hero', 'villain ', '  side']

In [3]:
# split is often combined with strip to trim whitespace (including newlines)

pieces=[x.strip() for x in val.split(',')]
pieces

['hero', 'villain', 'side']

In [4]:
#  These substrings could be concatenated together with a two-colon delimiter using ad
# dition:
a,b,c=pieces
print(a+'::'+b+"::"+c)

#  But, this isn’t a practical generic method. A faster and more Pythonic way is to pass a
#  list or tuple to the join method on the string '::':

'::'.join(pieces)

hero::villain::side


'hero::villain::side'

In [5]:
#  Other methods are concerned with locating substrings. Using Python’s in keyword is
#  the best way to detect a substring, though index and find can also be used

print('side' in val)

print(val.index(','))

print(val.find(':'))

True
4
-1


In [6]:
# count returns the number of occurrences of a particular substring
val.count(',')

2

In [7]:
# replace will substitute occurrences of one pattern for another. This is commonly used
#  to delete patterns, too, by passing an empty string
val.replace(',',',,')

'hero,,villain ,,  side'

In [9]:
# Regular expressions

#  The re module functions fall into three categories: pattern matching, substitution, and
#  splitting. Naturally these are all related; a regex describes a pattern to locate in the text,
#  which can then be used for many purposes. Let’s look at a simple example: suppose I
#  wanted to split a string with a variable number of whitespace characters (tabs, spaces,
#  and newlines). The regex describing one or more whitespace characters is \s+

import re
text="foo    bar\t baz  \tqux"
re.split('\s+',text)

# When you call re.split('\s+', text), the regular expression is first compiled, then its
#  split method is called on the passed text.

['foo', 'bar', 'baz', 'qux']

In [10]:
# You can compile the regex yourself with 
# re.compile, forming a reusable regex object:
regex=re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [12]:
# If, instead, you wanted to get a list of all patterns matching the regex, you can use the 
# findall method:
regex.findall(text)

['    ', '\t ', '  \t']

In [16]:
# Creating a regex object with re.compile is highly recommended if you intend to apply
#  the same expression to many strings; doing so will save CPU cycles

#  match and search are closely related to findall. While findall returns all matches in a
#  string, search returns only the first match. More rigidly, match only matches at the
#  beginning of the string. As a less trivial example, let’s consider a block of text and a
#  regular expression capable of identifying most email addresses

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern= r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex=re.compile(pattern,flags=re.IGNORECASE)
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [18]:
#  search returns a special match object for the first email address in the text. For the
#  above regex, the match object can only tell us the start and end position of the pattern
#  in the string
m=regex.search(text)
print(m)

text[m.start():m.end()]

<re.Match object; span=(5, 20), match='dave@google.com'>


'dave@google.com'

In [22]:
#  regex.match returns None, as it only will match if the pattern occurs at the start of the
#  string
print(regex.match(text))

None


In [23]:
# Relatedly, sub will return a new string with occurrences of the pattern replaced by the
#  a new string

print(regex.sub('fool',text))

Dave fool
Steve fool
Rob fool
Ryan fool



In [24]:
#  Suppose you wanted to find email addresses and simultaneously segment each address
#  into its 3 components: username, domain name, and domain suffix. To do this, put
#  parentheses around the parts of the pattern to segment:
    
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

regex=re.compile(pattern,flags=re.IGNORECASE)

#  A match object produced by this modified regex returns a tuple of the pattern compo
# nents with its groups method

m=regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [25]:
# findall returns a list of tuples when the pattern has groups:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [26]:
# sub also has access to groups in each match using special symbols like \1, \2, etc.:

print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [27]:
# There is much more to regular expressions in Python, most of which is outside the
#  book’s scope. To give you a flavor, one variation on the above email regex gives names
#  to the match groups:

regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)

#  The match object produced by such a regex can produce a handy dict with the specified
#  group names:

m = regex.match('wesm@bright.net')
m.groupdict()

{'username': 'wesm', 'domain': 'bright', 'suffix': 'net'}

In [None]:
#  Table 7-4. Regular expression methods
#  Argument                      Description

#  findall, finditer             Return all non-overlapping matching patterns in a string. findall returns a list of all
#                                patterns while finditer returns them one by one from an iterator.
    
#  match                          Match pattern at start of string and optionally segment pattern components into groups.
#                                 If the pattern matches, returns a match object, otherwise None.

#  search                         Scan string for match to pattern; returning a match object if so. Unlike match, the match
#                                 can be anywhere in the string as opposed to only at the beginning.
    
#  split                          Break string into pieces at each occurrence of pattern.
   
#  sub, subn                      Replace all (sub) or first n occurrences (subn) of pattern in string with replacement
#                                 expression. Use symbols \1, \2, ... to refer to match group elements in the re
#                                 placement string.
    

In [30]:
# Vectorized string functions in pandas

#  Cleaning up a messy data set for analysis often requires a lot of string munging and
#  regularization. To complicate matters, a column containing strings will sometimes have
#  missing data:

data = {'Dave': 'dave@google.com', 
        'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 
        'Wes': np.nan}
data=pd.Series(data)
print(data)

data.isnull()

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object


Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [31]:
# String and regular expression methods can be applied (passing a lambda or other func
# tion) to each value using data.map, but it will fail on the NA. To cope with this, Series
#  has concise methods for string operations that skip NA values. These are accessed
#  through Series’s str attribute; for example, we could check whether each email address
#  has 'gmail' in it with str.contains

data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [33]:
# Regular expressions can be used, too, along with any re options like IGNORECASE
print(pattern)

data.str.findall(pattern,flags=re.IGNORECASE)

([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})


Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [40]:
#  There are a couple of ways to do vectorized element retrieval. Either use str.get or
#  index into the str attribute:
matches=data.str.match(pattern,flags=re.IGNORECASE)
print(matches)

# matches.str.get(1))    NOT WORKING
# print('')
# matches.str[0]

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object


In [41]:
#  You can similarly slice strings using this syntax:

data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [None]:
# Table 7-5. Vectorized string methods PAGE:212

In [None]:
# END