# Text Methods

A normal Python string has a variety of methods available:

In [1]:
mystring='hello'

In [2]:
mystring.capitalize()

'Hello'

In [3]:
mystring

'hello'

In [4]:
mystring.isdigit()

False

In [5]:
help(str)

Help on class str in module builtins:

class str(object)
 |  str(object='') -> str
 |  str(bytes_or_buffer[, encoding[, errors]]) -> str
 |  
 |  Create a new string object from the given object. If encoding or
 |  errors is specified, then the object must expose a data buffer
 |  that will be decoded using the given encoding and error handler.
 |  Otherwise, returns the result of object.__str__() (if defined)
 |  or repr(object).
 |  encoding defaults to sys.getdefaultencoding().
 |  errors defaults to 'strict'.
 |  
 |  Methods defined here:
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __contains__(self, key, /)
 |      Return key in self.
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __format__(self, format_spec, /)
 |      Return a formatted version of the string as described by format_spec.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  

# Pandas and Texts

## Text Methods on Pandas String Column

In [6]:
import pandas as pd

In [7]:
names=pd.Series(['abdullah','muhammad','omar','uthman','4'])

In [8]:
names

0    abdullah
1    muhammad
2        omar
3      uthman
4           4
dtype: object

In [9]:
names.str.capitalize()

0    Abdullah
1    Muhammad
2        Omar
3      Uthman
4           4
dtype: object

In [10]:
names.capitalize()

AttributeError: 'Series' object has no attribute 'capitalize'

In [11]:
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

## Splitting, Grabbing, and Expanding

In [12]:
tech_finance=['GOOGL,APPL,AMZN','JPM,BAC,GS']

In [13]:
len(tech_finance)

2

In [14]:
tickers=pd.Series(tech_finance)

In [15]:
tickers

0    GOOGL,APPL,AMZN
1         JPM,BAC,GS
dtype: object

In [16]:
tickers.str.split(',')

0    [GOOGL, APPL, AMZN]
1         [JPM, BAC, GS]
dtype: object

In [17]:
tickers.str.split(',').str[0]

0    GOOGL
1      JPM
dtype: object

In [18]:
tickers.str.split(',',expand=True)

Unnamed: 0,0,1,2
0,GOOGL,APPL,AMZN
1,JPM,BAC,GS


## Cleaning or Editing Strings

In [19]:
messy_names=pd.Series(['abdullah  ','muham;mad','  omar  '])

In [20]:
messy_names

0    abdullah  
1     muham;mad
2        omar  
dtype: object

In [22]:
messy_names.str.replace(';','').str.replace(' ','')

0    abdullah
1    muhammad
2        omar
dtype: object

In [23]:
messy_names

0    abdullah  
1     muham;mad
2        omar  
dtype: object

In [24]:
messy_names.str.strip()

0     abdullah
1    muham;mad
2         omar
dtype: object

In [25]:
messy_names.str.replace(';','').str.strip()

0    abdullah
1    muhammad
2        omar
dtype: object

In [27]:
messy_names.str.replace(';','').str.strip().str.capitalize()

0    Abdullah
1    Muhammad
2        Omar
dtype: object

## Alternative with Custom apply() call

In [28]:
def cleanup(name: str) -> str:
    """Tidy up a single name string.

    Removes every ';' character, trims leading and trailing whitespace,
    and then capitalizes the result (first character upper-cased, the
    rest lower-cased).

    Parameters
    ----------
    name : str
        One raw name value, e.g. a single element of a Series.

    Returns
    -------
    str
        The cleaned name.
    """
    # Plain `str` methods chain in the same order as the step-by-step version:
    # drop semicolons -> strip whitespace -> capitalize.
    return name.replace(';', '').strip().capitalize()

In [29]:
messy_names

0    abdullah  
1     muham;mad
2        omar  
dtype: object

In [30]:
messy_names.apply(cleanup)

0    Abdullah
1    Muhammad
2        Omar
dtype: object

In [31]:
def cleanup_str(name):
    """Tidy up names using the pandas `.str` accessor.

    Removes ';' characters, strips surrounding whitespace and capitalizes
    each value by chaining `.str.replace`, `.str.strip` and
    `.str.capitalize`.

    `name` must be an object that exposes a `.str` accessor (such as a
    pandas Series of strings). Passing this function to `Series.apply`
    hands it individual `str` values instead, which have no `.str`
    attribute, so the call raises `AttributeError`.
    """
    without_semicolons = name.str.replace(';', '')
    trimmed = without_semicolons.str.strip()
    return trimmed.str.capitalize()

In [32]:
messy_names

0    abdullah  
1     muham;mad
2        omar  
dtype: object

In [33]:
messy_names.apply(cleanup_str)

AttributeError: 'str' object has no attribute 'str'

## Which one is more efficient?

In [34]:
import timeit 
  
# Setup snippet handed to timeit via `setup=`: it is executed once before the
# timed statement is repeated. The snippet builds its own `messy_names` Series
# and its own `cleanup` function, so the timed statements below only rely on
# names created inside this string.
setup = '''
import pandas as pd
import numpy as np
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
'''
  
# Statements whose execution time is measured (one string per approach).
# Approach 1: chained pandas `.str` accessor methods.
stmt_pandas_str = ''' 
messy_names.str.replace(";","").str.strip().str.capitalize()
'''

# Approach 2: element-wise `Series.apply` with the plain-Python `cleanup`.
stmt_pandas_apply = '''
messy_names.apply(cleanup)
'''

# Approach 3: wrap `cleanup` with `np.vectorize` and call it on the Series.
stmt_pandas_vectorize='''
np.vectorize(cleanup)(messy_names)
'''

In [37]:
timeit.timeit(setup=setup,stmt=stmt_pandas_str,number=10000)


8.272948599999836

In [40]:
timeit.timeit(setup=setup,stmt=stmt_pandas_apply,number=10000)

2.9874987000002875

In [44]:
timeit.timeit(setup=setup,stmt=stmt_pandas_vectorize,number=10000)

0.7504893000004813

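Wow! While the `.str` accessor methods can be extremely convenient, when it comes to performance, don't forget about `np.vectorize()`! Keep in mind that these timings were measured on a tiny three-element Series, so re-run the comparison on data of a realistic size before drawing firm conclusions. Review the "Useful Methods" lecture for a deeper discussion of `np.vectorize()`.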