# Text Methods

A normal Python string has a variety of method calls available:

In [1]:
mystring = 'hello'

In [2]:
mystring.capitalize()

'Hello'

In [3]:
mystring.upper()

'HELLO'

In [4]:
mystring.isdigit()

False

In [5]:
'123'.isdigit()

True

In [None]:
# help(str)

# Pandas and Text

Pandas can do far more with text than is shown here. The full online documentation, covering topics such as advanced string indexing and regular expressions, is available at: https://pandas.pydata.org/docs/user_guide/text.html

## Text Methods on Pandas String Column

In [57]:
import pandas as pd
import numpy as np

In [7]:
names = pd.Series(['andrew','bobo','claire','david','4'])

In [8]:
names

0    andrew
1      bobo
2    claire
3     david
4         4
dtype: object

In [10]:
my_name = 'vijaya'

In [11]:
my_name.capitalize()

'Vijaya'

In [14]:
names.str.capitalize()

0    Andrew
1      Bobo
2    Claire
3     David
4         4
dtype: object

In [15]:
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [16]:
names

0    andrew
1      bobo
2    claire
3     david
4         4
dtype: object

In [17]:
# Same result as names.str.capitalize(); shown for comparison — the .str accessor is the usual choice.
names.apply(str.capitalize) # not frequently used

0    Andrew
1      Bobo
2    Claire
3     David
4         4
dtype: object

## Splitting, Grabbing, and Expanding

In [18]:
# NOTE(review): "APPL" looks like a typo for Apple's ticker "AAPL" — confirm before reusing this data.
tech_finance = ['GOOG,APPL,AMZN','JPM,BAC,GS']  # list of two strings

In [19]:
len(tech_finance)

2

In [20]:
tickers = pd.Series(tech_finance)

In [21]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [23]:
'GOOG,APPL,AMZN'.split(',')

['GOOG', 'APPL', 'AMZN']

In [24]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [22]:
tickers.str.split(',')

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [25]:
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [26]:
tickers.str.split(',').str[2]

0    AMZN
1      GS
dtype: object

In [27]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [28]:
tickers.str.split(',',expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [30]:
student_list = ['Vishnu,Gopi,Yaswini','Yaswini,Hanugna,Srinivas,Ajay']

In [31]:
students = pd.Series(student_list)

In [32]:
students

0              Vishnu,Gopi,Yaswini
1    Yaswini,Hanugna,Srinivas,Ajay
dtype: object

In [33]:
students.str.split(',')

0               [Vishnu, Gopi, Yaswini]
1    [Yaswini, Hanugna, Srinivas, Ajay]
dtype: object

In [34]:
students.str.split(',',expand=True)

Unnamed: 0,0,1,2,3
0,Vishnu,Gopi,Yaswini,
1,Yaswini,Hanugna,Srinivas,Ajay


## Cleaning or Editing Strings

In [35]:
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])

In [36]:
# Notice the "mis-alignment" on the right hand side due to spacing in "andrew  " and "  claire  "
messy_names

0      andrew  
1         bo;bo
2      claire  
dtype: object

In [37]:
messy_names.str.replace(";","")

0      andrew  
1          bobo
2      claire  
dtype: object

In [40]:
' Vijaya     '.strip()

'Vijaya'

In [41]:
messy_names.str.strip()

0    andrew
1     bo;bo
2    claire
dtype: object

In [43]:
messy_names.str.replace(";","").str.strip()

0    andrew
1      bobo
2    claire
dtype: object

In [44]:
messy_names.str.replace(";","").str.strip().str.capitalize()

0    Andrew
1      Bobo
2    Claire
dtype: object

In [45]:
# Full cleaning pipeline: drop semicolons, trim surrounding whitespace, then capitalize.
test_names = (
    messy_names
    .str.replace(";","")
    .str.strip()
    .str.capitalize()
)

In [46]:
test_names

0    Andrew
1      Bobo
2    Claire
dtype: object

In [47]:
test_names[1]

'Bobo'

In [48]:
len(test_names[1])

4

In [61]:
# my_name.*?

## Alternative with Custom apply() call

In [54]:
def cleanup(name):
    """Return ``name`` with semicolons removed, outer whitespace trimmed, and capitalized.

    The raw input and the cleaned result are printed so that every call
    is visible in the cell output.
    """
    print(f"Input name: {name}")
    cleaned = name.replace(";","").strip().capitalize()
    print(f"Output name: {cleaned}")
    return cleaned

In [52]:
messy_names

0      andrew  
1         bo;bo
2      claire  
dtype: object

In [55]:
messy_names.apply(cleanup)

Input name: andrew  
Output name: Andrew
Input name: bo;bo
Output name: Bobo
Input name:   claire  
Output name: Claire


0    Andrew
1      Bobo
2    Claire
dtype: object

In [59]:
# With no `otypes` given, np.vectorize first calls cleanup on the first element to
# work out the output type, then processes every element — which is why "andrew  "
# is logged twice in the output.
np.vectorize(cleanup)(messy_names)

Input name: andrew  
Output name: Andrew
Input name: andrew  
Output name: Andrew
Input name: bo;bo
Output name: Bobo
Input name:   claire  
Output name: Claire


array(['Andrew', 'Bobo', 'Claire'], dtype='<U6')