#  Pandas string functions
You might wonder, why we need to bother with string functions from pandas and not just use the Python standard ones? The reason is that Python's string functions are for individual string objects, while the pandas functions are for Series and DataFrames. So you can think of the pandas string functions as an extension that allows us to operate on an entire Series or DataFrame of strings. As most of the time, the text data that we will be working with will already be in the form of a Series or a DataFrame, so using the specific functions from pandas will make our life a lot easier.

In [4]:
import pandas as pd
import numpy as np

s=pd.Series(['0', 'John Wood', 'Colin Welsh', 'my list', '02456', np.nan, 'HELLO WORLD', 'water%'])
s

0              0
1      John Wood
2    Colin Welsh
3        my list
4          02456
5            NaN
6    HELLO WORLD
7         water%
dtype: object

In [5]:
s.str.lower()

0              0
1      john wood
2    colin welsh
3        my list
4          02456
5            NaN
6    hello world
7         water%
dtype: object

In [6]:
s.str.upper()

0              0
1      JOHN WOOD
2    COLIN WELSH
3        MY LIST
4          02456
5            NaN
6    HELLO WORLD
7         WATER%
dtype: object

In [7]:
s.str.len()

0     1.0
1     9.0
2    11.0
3     7.0
4     5.0
5     NaN
6    11.0
7     6.0
dtype: float64

In [8]:
s.str.split(' ')

0               [0]
1      [John, Wood]
2    [Colin, Welsh]
3        [my, list]
4           [02456]
5               NaN
6    [HELLO, WORLD]
7          [water%]
dtype: object

In [9]:
substrings = s.str.split(' ', expand=True)
substrings

Unnamed: 0,0,1
0,0,
1,John,Wood
2,Colin,Welsh
3,my,list
4,02456,
5,,
6,HELLO,WORLD
7,water%,


In [10]:
substrings[1]

0     None
1     Wood
2    Welsh
3     list
4     None
5      NaN
6    WORLD
7     None
Name: 1, dtype: object

In [11]:
s.str.replace('strA','strB')

0              0
1      John Wood
2    Colin Welsh
3        my list
4          02456
5            NaN
6    HELLO WORLD
7         water%
dtype: object

In [12]:
s.str.replace('%',' percent ')

0                 0
1         John Wood
2       Colin Welsh
3           my list
4             02456
5               NaN
6       HELLO WORLD
7    water percent 
dtype: object

In [13]:
s.str.replace('%','')

0              0
1      John Wood
2    Colin Welsh
3        my list
4          02456
5            NaN
6    HELLO WORLD
7          water
dtype: object

In [14]:
s.str[0:2]

0      0
1     Jo
2     Co
3     my
4     02
5    NaN
6     HE
7     wa
dtype: object

In [15]:
s.str.slice(0,2)

0      0
1     Jo
2     Co
3     my
4     02
5    NaN
6     HE
7     wa
dtype: object

In [16]:
# str.slice_replace(i,j,'str')

s.str.slice_replace(0,2, '___')

0             ___
1      ___hn Wood
2    ___lin Welsh
3        ___ list
4          ___456
5             NaN
6    ___LLO WORLD
7         ___ter%
dtype: object

In [17]:
flag = s.str.contains('0')
flag

0     True
1    False
2    False
3    False
4     True
5      NaN
6    False
7    False
dtype: object

In [18]:
flag = s.str.contains('0', na=False)
flag

0     True
1    False
2    False
3    False
4     True
5    False
6    False
7    False
dtype: bool

In [19]:
s[flag]

0        0
4    02456
dtype: object

# Cleaning up the movies dataset

In [20]:
import pandas as pd
import numpy as np

movies = pd.read_csv('tmdb_5000_movies.csv')
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [21]:
genres=movies['genres']

In [22]:
genres[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

We would like to replace this entry with just the names of the genres separated by a comma such as

 'Action, Adventure, Fantasy, Science Fiction' 

How can we go about this? Since each entry is a JSON string, we could use the json module

In [23]:
import json

json_obj = json.loads(genres[0]) # Load json string
names = [x['name'] for x in json_obj] # ['Action', 'Adventure', 'Fantasy', 'Science Fiction']
', '.join(names) # 'Action, Adventure, Fantasy, Science Fiction'

'Action, Adventure, Fantasy, Science Fiction'

In [24]:
def  transform(s):
    s=s.str.strip('[]')
    return(s)

In [25]:
genres= transform(genres)
genres[0]

'{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}'

In [32]:
def transform(s):
    s=s.str.strip('[]')
    s=s.str.replace('{','')
    s=s.str.replace('}','')
    s=s.str.replace(',','')
    s=s.str.replace('\"id\":','')
    s=s.str.replace('\"name\":','')
    s=s.str.replace('"','')
    s=s.str.replace('0','')
    s=s.str.replace('1','')
    s=s.str.replace('2','')
    s=s.str.replace('3','')
    s=s.str.replace('4','')
    s=s.str.replace('5','')
    s=s.str.replace('6','')
    s=s.str.replace('7','')
    s=s.str.replace('8','')
    s=s.str.replace('9','')
    s=s.str.replace('    ',', ')
    s=s.str.replace('   ','')
    return s

In [33]:
genres= transform(genres)
genres[0]

'Action, Adventure, Fantasy, Science Fiction'

In [35]:
movies['genres']=genres

In [36]:
movies.loc[:,['title','genres']].head(10)

Unnamed: 0,title,genres
0,Avatar,Action Adventure Fantasy Science Fiction
1,Pirates of the Caribbean: At World's End,Adventure Fantasy Action
2,Spectre,Action Adventure Crime
3,The Dark Knight Rises,Action Crime Drama Thriller
4,John Carter,Action Adventure Science Fiction
5,Spider-Man 3,Fantasy Action Adventure
6,Tangled,Animation Family
7,Avengers: Age of Ultron,Action Adventure Science Fiction
8,Harry Potter and the Half-Blood Prince,Adventure Fantasy Family
9,Batman v Superman: Dawn of Justice,Action Adventure Fantasy


# Further practice with the movies dataset

Task: transform the entries of the column keywords so that they each contain the first 3 keywords separated by a comma. For example the entry

In [38]:
movies.keywords[0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

**should become 'culture clash, future, space war'.**

In [52]:
keywords = movies['keywords']


In [53]:
keywords = transform(keywords)
keywords.head()

0    culture clash, future, space war, space colony...
1    ocean, drug abuse, exotic island, east india t...
2    spy, based on novel, secret agent, sequel, mi,...
3    dc comics, crime fighter, terrorist, secret id...
4    based on novel, mars, medallion, space travel,...
Name: keywords, dtype: object

In [54]:
keywords_df = keywords.str.split(',' , expand = True)
keywords_df[0:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,culture clash,future,space war,space colony,society,space travel,futuristic,romance,space,alien,...,,,,,,,,,,
1,ocean,drug abuse,exotic island,east india trading company,love of one's life,traitor,shipwreck,strong woman,ship,alliance,...,,,,,,,,,,
2,spy,based on novel,secret agent,sequel,mi,british secret service,united kingdom,,,,...,,,,,,,,,,


In [58]:
movies['keywords'] = keywords_df[0] + ', ' + keywords_df[1] + ', ' + keywords_df[2]
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,"culture clash, future, space war",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,"ocean, drug abuse, exotic island",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,"spy, based on novel, secret agent",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,"dc comics, crime fighter, terrorist",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,"based on novel, mars, medallion",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


# Regular expressions

In [59]:
s=pd.Series(['0', 'John Wood', 'Colin Welsh', 'my list', '02456', np.nan, 'HELLO WORLD', 'water%'])

In [60]:
s.str.contains('John')

0    False
1     True
2    False
3    False
4    False
5      NaN
6    False
7    False
dtype: object

In [61]:
s.str.contains('John') | s.str.contains('Colin')

0    False
1     True
2     True
3    False
4    False
5    False
6    False
7    False
dtype: bool

In [62]:
s.str.contains('John|Colin')

0    False
1     True
2     True
3    False
4    False
5      NaN
6    False
7    False
dtype: object

In [63]:
s2 = pd.Series(['bar', 'sugar', 'cartoon', 'argon'])

In [64]:
s2.str.contains('.ar')

0     True
1     True
2     True
3    False
dtype: bool

In [65]:
s2.str.contains('[bc]ar')

0     True
1    False
2     True
3    False
dtype: bool

We can also specify inside the square brackets what kind of characters we want to match as follows:

- [a-z] - match any lowercase letter
- [A-Z] - match any uppercase letter
- [0-9] - match any digit
- [a-zA-Z0-9] - match any letter or digit

In [68]:
s[s.str.contains('[0-9]', na=False)]

0        0
4    02456
dtype: object

- [^a-z] - match any character that is not a lowercase letter
- [^A-Z] - match any character that is not a uppercase letter
- [^0-9] - match any character that is not a digit
- [^a-zA-Z0-9] - match any character that is not a letter or digit

- \d - match any digit
- \D - match any non digit
- \w - match a word character
- \W - match a non-word character
- \s - match whitespace (spaces, tabs, newlines, etc.)
- \S - match non-whitespace

In [70]:
s[s.str.contains('[\d]', na=False)]

0        0
4    02456
dtype: object

#### Matching at the start and end of strings
We can also specify the location of the string where we want to match by using

- ^ - match at the beginning of a string
- $ - searches for matches at the end of a string

In [71]:
s2[s2.str.contains('^[bc]', na=False)]

0        bar
2    cartoon
dtype: object

In [72]:
s2[s2.str.contains('ar$', na=False)]

0      bar
1    sugar
dtype: object

#### Matching preceding characters
Often we want to mention a certain character and then ask to match one or more copies of this character. We can do this using the following metacharacters

In [75]:
s3= pd.Series(['forest', 'o', 'ff', 'foo', 'fof'])
s3.str.contains('f+o?f+')
# What this does is search for all strings that contain 1 or more f's then an optional o and then 1 or more f's. We can see that the third and fifth strings satisfy this as shown in the output

0    False
1    False
2     True
3    False
4     True
dtype: bool

An important thing to know is that the backslash character \ lets us escape regular expressions, for situations where we want to match the metacharacter itself. For example, if we want to match periods we cannot just use . since this will match any character as we mentioned before. We must use instead \..

#### Grouping
We can place parentheses around a regular expression to allow us to group the results so that we can extract each component separately instead of the full match. This can be especially useful if we want to use the str.extract() method since in this case we must have the matches grouped so that they can be extracted in a new DataFrame

In [86]:
s4= pd.Series(['Monday5km', 'Wednesday10km', 'Saturday25km'])

In [92]:
s4.str.extract('(\w+day)',expand=True)

Unnamed: 0,0
0,Monday
1,Wednesday
2,Saturday


Let's break the regular expression 
\w+day
 down:

- \w
: matches a word character once (it is equivalent to 
[a-zA-Z0-9_]
).
- If you add the 
+
 quantifier, this will match the preceding character 1 or more times. So, 
\w+
 will match word characters 1 or more times.
- day
: matches the characters "day" literally (case sensitive).
Altogether, 
\w+day
 will match any word characters preceding the string "day", and then the string "day". It won't match anything after the string "day".

I hope it is now clear why 
(\w)
 will match only one word character.

The regular expression pattern: 
\w+y
, will match any word characters preceding the string "y", and then the string "y". In practice, this will match the whole day name, without matching any characters after the string "y".

Note that the command would not have worked had we not used the parentheses to indicate that we want to group the matches. That is every time we use the str.extract() function we must use this option to group the results.

Grouping the results also means that we can refer to them. Let's look at a particular example where we want to take each match of the previous regular expression '\w+day' and now replace each string the first three letters so that we have the abbreviated names 'Mon', 'Wed' and 'Sat'. For this we can use the str.replace() function. Normally we would need to provide a fixed string by which to replace every match. However, if we choose to group our matches using parentheses then we have the option to specify a function which gives a separate replacement string to each match in the group. In our case this function has to take the first three characters of each string. We define the function as follows

In [93]:
def f(x):
    return x.groups()[0][:3]

**The groups attribute refers to the fact that the matches are grouped, and now we index the first and only group in this case and ask for the first three characters to be returned for each match.**

In [94]:
s4.str.replace('(\w+day)', f)

0     Mon5km
1    Wed10km
2    Sat25km
dtype: object

# Exercise: using regular expressions in pandas

In [95]:
meal_plan = ['Monday: 9:12am – Omelet,  3:30pm– Apple slices with almond butter', 
             'Tuesday: 9:35am – Banana bread, 11:00am –Sauteed veggies, 7:02pm– Taco pie',
             'Wednesday: 9:00am – Banana pancakes',  
             'Thursday: 7:23pm– Slow cooker pulled pork', 'Friday: 3:30pm – Can of tuna', 
             'Saturday: 9:11am: Eggs and sweet potato hash browns, 3:22pm: Almonds', 
             'Sunday: 11:00am: Meat and veggie stir fry'] 

In [96]:
df = pd.DataFrame(meal_plan, columns=['text'])
df

Unnamed: 0,text
0,"Monday: 9:12am – Omelet, 3:30pm– Apple slices..."
1,"Tuesday: 9:35am – Banana bread, 11:00am –Saute..."
2,Wednesday: 9:00am – Banana pancakes
3,Thursday: 7:23pm– Slow cooker pulled pork
4,Friday: 3:30pm – Can of tuna
5,Saturday: 9:11am: Eggs and sweet potato hash b...
6,Sunday: 11:00am: Meat and veggie stir fry


In [128]:
sol = df['text'].str.extractall(('(\d?\d):(\d\d) ?([ap]m)'))
sol

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,9,12,am
0,1,3,30,pm
1,0,9,35,am
1,1,11,0,am
1,2,7,2,pm
2,0,9,0,am
3,0,7,23,pm
4,0,3,30,pm
5,0,9,11,am
5,1,3,22,pm


In [129]:
days=['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
meals = ['breakfast','lunch','dinner']

In [131]:
sol.index.set_levels([days,meals],inplace = True)
sol.index.set_names(['Day','Meal'], inplace = True)

In [133]:
sol

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Day,Meal,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mon,breakfast,9,12,am
Mon,lunch,3,30,pm
Tue,breakfast,9,35,am
Tue,lunch,11,0,am
Tue,dinner,7,2,pm
Wed,breakfast,9,0,am
Thu,breakfast,7,23,pm
Fri,breakfast,3,30,pm
Sat,breakfast,9,11,am
Sat,lunch,3,22,pm


In [135]:
sol.columns=['Hour','Minutes','Period']

In [136]:
sol

Unnamed: 0_level_0,Unnamed: 1_level_0,Hour,Minutes,Period
Day,Meal,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mon,breakfast,9,12,am
Mon,lunch,3,30,pm
Tue,breakfast,9,35,am
Tue,lunch,11,0,am
Tue,dinner,7,2,pm
Wed,breakfast,9,0,am
Thu,breakfast,7,23,pm
Fri,breakfast,3,30,pm
Sat,breakfast,9,11,am
Sat,lunch,3,22,pm
