# <b>Vectorized String Operations</b>

In [42]:
import numpy as np
import pandas as pd
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [43]:
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [44]:
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
[s if s is None else s.capitalize() for s in data]

['Peter', 'Paul', None, 'Mary', 'Guido']

In [45]:
names = pd.Series(data)
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [46]:
# names.shape #(5,)
# type(names)     #pandas.core.series.Series

In [47]:
names.isna()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [48]:
names.isna().any()

True

In [49]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
'Eric Idle', 'Terry Jones', 'Michael Palin'])

In [50]:
monte.str.lower()
# .shape #(6,)

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [51]:
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [52]:
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [53]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [54]:
monte.str.split('m')
# type(monte.str.split()) #pandas.core.series.Series

0    [Graha,  Chap, an]
1         [John Cleese]
2      [Terry Gillia, ]
3           [Eric Idle]
4         [Terry Jones]
5       [Michael Palin]
dtype: object

In [55]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte.str.extract('([A-Za-z]+)', expand=False)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [56]:
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

In [57]:
monte.str.slice(0, 3)

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [64]:
monte.str.get(3)

0    h
1    n
2    r
3    c
4    r
5    h
dtype: object

In [58]:
monte.str[:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [59]:
a = "Lewis Hamilton"
a[:3]

'Lew'

In [67]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [66]:
monte.str.split().str[-1]

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [None]:
full_monte = pd.DataFrame({'name': monte,
'info': ['B|C|D', 'B|D', 'A|C',
'B|D', 'B|C', 'B|C|D']})
full_monte
# .shape    (6, 2)

(6, 2)

In [73]:
full_monte['name']

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
Name: name, dtype: object

In [76]:
full_monte.iloc[1, 1]

'B|D'

In [91]:
full_monte.iloc[1]

name    John Cleese
info            B|D
Name: 1, dtype: object

In [79]:
full_monte.loc[0]
# .shape        (2,)

name    Graham Chapman
info             B|C|D
Name: 0, dtype: object

In [81]:
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [82]:
# Set the repo URL
repo = "https://raw.githubusercontent.com/jakevdp/open-recipe-data/master"

# Create 'data' directory if it doesn't exist, download the file, and unzip it
!mkdir -p data && curl -o data/recipeitems.json.gz {repo}/recipeitems.json.gz && gunzip -f data/recipeitems.json.gz


A subdirectory or file data already exists.
Error occurred while processing: data.


In [89]:
import os
import gzip
import shutil
'''
os: Lets you interact with the operating system (e.g., check if folders exist, create folders).
gzip: Used to read/write .gz compressed files in Python.
shutil: Lets you copy file-like objects efficiently, which is handy for decompressing files.
'''
# Create the 'data' folder if it doesn't exist
if not os.path.exists("data"):
    os.mkdir("data")

# Download and unzip the file using curl and gunzip
!curl -o data/recipeitems.json.gz https://raw.githubusercontent.com/jakevdp/open-recipe-data/master/recipeitems.json.gz
'''
This shell command downloads the file from GitHub into your data/ folder and saves it as recipeitems.json.gz.
! is used in Jupyter to run shell commands.
-o specifies the output file name.
'''
# !gunzip -f data/recipeitems.json.gz


#Decompress the .gz file using Python
with gzip.open('data/recipeitems.json.gz', 'rb') as f_in:
    with open('data/recipeitems.json', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
'''
This replaces the CLI command gunzip with pure Python code.
gzip.open(...) opens the compressed file for reading (rb = read binary).
open(..., 'wb') opens the output file for writing (wb = write binary).
shutil.copyfileobj efficiently copies the content from the compressed file to the new, uncompressed file.
'''

recipes = pd.read_json('data/recipeitems.json', lines=True)
recipes.shape
'''
This uses pandas to read the .json file.
lines=True means that each line in the file is a separate JSON object (common in large datasets for easier parsing).
'''

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 29.3M    0  145k    0     0   119k      0  0:04:10  0:00:01  0:04:09  120k
  3 29.3M    3 1137k    0     0   516k      0  0:00:58  0:00:02  0:00:56  516k
  7 29.3M    7 2198k    0     0   686k      0  0:00:43  0:00:03  0:00:40  686k
 10 29.3M   10 3270k    0     0   778k      0  0:00:38  0:00:04  0:00:34  778k
 14 29.3M   14 4326k    0     0   831k      0  0:00:36  0:00:05  0:00:31  893k
 17 29.3M   17 5398k    0     0   870k      0  0:00:34  0:00:06  0:00:28 1052k
 21 29.3M   21 6470k    0     0   897k      0  0:00:33  0:00:07  0:00:26 1065k
 23 29.3M   23 7094k    0     0   862k      0  0:00:34  0:00:08  0:00:26  975k
 27 29.3M   27 8246k    0     0   894k      0  0:00

(173278, 17)

In [92]:
#looking at the first row:
recipes.iloc[0]

_id                                {'$oid': '5160756b96cc62079cc2db15'}
name                                    Drop Biscuits and Sausage Gravy
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
url                   http://thepioneerwoman.com/cooking/2013/03/dro...
image                 http://static.thepioneerwoman.com/cooking/file...
ts                                             {'$date': 1365276011104}
cookTime                                                          PT30M
source                                                  thepioneerwoman
recipeYield                                                          12
datePublished                                                2013-03-11
prepTime                                                          PT10M
description           Late Saturday afternoon, after Marlboro Man ha...
totalTime                                                           NaN
creator                                                         

In [96]:
recipes['ingredients'].dtype

dtype('O')

In [97]:
recipes['ingredients'].head()

0    Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
1    12 whole Dinner Rolls Or Small Sandwich Buns (...
2    Dressing:\n1 tablespoon cumin seeds\n1/3 cup /...
3    Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
4    For each bowl: \na big dollop of Greek yogurt\...
Name: ingredients, dtype: object

In [100]:
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [101]:
len("3 eggs\n2 cups flour")
# 19 characters

19

In [102]:
recipes.ingredients.isna().any()

False

In [103]:
recipes.isna().any()

_id                   False
name                  False
ingredients           False
url                   False
image                  True
ts                    False
cookTime               True
source                False
recipeYield            True
datePublished          True
prepTime               True
description            True
totalTime              True
creator                True
recipeCategory         True
dateModified           True
recipeInstructions     True
dtype: bool

In [109]:
recipes.description.str.contains('[Bb]reakfast').sum()

3524

In [110]:
recipes.ingredients.str.contains('[Cc]innamon').sum()

10526

In [118]:
recipes.ingredients.str.contains('[Ff]eri').sum()

2

In [112]:
recipes.name[np.argmax(recipes.ingredients.str.len())]

'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [115]:
recipes.ingredients[recipes.ingredients.str.contains('[Ff]eri', na=False)]

7101      15g Strutto (pork fat)\n350g Ferina (flour)\nA...
170907    14 ounces, fluid Canned Green Enchilada Sauce\...
Name: ingredients, dtype: object

In [116]:
recipes.ingredients[recipes.ingredients.str.contains('[Nn]ahid', na=False)]

Series([], Name: ingredients, dtype: object)

In [121]:
import re
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']
spice_df = pd.DataFrame({
spice: recipes.ingredients.str.contains(spice, re.IGNORECASE)
for spice in spice_list})
spice_df.head()
# spice_df.shape  #(173278, 10)

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
0,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,True,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [None]:
selection = spice_df.query('parsley & paprika & tarragon')
selection
# len(selection) #10

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
2069,False,True,False,False,True,False,True,False,True,False
74964,False,False,False,False,True,False,True,False,True,False
93768,True,True,False,True,True,False,True,False,True,False
113926,True,True,False,False,True,False,True,False,True,False
137686,True,True,False,False,True,False,True,False,True,False
140530,True,True,False,False,True,False,True,True,True,False
158475,True,True,False,False,True,False,True,False,True,True
158486,True,True,False,False,True,False,True,False,True,False
163175,True,True,True,False,True,False,True,False,True,False
165243,True,True,False,False,True,False,True,False,True,False


In [124]:
selection.index

Index([2069, 74964, 93768, 113926, 137686, 140530, 158475, 158486, 163175,
       165243],
      dtype='int64')

In [129]:
recipes.name[[0, 1]]
# .shape    (2,)

0    Drop Biscuits and Sausage Gravy
1          Hot Roast Beef Sandwiches
Name: name, dtype: object