## Regular expressions in pandas

This notebook contains examples of how to use regular expressions in pandas.

It follows lecture 3 of topic 4 of the PFDA course.

For this notebook I'll use the dataset people-100.csv.

In [190]:
import pandas as pd

# Folder holding the data files, relative to this notebook.
datadir = "../datafiles/"
# Full relative path of the CSV used throughout the notebook.
people_filename = f"{datadir}people-100.csv"

In [191]:
# Load the people dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer=people_filename)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site"


#### Different methods to use for the domains

In [192]:
# Approach 1 (no regex): split each e-mail address on '@' and keep the part
# after it. Series.str.split() turns every string into a list, and
# .str.get(1) picks element 1 of each list, i.e. the domain.
df['domain'] = (
    df['Email']
    .str.split('@')
    .str.get(1)
)

# The frame now has an extra 'domain' column holding only the domains.
df.head(n=10)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org


In [193]:
# Remove the 'domain' column again so the regex-based approaches below start
# from a frame without it.
# errors='ignore' keeps the cell safe to re-run: without it a second run
# raises KeyError because the column is already gone. Rebinding `df` avoids
# mutating the frame in place.
df = df.drop(columns='domain', errors='ignore')
df.head(10)


Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site"


In [194]:
# Approach 2: Series.str.replace() accepts a regular expression.
# ".*@" matches everything up to and including the '@'; replacing that match
# with an empty string leaves only the domain.
pattern = r".*@"
df['domain'] = df['Email'].str.replace(pat=pattern, repl='', regex=True)
df.head(n=10)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org


In [195]:
# A more precise pattern: capture the domain in group 1 and replace the
# matched text with just that group.
#   .*@         everything up to and including the '@'
#   ([\w.-]+    domain labels: word characters, dots and hyphens
#   \.\w{2,})   a dot followed by a top-level domain of two or more characters
# Hyphens are valid inside domain names (e.g. "my-site.com"), so the character
# class includes '-'; leaving it out would make such addresses fail to match.
# Addresses that do not match the pattern are left unchanged by str.replace().
pattern = r".*@([\w.-]+\.\w{2,})"
df['domain'] = df['Email'].str.replace(pattern, r'\1', regex=True)
df.head(10)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org


#### The phone numbers

The phone numbers are in different formats: some with hyphens, others with dots, and some have extensions. To clean these up, let's:

* assume that all of them are phone numbers
* put the main numbers, without any extensions or other characters, into another column
* put the extensions into a separate column (the digits after the x)


##### The main numbers

If there is an x, I'll take the digits before it, without any hyphens or dots.

Tidying up the phone numbers: put the main numbers, without any extensions, into one column and put all the extensions into another column.

In [196]:
# Step 1: strip the extension from the phone number.
# The pattern matches an 'x' followed by any digits; replacing each match
# with '' leaves the main number, which goes into a new 'clean_phone' column.
the_extension_pattern = r"x\d*"
df['clean_phone'] = df['Phone'].str.replace(
    pat=the_extension_pattern,
    repl='',
    regex=True,
)
df.head(n=10)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,001-084-906-7849
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com,214.112.6044
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com,277.609.7938
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com,584.094.6111
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net,689-207-3558
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,001-171-649-9856
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,+1-773-151-6685
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,001-447-699-7998
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net,603-428-2429
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,+1-511-372-1544


In [197]:
# Step 2: remove hyphens, dots, spaces and round brackets from the number.
# The '+' quantifier matches runs of one or more separator characters; a '*'
# would also match the empty string at every position, which only adds
# pointless zero-length substitutions. Inside a character class '.', '(' and
# ')' need no escaping, and a trailing/leading '-' is taken literally.
otherchars_pattern = r"[-. ()]+"
df['clean_phone'] = df['clean_phone'].str.replace(otherchars_pattern, '', regex=True)
df.head(10)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,10849067849
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com,2141126044
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com,2776097938
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com,5840946111
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net,6892073558
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,11716499856
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,17731516685
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,14476997998
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net,6034282429
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,15113721544


In [198]:
# Step 3: insert the country code (+1) where the number has no international
# prefix yet.
# The negative lookahead (?!\+|00) rejects numbers that already start with
# '+' or with the "00" dialling prefix; everything else is captured in
# group 1 and gets "+1" put in front of it. Testing for "00" rather than a
# single '0' matters: a number such as 0243775391 has no international
# prefix, so it must receive "+1" here as well.
# NOTE(review): a local number whose area code itself starts with "00" cannot
# be told apart from an international prefix - confirm against the data.
prefix_pattern = r"^(?!\+|00)(.+)"
df['clean_phone'] = df['clean_phone'].str.replace(prefix_pattern, r'+1\g<1>', regex=True)
df.head(10)

# The phone numbers starting with 00 (used as the international prefix) are not changed here.

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,10849067849
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com,12141126044
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com,12776097938
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com,15840946111
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net,16892073558
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,11716499856
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,17731516685
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,14476997998
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net,16034282429
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,15113721544


In [199]:
# Step 4: turn a leading "00" international prefix into "+".
# "^00(.*)" matches numbers that start with 00 and captures the remainder in
# group 1; the replacement writes '+' followed by that group, so a number
# beginning 001... ends up beginning +1...
prefix_pattern = r"^00(.*)"
df['clean_phone'] = df['clean_phone'].str.replace(
    pat=prefix_pattern,
    repl='+\\1',
    regex=True,
)
df.head(n=10)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,10849067849
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com,12141126044
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com,12776097938
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com,15840946111
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net,16892073558
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,11716499856
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,17731516685
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,14476997998
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net,16034282429
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,15113721544


##### The extensions 

Copy the extensions of the phone numbers into a new column.


In [200]:
# First attempt: use str.replace() to delete everything up to and including
# the 'x', so that only the extension is left in a new column.
# Caveat: a number without an 'x' does not match the pattern at all, so its
# 'extension' value is just the unchanged original phone number.
extension_pattern = r"^.*x"
df['extension'] = df['Phone'].str.replace(
    pat=extension_pattern,
    repl='',
    regex=True,
)
df.head(n=10)


Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone,extension
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,10849067849,73518
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com,12141126044,4913
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com,12776097938,277.609.7938
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com,15840946111,584.094.6111
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net,16892073558,7233
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,11716499856,5553
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,17731516685,49162
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,14476997998,88612
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net,16034282429,27392
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,15113721544,8206


In [201]:
# To keep only the extension in the new column, use str.extract(): it returns
# the text captured by the group (the digits after the 'x').
# expand=False makes extract() return a Series for this single-group pattern,
# so the column assignment does not rely on pandas coercing a one-column
# DataFrame.
extension_pattern = r"x(\d+)"
df['extension'] = df['Phone'].str.extract(extension_pattern, expand=False)
df.head(10)

# The phone numbers that don't have an extension will have NaN in the new column.

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone,extension
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,10849067849,73518.0
1,2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist,example.com,12141126044,4913.0
2,3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath,example.com,12776097938,
3,4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher,example.com,15840946111,
4,5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon,example.net,16892073558,7233.0
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,11716499856,5553.0
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,17731516685,49162.0
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,14476997998,88612.0
8,9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer,example.net,16034282429,27392.0
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,15113721544,8206.0


#### Use of filters

How to filter rows in pandas based on a regular expression.

E.g. (from the lecture): find all the rows that have an international prefix.

In [202]:
# First attempt at selecting numbers with an international prefix.
# '|' has the lowest precedence, so the pattern reads as (^\+) OR (00):
# a number that starts with '+', or one that has "00" anywhere in it.
df.loc[df['Phone'].str.contains(r'^\+|00')]

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone,extension
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,10849067849,73518.0
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,11716499856,5553.0
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,17731516685,49162.0
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,14476997998,88612.0
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,15113721544,8206.0
14,15,1F0B7D65A00DAF9,Crystal,Farmer,Male,pmiranda@example.org,+1-024-377-5391,1992-03-09,Agricultural consultant,example.org,10243775391,
15,16,50Bb061cB30B461,Thomas,Knight,Female,braunpriscilla@example.net,+1-360-880-0766,2006-02-18,Sport and exercise psychologist,example.net,13608800766,
19,20,88473e15D5c3cD0,Jared,Mitchell,Female,jcortez@example.com,+1-958-849-6781,1921-01-18,Paediatric nurse,example.com,19588496781,
22,23,cBbBcA0FCA3C4Bc,Randy,Barnes,Male,huangbill@example.org,001-960-629-7164x67214,1947-12-30,Outdoor activities/education manager,example.org,19606297164,67214.0
23,24,f1f89173353aD90,Janice,Rhodes,Female,juarezdominique@example.net,001-249-314-9742x6996,1999-11-01,Drilling engineer,example.net,12493149742,6996.0


In [203]:
# With the '|' operator each alternative needs its own '^' anchor.
# Here both are anchored, so the result contains only the numbers that
# start with '+' or start with "00".
df.loc[df['Phone'].str.contains(r'^\+|^00')]

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title,domain,clean_phone,extension
0,1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer,example.net,10849067849,73518.0
5,6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer,example.org,11716499856,5553.0
6,7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst,example.com,17731516685,49162.0
7,8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist,example.com,14476997998,88612.0
9,10,8e4FB470FE19bF0,Isaiah,Downs,Male,virginiaterrell@example.org,+1-511-372-1544x8206,1964-09-20,"Engineer, site",example.org,15113721544,8206.0
14,15,1F0B7D65A00DAF9,Crystal,Farmer,Male,pmiranda@example.org,+1-024-377-5391,1992-03-09,Agricultural consultant,example.org,10243775391,
15,16,50Bb061cB30B461,Thomas,Knight,Female,braunpriscilla@example.net,+1-360-880-0766,2006-02-18,Sport and exercise psychologist,example.net,13608800766,
19,20,88473e15D5c3cD0,Jared,Mitchell,Female,jcortez@example.com,+1-958-849-6781,1921-01-18,Paediatric nurse,example.com,19588496781,
22,23,cBbBcA0FCA3C4Bc,Randy,Barnes,Male,huangbill@example.org,001-960-629-7164x67214,1947-12-30,Outdoor activities/education manager,example.org,19606297164,67214.0
23,24,f1f89173353aD90,Janice,Rhodes,Female,juarezdominique@example.net,001-249-314-9742x6996,1999-11-01,Drilling engineer,example.net,12493149742,6996.0
