In [1]:
# Regular expressions
expressions <- c("grep", "grepl", "sub", "gsub", "regexpr", "gregexpr", "regmatches", 
                 "strtrim", "agrep", "strsplit")

In [2]:
# grep: detecting
# search for pattern in vector
# returns found elements
grep(pattern="B", x=c("A", "B", "C", "D"))

In [3]:
x <- c("LStat","lstat","Leuven Statistics Research Center", "BNOSAC", "Belgium", "Waffels in Belgium", "Manneken Pis @ Atomium")
grep(pattern="stat", x=x, ignore.case=TRUE)

In [4]:
grep(pattern="stat", x=x, ignore.case=TRUE, invert=TRUE)

In [6]:
grep(pattern="lstat", x=x, ignore.case=TRUE, value=TRUE)

In [7]:
grepl(pattern="lstat", x=x, ignore.case=TRUE)

In [8]:
# ^ means starts with
# $ means ends with
# | means or

x <- c("LStat","lstat","Leuven Statistics Research Center", "BNOSAC", "Belgium", "Waffels in Belgium", "Manneken Pis @ Atomium")
grep(pattern="^Bel", x=x, value=TRUE)

In [9]:
grep(pattern="stat$", x=x, value=TRUE)

In [10]:
grep(pattern="^Bel|Stat$", x=x, value=TRUE)

In [11]:
#Other metacharacters
#. matches everything except for the empty string ””.
#+ The preceding item will be matched one or more times.
#∗ The preceding item will be matched zero or more times.
#? The preceding item is optional and will be matched at most once.
#{n} The preceding item is matched exactly n times.
#{n,} The preceding item is matched n or more times.
#{n,m} The preceding item is matched at least n times, but not more than m times.
#ˆ matches the empty string at the at the beginning of a line. When used in a character class (see explanation about character classes in the following slides) means to match any character but the following ones. 
# use () for grouping and [,] for character class brackets

In [13]:
print(grep(pattern="Go{4}l", x=c("Gooool","Goooool"), value=TRUE))
print(grep(pattern="Go+a", x=c("Gooooaaal","Goooooobbbl"), value=TRUE))
print(grep(pattern="^GG?o*a", x=c("Gooooaaal","GGooooaaal"), value=TRUE))
print(grep(pattern="G.*a", x=c("Gooooaaal","Gooaaal"), value=TRUE))
print(grep(pattern="(Ik haat){2}", x=c("Ik haat smurfen","Ik haatIk haat smurfen"), value=TRUE))
# don't forget to escape the metacharacters with \\

[1] "Gooool"
[1] "Gooooaaal"
[1] "Gooooaaal"  "GGooooaaal"
[1] "Gooooaaal" "Gooaaal"  
[1] "Ik haatIk haat smurfen"


In [14]:
# character classess
# example: [a-zA-Z]
# [:lower:] lower case in the current locale
# [:upper:]
# alphabetic characters [:alpha:]
# space [:space:]
# digits [:digit:]
# punctuation chars [:punct:]

# examples
input1 <- c("nose", "letter38", "window9", "apple0")
print(grep("[[:digit:]]", input1, value = TRUE))
print(grep("[nco]", input1, value = TRUE))
print(grep("[39]", input1, value = TRUE))

input2 <- c("abcdef", "ABCDEFG", "IJK")
print(grep("[a-cA-D]", input2, value = TRUE))
print(grep("[[:lower:]]", input2, value = TRUE))

[1] "letter38" "window9"  "apple0"  
[1] "nose"    "window9"
[1] "letter38" "window9" 
[1] "abcdef"  "ABCDEFG"
[1] "abcdef"


In [15]:
# Replacing

# sub : replacement of the first match
# gsub : replacement of all matches

print(gsub(pattern="Statistics", replacement="Statistiek", x="Leuven Statistics Research"))
print(gsub(pattern="Sta|Research", replacement="", x="Leuven Statistics Research"))
print(gsub(pattern=" +", replacement=" ", x="Leuven Statistics"))
print(sub(pattern="\\.", replacement="", x="abc...def"))
print(gsub(pattern="\\.", replacement="", x="abc...def"))

[1] "Leuven Statistiek Research"
[1] "Leuven tistics "
[1] "Leuven Statistics"
[1] "abc..def"
[1] "abcdef"


In [17]:
# backreference to replace with (group)
x <- "I want to break free"
print(gsub(pattern = "(.+)(break)(.+)", replacement = "\\1", x))
print(gsub(pattern = "(.+)(break)(.+)", replacement = "\\2", x))
print(gsub(pattern = "(.+)(break)(.+)", replacement = "\\3", x))

[1] "I want to "
[1] "break"
[1] " free"


In [18]:
# regexpr and gregexpr to have infromation on where matches are found
# regexpr gives the starting position of the first match
# gregexpr gives the starting positions of every match

print(gregexpr(pattern="\\.", text="abc...def"))
txt <- "Leuven Statistics Research Center" 
print(gregexpr(pattern="Sta|Research", text=txt))
print(regexpr(pattern="Sta|Research", text=txt))

[[1]]
[1] 4 5 6
attr(,"match.length")
[1] 1 1 1
attr(,"useBytes")
[1] TRUE

[[1]]
[1]  8 19
attr(,"match.length")
[1] 3 8
attr(,"useBytes")
[1] TRUE

[1] 8
attr(,"match.length")
[1] 3
attr(,"useBytes")
[1] TRUE


In [20]:
# result of gregexpr can be fed to regmatches to extract or replace the text
txt <- "Leuven Statistics Research Center"
print(regmatches(x=txt, gregexpr(pattern="Sta.+ |Research", text=txt), invert=FALSE))
print(regmatches(x=txt, gregexpr(pattern="Sta.+ |Research", text=txt), invert=TRUE))
# TODO: the following one does not work as in the example
print(regmatches(x=txt, regexpr(pattern="Statis", text=txt)) <- "Analy")

[[1]]
[1] "Statistics Research "

[[1]]
[1] "Leuven " "Center" 

[1] "Analy"


In [21]:
# strsplit splits text by character or expression
print(strsplit(x=c("abc. def", ""), split=" "))
print(strsplit(x=c("abc. def", ""), split="\\."))

[[1]]
[1] "abc." "def" 

[[2]]
character(0)

[[1]]
[1] "abc"  " def"

[[2]]
character(0)



In [22]:
# substr and strtrim to extract element from character data
print(substr(x=c("abc. def", "123456789"), start=3, stop=4))
print(strtrim(x=c("abc", "123456789"), width=4))

[1] "c." "34"
[1] "abc"  "1234"


In [45]:
# Exercises

d1 <- data.frame(id...of....patient = c(1, 2), patient....age = c(1, 2))
d <- data.frame( id = c(11, 22, 33, 44, 55, 66, 77), drug = c("vitamin E", "vitamin ESTER-C", " vitamin E ", "vitamin E(ointment)", "", "provitamin E\n", "vit E"), text = c("", " ", "3 times a day after meal", "once a day", " ", "\t", "\n "), stringsAsFactors = FALSE)


In [46]:
# Exercise 1
# replace teh leading and trailing spaces

trim <- function(input) {
    gsub(pattern = "^([:space:]+)([:alpha:]+.+[:alpha:]+)([:space:]+)$", replacement = "\\2", input)
}
d[["drug"]] <- trim(d[["drug"]])

In [47]:
# Exercise 2: replace the names of columns with good ones, using regex
names(d1)

In [48]:
names(d1) <- gsub(pattern = "(\\.)+", replacement = " ", names(d1))
names(d1)

In [49]:
# Exercise 3 : Define drugs with the character strings with only spaces as missing data (NA)
d[["drug"]][grepl(pattern="^[:space:]*$", x=d[["drug"]], ignore.case=TRUE)] <- NA
d[["drug"]]

In [51]:
# exercise 4 : get all drugs with 'vitamin E' only
d[["drug"]][grepl(pattern = "^vitamin E$", x=d[["drug"]])]

In [52]:
# exercise 5: extract email provider from email
email <- "my.email@hotmail.com"

gsub(pattern = "^(.+)@(.+)$", x=email, replacement = "\\2")