In [11]:
# 7.1 String Manipulation

# Python Built-In String Object Methods

# Sample comma-separated string; note the space after each comma.
val = "a, b, Yelan"
# split() cuts on the separator only, so the leading spaces stay in the pieces.
val.split(",")

['a', ' b', ' Yelan']

In [12]:
# Split on commas, then trim the surrounding whitespace from every part.
pieces = list(map(str.strip, val.split(",")))

pieces

['a', 'b', 'Yelan']

In [13]:
# Relies on `pieces` (the stripped parts of `val`) from an earlier cell;
# run that cell first.

# Unpack the three parts into separate names.
pertama, kedua, ketiga = pieces

# Concatenate them with "::" as the delimiter using the + operator.
pertama + "::" + kedua + "::" + ketiga

'a::b::Yelan'

In [14]:
# join() inserts the delimiter between every element of `pieces`
# (defined in an earlier cell).
"|".join(pieces)

'a|b|Yelan'

In [15]:
"yelan" in val

False

In [16]:
"Yelan" in val

True

In [17]:
# index() returns the position of the first occurrence of the substring.
val.index(",")

1

In [18]:
# find() returns -1 when the substring is absent instead of raising.
val.find(":")

-1

In [19]:
# Multi-character substrings behave the same way: -1 means "not found".
val.find("::")

-1

In [22]:
val.index(":") # index raises an exception if string isn't found
               # (versus returning -1)

ValueError: substring not found

In [21]:
# Raises ValueError as well: "::" does not occur in val.
val.index("::")

ValueError: substring not found

In [None]:
val.index("|")               # index raises ValueError if the substring isn't found

ValueError: substring not found

In [None]:
# count() gives the number of non-overlapping occurrences of the substring.
val.count(",")

2

In [None]:
# replace() substitutes every occurrence of the first argument with the second.
val.replace(",","::")

'a:: b:: Yelan'

In [None]:
# Replacing with an empty string deletes the commas.
val.replace(",", "")

'a b Yelan'

In [None]:
import re

# Names separated by a mix of spaces and tabs.
text = "mona sucrose\t citlali \tganyu"
# \s+ matches one or more whitespace characters, so any run of
# spaces/tabs counts as a single separator.
re.split(r"\s+", text)


['mona', 'sucrose', 'citlali', 'ganyu']

In [None]:
# Compile the whitespace pattern once so the regex object can be reused.
regex = re.compile(r"\s+")

# Same result as re.split(r"\s+", text), but via the compiled object.
regex.split(text)

['mona', 'sucrose', 'citlali', 'ganyu']

In [None]:
# findall() returns every substring that matched the pattern: here, the whitespace runs.
regex.findall(text)

[' ', '\t ', ' \t']

In [None]:
# One "name email" pair per line.
text1 = """angga tinggarta.ahmad@gmail.com
ning mad_indra@proton.me
lisa lisa_monds@tutamail.com
navia anggabazar_st@yahoo.co.id"""

# The character classes list only uppercase letters, so the pattern needs
# re.IGNORECASE to match the lowercase addresses above.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [None]:
# NOTE: `regex` is still the whitespace pattern r"\s+" compiled earlier, so this
# returns the spaces and newlines in text1, not the email addresses.
regex.findall(text1)

[' ', '\n', ' ', '\n', ' ', '\n', ' ']

In [None]:
# Compile the email pattern case-insensitively (its character classes are uppercase-only).
regex1 = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
# findall() on the email regex returns every address found in text1.
regex1.findall(text1)

['tinggarta.ahmad@gmail.com',
 'mad_indra@proton.me',
 'lisa_monds@tutamail.com',
 'anggabazar_st@yahoo.co.id']

In [None]:
# search() returns a Match object describing the first match only.
m = regex1.search(text1)
m

<re.Match object; span=(6, 31), match='tinggarta.ahmad@gmail.com'>

In [None]:
# NOTE: `m` was produced by searching text1, so its offsets are only meaningful
# for text1; slicing `text` with them yields an unrelated fragment.
text[m.start():m.end()]

'ucrose\t citlali \tganyu'

In [None]:
# Slicing the string that was searched with the match span recovers the matched address.
text1[m.start():m.end()]

'tinggarta.ahmad@gmail.com'

In [None]:
# sub() returns a copy of text1 in which every match of the pattern is
# replaced with "REDACTED"; useful for masking sensitive information
# such as email addresses.
print(regex1.sub("REDACTED", text1))


angga REDACTED
ning REDACTED
lisa REDACTED
navia REDACTED


In [None]:
# match() only matches at the very start of the string; text1 starts with a
# name followed by a space, not an address, so this prints None.
print(regex1.match(text1))

None


In [None]:
# Rebinds `pattern` to a version with three capture groups:
# username, domain name, and domain suffix.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
regex2 = re.compile(pattern, flags=re.IGNORECASE)

m1 = regex2.match("angga@ning.keqing")
m1.groups()

# {2,4} is a quantifier: the preceding [A-Z] must repeat at least 2 and at
# most 4 times, which is why the suffix "keqing" is captured only as "keqi".
# {2.4} (with a dot) is not a valid quantifier and does not express a range.

('angga', 'ning', 'keqi')

In [None]:
# With capture groups in the pattern, findall() returns one tuple of groups per match.
regex2.findall(text1)

[('tinggarta.ahmad', 'gmail', 'com'),
 ('mad_indra', 'proton', 'me'),
 ('lisa_monds', 'tutamail', 'com'),
 ('anggabazar_st', 'yahoo.co', 'id')]

In [None]:
# \1, \2 and \3 in the replacement refer to the first, second and third capture groups.
print(regex2.sub(r"Username: \1, Domain: \2, TLD: \3", text1))

angga Username: tinggarta.ahmad, Domain: gmail, TLD: com
ning Username: mad_indra, Domain: proton, TLD: me
lisa Username: lisa_monds, Domain: tutamail, TLD: com
navia Username: anggabazar_st, Domain: yahoo.co, TLD: id


In [None]:
# String Functions in Pandas
import pandas as pd
import numpy as np

# Series of email addresses keyed by name; "Yelan" has no address,
# so that entry is a missing value.
data = pd.Series(
    {
        "Angga": "tinggarta.ahmad@gmail.com",
        "Ning": "mad_indra@proton.me",
        "Lisa": "lisa_monds@tutamail.com",
        "Yelan": np.nan,
    }
)

In [None]:
data 

Angga    tinggarta.ahmad@gmail.com
Ning           mad_indra@proton.me
Lisa       lisa_monds@tutamail.com
Yelan                          NaN
dtype: object

In [None]:
# isna() flags the missing entry.
data.isna()
# String Functions in Pandas
# pandas exposes string methods on a Series through its .str accessor.

Angga    False
Ning     False
Lisa     False
Yelan     True
dtype: bool

In [None]:
# notna() is the element-wise opposite of isna().
data.notna()

Angga     True
Ning      True
Lisa      True
Yelan    False
dtype: bool

In [None]:
# .str methods are applied element-wise; the missing value stays NaN.
data.str.lower()

Angga    tinggarta.ahmad@gmail.com
Ning           mad_indra@proton.me
Lisa       lisa_monds@tutamail.com
Yelan                          NaN
dtype: object

In [None]:
# On this object-dtype Series the missing entry stays NaN, so the result
# has dtype object rather than bool.
data.str.contains("gmail")

Angga     True
Ning     False
Lisa     False
Yelan      NaN
dtype: object

In [None]:
# astype(str) stringifies every value, including the missing one,
# which becomes the text 'nan'.
data_as_str_ext = data.astype(str)

data_as_str_ext.str.lower()

Angga    tinggarta.ahmad@gmail.com
Ning           mad_indra@proton.me
Lisa       lisa_monds@tutamail.com
Yelan                          nan
dtype: object

In [None]:
data_as_str_ext  # after astype(str) the missing value is the literal string 'nan', not a real missing value

Angga    tinggarta.ahmad@gmail.com
Ning           mad_indra@proton.me
Lisa       lisa_monds@tutamail.com
Yelan                          nan
dtype: object

In [None]:
# The 'string' extension dtype keeps the missing value as <NA> instead of
# turning it into text.
data_as_str_ext = data.astype('string')
data_as_str_ext

Angga    tinggarta.ahmad@gmail.com
Ning           mad_indra@proton.me
Lisa       lisa_monds@tutamail.com
Yelan                         <NA>
dtype: string

In [None]:
# With the string dtype, contains() yields the nullable boolean dtype and
# <NA> for the missing entry.
data_as_str_ext.str.contains("gmail")

Angga     True
Ning     False
Lisa     False
Yelan     <NA>
dtype: boolean

In [None]:

# `pattern` is the grouped email regex, so each element becomes a list of
# (username, domain, suffix) tuples; the missing entry stays NaN.
data.str.findall(pattern, flags=re.IGNORECASE)

Angga    [(tinggarta.ahmad, gmail, com)]
Ning           [(mad_indra, proton, me)]
Lisa       [(lisa_monds, tutamail, com)]
Yelan                                NaN
dtype: object

In [None]:
# .str[0] takes the first element of each list, i.e. the single match tuple
# found for each address.
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]

matches

Angga    (tinggarta.ahmad, gmail, com)
Ning           (mad_indra, proton, me)
Lisa       (lisa_monds, tutamail, com)
Yelan                              NaN
dtype: object

In [None]:
# .str.get(i) picks element i from each tuple: index 1 is the domain part
# (index 0 would be the username).
matches.str.get(1)

Angga       gmail
Ning       proton
Lisa     tutamail
Yelan         NaN
dtype: object

In [None]:
# The .str accessor also supports slicing: the first five characters of each string.
data.str[:5]

Angga    tingg
Ning     mad_i
Lisa     lisa_
Yelan      NaN
dtype: object

In [None]:
# extract() returns the pattern's capture groups as the columns of a DataFrame,
# one row per element of the Series.
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Angga,tinggarta.ahmad,gmail,com
Ning,mad_indra,proton,me
Lisa,lisa_monds,tutamail,com
Yelan,,,


In [None]:
# cat -- The 'str.cat' method in Pandas concatenates strings
#        element-wise, optionally inserting a delimiter.
#        (It is unrelated to the Linux/Unix 'cat' shell command, which is used to
#        concatenate and display file contents.)
#        In Python, you can use the '+' operator or 'str.join' on plain strings.

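# contains -- returns a boolean Series indicating whether
#             each string contains the specified substring (or regex pattern).
#             In Pandas, you can use the 'str.contains' method
#             on a Series to test every element at once.
#             In Python, you can use the 'in' operator on a string object.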

# count -- The 'count' method in Pandas counts 
#           the number of occurrences 
#           of a substring in each string of a Series.
#           It returns a Series of integers.
#           In Python, you can use the 'count' 
#           method on a string object.

# extract -- The 'extract' method in Pandas
#            extracts substrings from each string in a Series
#            based on the capture groups of a regular expression pattern.
#            It returns a DataFrame with the extracted substrings.
#            In Python, you can use the 're' module
#            to extract substrings, e.g. 're.search' followed by
#            the match object's 'groups' method.

