In [372]:
from datetime import datetime
from string import Template

# Introduction to String Manipulation

## String splitting

In [373]:
my_string = "This is my string to test on"

In [374]:
my_string.split(" ", maxsplit=2)

['This', 'is', 'my string to test on']

In [375]:
my_string.rsplit(" ", maxsplit=2)

['This is my string to', 'test', 'on']

In [376]:
my_string.split(" ") # without maxsplit, the string is split at every separator, giving as many substrings as possible

['This', 'is', 'my', 'string', 'to', 'test', 'on']

## String splitlines

In [377]:
text = "This string will be split \n in two lines"

In [378]:
text.splitlines()

['This string will be split ', ' in two lines']

## Joining strings

In [379]:
emails = ['servin@gmail.com','camila@gmail.com','baker@gmail.com','bella@gmail.com']

In [380]:
formatted_emails = ";".join(emails)
formatted_emails

'servin@gmail.com;camila@gmail.com;baker@gmail.com;bella@gmail.com'

## Stripping Characters (from left and/or right)

In [381]:
string1 = " This is a string with white space \n at the end \n"

In [382]:
string1

' This is a string with white space \n at the end \n'

In [383]:
string1.strip()

'This is a string with white space \n at the end'

In [384]:
string1.lstrip()

'This is a string with white space \n at the end \n'

# Finding and Replacing

## string find method

In [385]:
my_string

'This is my string to test on'

In [386]:
my_string.find('h') # returns lowest/first index in the string where the substring can be found

1

In [387]:
my_team = 'Jose Baker Camila Bella Baker'

In [388]:
my_team.find('Baker')

5

In [389]:
my_team.find('Joe') # -1 is returned when a substring cannot be found

-1

## Index function

In [390]:
my_team.index('Baker')

5

In [391]:
# Unlike find(), which returns -1, index() raises ValueError when the
# substring is absent, so the lookup is wrapped in try/except.
try:
    my_team.index('Joe')
except ValueError:
    print('Not found')

Not found


## Counting substrings in a string

In [392]:
my_team

'Jose Baker Camila Bella Baker'

In [393]:
my_team.count('Baker')

2

## Replacing substrings

In [394]:
my_team.replace('Baker','GoodBoy', 2)

'Jose GoodBoy Camila Bella GoodBoy'

In [395]:
my_team.replace('Baker','GoodBoy', 1)

'Jose GoodBoy Camila Bella Baker'

# String Formatting

## Positional Formatting

In [396]:
print("I studied at {} and learned {}.".format("Texas A&M",'ID'))

I studied at Texas A&M and learned ID.


In [397]:
school  = "Texas A&M"
major = "ID"
year = 2019
print("I studied at {} and majored in {}. ".format(school, major))

I studied at Texas A&M and majored in ID. 


In [398]:
print("I studied at {1} and majored in {0}. ".format(school, major))

I studied at ID and majored in Texas A&M. 


In [399]:
print("I studied at {location} and majored in {program}. ".format(location=school, program= major))

I studied at Texas A&M and majored in ID. 


In [400]:
my_data_dict = {
    'name':'Jose',
    'age':25
}
# Inside a str.format() replacement field the dictionary key is written
# without quotes: {data[name]}, not {data['name']}.
print(" My name is {data[name]}. I am {data[age]} years old".format(data=my_data_dict))

 My name is Jose. I am 25 years old


## Format specifier

In [401]:
print("Only {0:f}% of the {1} produced {2} is analyzed".format(.89348, 'data','in the world'))

Only 0.893480% of the data produced in the world is analyzed


In [402]:
car = 'Honda'
rate = 2.65
print("This {0} has an interest rate of {1:f} %".format(car, rate))

This Honda has an interest rate of 2.650000 %


In [403]:
print("This {0} has an interest rate of {1:.2f} %".format(car, rate))

This Honda has an interest rate of 2.65 %


## Formatting datetime

In [404]:
print(datetime.now())

2022-03-09 19:41:07.898792


In [405]:
statement = "Today's date is {:%Y-%m-%d %H:%M}".format(datetime.now())
print(statement)

Today's date is 2022-03-09 19:41


# Formatted String Literal

## f-strings

In [406]:
name = 'Jose Servin'
book = 'Learn Python'
print(f"My name is {name}")

My name is Jose Servin


In [407]:
print(f"My favorite book is {book!r}")

My favorite book is 'Learn Python'


In [408]:
number = 34.123
print(f"My number is {number:.2f}")

My number is 34.12


In [409]:
now = datetime.now()
# Everything after the colon, including the leading space, is the date format
# spec; that space is what separates "is" from the month name in the output.
print(f"Today is{now: %B %d, %Y}")

Today is March 09, 2022


## index lookup using f-strings

In [410]:
my_data_dict = {
    'name':'Jose',
    'age':25
}
print(f"Is your name {my_data_dict['name']} ?")

Is your name Jose ?


## f-string calling functions

In [411]:
def quick_add(num1, num2):
    """Return the result of ``num1 + num2``."""
    total = num1 + num2
    return total

In [412]:
print(f"10 plus 10 is equal to {quick_add(10, 10)}")

10 plus 10 is equal to 20


# Template Method

In [413]:
my_template = Template("Data Science is $identifier")
my_template.substitute(identifier='a great resource!')

'Data Science is a great resource!'

In [414]:
name = 'Jose Servin'
yrs_old = 25
about_me = Template('Howdy! My name is $name. I am $age years old.')
about_me.substitute(name=name, age=yrs_old)

'Howdy! My name is Jose Servin. I am 25 years old.'

## Template substitution

In [415]:
# ${noun} uses braces so the placeholder name ends before the literal "ing";
# the bare $noun form is used where the name is not followed by more letters.
my_statement = Template("I find DataCamp very ${noun}ing but my sister has lost $noun.")
my_statement.substitute(noun='interest')

'I find DataCamp very interesting but my sister has lost interest.'

## Template substitution dollar amount

In [416]:
# "$$" is the escape for a literal dollar sign, so "$$$amount" renders as
# "$" followed by the substituted value of amount.
my_statement = Template("I paid $$$amount")
my_statement.substitute(amount=12)

'I paid $12'

## Safe substitution

In [417]:
# safe_substitute() fills in the placeholders it has values for and leaves
# the rest (here $other) untouched in the result.
favorite = {'flavor': 'vanilla'}
my_statement = Template('My favorite cake is $flavor and $other.')
my_statement.safe_substitute(**favorite)

'My favorite cake is vanilla and $other.'

# Regular Expressions

1. Definition: a string that combines normal characters with special metacharacters to describe a pattern, used to find text or positions within a text.
2. Pattern: written as a raw string (`r'...'`), as in the example below.

In [418]:
my_regex = r'st\d\s\w{3,10}'
# st      the literal characters "st"
# \d      one digit
# \s      one whitespace character
# \w      one word character
# {3,10}  quantifier: applies to the token immediately to its left (\w here), which must repeat 3 to 10 times

## Using the re module

In [419]:
import re

### find all matches of a pattern

In [420]:
my_string = "Love #movies! I had fun yesterday going to the #movies"
my_regex = r'#movies'


In [421]:
re.findall(my_regex, my_string)

['#movies', '#movies']

### Splitting a string at each match

In [422]:
my_string = "Yesterday! I went to the movies! And had lots of fun!"
my_regex = r'!'

In [423]:
re.split(my_regex, my_string)

['Yesterday', ' I went to the movies', ' And had lots of fun', '']

### Replace one or many matches with a string

In [424]:
my_regex = r'Yesterday'
re.sub(my_regex, "Two days ago", my_string)

'Two days ago! I went to the movies! And had lots of fun!'

## Supported metacharacters

### digit

In [425]:
# \d matches a single digit, so only the "User" names that end in a digit are returned.
my_regex = r'User\d'
winners = "The winners are, User1, User2, User5, User6 and UserA"
re.compile(my_regex).findall(winners)

['User1', 'User2', 'User5', 'User6']

### Non-digit search

In [426]:
# \D matches a single non-digit character, so only "UserA" qualifies.
my_regex = r'User\D'
winners = "The winners are, User1, User2, User5, User6 and UserA"
[hit.group(0) for hit in re.finditer(my_regex, winners)]

['UserA']

### Word-character search (matches both letters and digits)

In [427]:
# \w matches a single word character, so digit and letter suffixes are both returned.
my_regex = r'User\w'
winners = "The winners are, User1, User2, User5, User6 and UserA"
re.findall(pattern=my_regex, string=winners)

['User1', 'User2', 'User5', 'User6', 'UserA']

### non-word search (finding characters)

In [428]:
winners = "The winners are, User1, User2, User5, User6 and UserA! They have won $450"
my_regex = r'\W\d' # a non-word character (\W) immediately followed by a digit (\d)
re.findall(my_regex, winners)

['$4']

In [429]:
my_statement = "I live in Houston Texas"
my_regex = r"Houston\sTexas"
re.findall(my_regex, my_statement)

['Houston Texas']

In [430]:
my_statement = "She is eating ice-cream"
my_regex = r"ice\Scream"
re.findall(my_regex, my_statement)

['ice-cream']

## Matching repeated characters

In [431]:
password = "password1234"
re.search(r"\w\w\w\w\w\w\w\w\d\d\d\d", password)

<re.Match object; span=(0, 12), match='password1234'>

## Matching repeated characters using quantifiers

In [432]:
re.search(r"\w{8}\d{4}", password)

<re.Match object; span=(0, 12), match='password1234'>

In [433]:
# \d+ means "one or more digits", so both "4-3" and the zero-padded "10-04" match.
my_regex = r"\d+-\d+"
my_statement = "Date of start: 4-3. Date of registration 10-04"
re.compile(my_regex).findall(my_statement)

['4-3', '10-04']

In [434]:
# "@", one or more word characters, zero or more non-word characters (\W*),
# then one or more word characters again.
my_regex = r"@\w+\W*\w+"
users = "@ameli!a, @john&&n, @mary90"
re.findall(pattern=my_regex, string=users)

['@ameli!a', '@john&&n', '@mary90']

In [435]:
# "u?" makes the "u" optional, so both spellings are found.
my_regex = r"colou?r"
my_statement = "The color of this is blue. The other colour is red. "
[hit.group(0) for hit in re.finditer(my_regex, my_statement)]

['color', 'colour']

In [436]:
# The leading group accepts one to three digits so that a prefix such as
# "011" is matched whole instead of the match starting at its second digit.
# {2,3} and {4,} likewise allow the variable-length middle and final groups.
phone_numbers = "Baker: 011-800-45-12345, Camila: 12-832-000-0000"
my_regex = r"\d{1,3}-\d{3}-\d{2,3}-\d{4,}"
re.findall(my_regex, phone_numbers)

['011-800-45-12345', '12-832-000-0000']

## Regex Metacharacters

## Finding a match using regex

In [437]:
my_statement = "4506 people attended the show"
my_regex = r'\d{4}'
re.search(my_regex, my_statement)

<re.Match object; span=(0, 4), match='4506'>

## Regex match is anchored to the first character of a string

In [438]:
re.match(my_regex, my_statement)

<re.Match object; span=(0, 4), match='4506'>

In [439]:
my_statement = "Yesterday, I saw 3 movies. "
my_regex = r'\d+'
re.search(my_regex, my_statement)

<re.Match object; span=(17, 18), match='3'>

In [440]:
# No output here: match() only tries the pattern at the very start of the
# string, and this string starts with "Yesterday" rather than a digit.
re.match(my_regex, my_statement)

## Finding links

In [441]:
my_statement = "Just check out this link: www.google.com to find out more"
# "." matches any character and "+" repeats it one or more times, so the
# pattern reads: "www", then one or more of anything, then "com".
my_regex = r"www.+com"
re.findall(my_regex, my_statement)

['www.google.com']

## Finding a pattern anywhere in a string (no anchor)

In [442]:
my_statement = "the 80s and the 90s were years ago"
my_regex = r"the\s\d+s"
re.findall(my_regex, my_statement)

['the 80s', 'the 90s']

## Using the anchor character

In [445]:
my_statement = "the 80s and the 90s were years ago"
# "^" anchors the pattern to the start of the string, so only the leading "the 80s" can match.
my_regex = r"^the\s\d+s"
re.findall(my_regex, my_statement)

['the 80s']

In [448]:
my_statement = "the 80s and the 90s"
# "$" anchors the pattern to the end of the string, so only the trailing "the 90s" can match.
my_regex = r"the\s\d+s$"
re.findall(my_regex, my_statement)

['the 90s']

## Special escape character to find Mr.Go.

In [452]:
my_statement = "I love music from Mr.Go. however the sound was not good"
# An unescaped "." matches any character, so every "<char><whitespace>" pair is
# a separator and the last letter of each word is consumed by the split.
my_regex = r".\s"
re.split(my_regex, my_statement)

['', 'lov', 'musi', 'fro', 'Mr.Go', 'howeve', 'th', 'soun', 'wa', 'no', 'good']

In [453]:
my_statement = "I love music from Mr.Go. however the sound was not good"
# "\." escapes the period so it matches a literal "."; the string is split only
# where a period is followed by whitespace.
my_regex = r"\.\s"
re.split(my_regex, my_statement)

['I love music from Mr.Go', 'however the sound was not good']

## OR regex operator

In [456]:
# "|" is the OR operator: either alternative on its two sides may match.
my_regex = r"Elephant|elephant"
my_statement = "Elephants are big and i hope to see an elephant one day"
re.compile(my_regex).findall(my_statement)

['Elephant', 'elephant']

## Finding full alphabet patterns

In [457]:
# [a-zA-Z]+ matches one or more letters of either case; \d requires one trailing digit.
my_regex = r"[a-zA-Z]+\d"
my_statement = "The users are: jose3, Baker1, Cami4"
re.findall(pattern=my_regex, string=my_statement)

['jose3', 'Baker1', 'Cami4']

## Replacing non-word characters

In [462]:
# [@#%] is a character set: each "@", "#" or "%" is replaced by a single space.
my_regex = r"[@#%]"
my_statement = "My st@tement is fu##y of spec%ial charact#ers."
re.compile(my_regex).sub(' ', my_statement)

'My st tement is fu  y of spec ial charact ers.'

## Negative regex condition search

In [464]:
# [^0-9] is a negated character set: it matches any character that is NOT a
# digit. The link containing "313" therefore cannot match, and only
# www.google.com is returned.
my_statement = "Some of the links are www.313.com and www.google.com"
my_regex = r"www[^0-9]+com"
re.findall(my_regex, my_statement)

# Greedy vs Non-greedy matching