2.1

In [None]:
#You need to split a string into fields, but the delimiters (and spacing around them) aren’t consistent throughout the string.
import re
string = 'This string   ,was meant to be   ,split by a space'
split_str = re.split(r' |,', string)
#returns a list structure once split function is called
print(split_str)

['This', 'string', '', '', '', 'was', 'meant', 'to', 'be', '', '', '', 'split', 'by', 'a', 'space']


2.2

In [None]:
#You need to check the start or end of a string for specific text patterns, such as filename extensions, URL schemes, and so on.

def confirm_txtfile(filepath):
  split_path = re.split(r'/', filepath)
  if split_path[-1].endswith('.txt'):
    return True
  else:
    return False

confirm_txtfile('this/is/my/file/path.csv')

False

2.3

In [None]:
#You want to match text using the same wildcard patterns as are commonly used when working in Unix shells (e.g., .py, Dat[0-9].csv, etc.).
from fnmatch import fnmatch, fnmatchcase
def confirm_txtfile(filepath):
  split_path = re.split(r'/', filepath)
  if fnmatch(split_str[-1], '*.txt'):
    return True
  else:
    return False

confirm_txtfile('this/is/my/file/path.txt')


False

2.4

In [None]:
#You want to match or search text for a specific pattern.

social_media_post = 'This contains naughty nasty words. Doo, Poo, Sludge. This is what I think of that bands performance.'
def check_obscene_lang(post):
  post.lower()
  if post == 'doo' or 'poo' or 'sludge':
    return True
  else:
    False

check_obscene_lang(social_media_post)

True

2.5

In [None]:
#You want to search for and replace a text pattern in a string.
social_media_post = "This contains nasty words. Doo, Poo, Sludge. This is what I think of that bands performance."
clean_post=social_media_post.lower().replace("Doo", "a").replace("Doo", "a").replace("Sludge", "a")
print(clean_post)
def clean_obscene_lang(post):
  clean_post = post.lower()
  clean_post = clean_post.replace("doo", " ").replace("poo", " ").replace("sludge", " ")
  return clean_post
print(clean_obscene_lang(social_media_post))

# replace all instances of 'o' with 'a'
new_string = string.replace("r", "e" )
 
print(string)
print(new_string)

this contains nasty words. doo, poo, sludge. this is what i think of that bands performance.
this contains nasty words.  ,  ,  . this is what i think of that bands performance.
grrks FOR grrks
geeks FOR geeks


2.6

In [None]:
#You need to search for and possibly replace text in a case-insensitive manner.
upper_case_text = 'THIS IS ALL UPPER CASE.'
if re.findall('upper', upper_case_text, flags=re.IGNORECASE):
  print(re.sub('UPPER', 'lower', upper_case_text, flags=re.IGNORECASE))


THIS IS ALL lower CASE.


2.7

In [None]:
#You’re trying to match a text pattern using regular expressions, but it is identifying the longest possible matches of a pattern. 
#Instead, you would like to change it to find the shortest possible match.

#find things contained within quotation marks
import re
text = 'This text has multiple " " " " " " " ".'
string_pattern = re.compile(r'\"(.*?)\"')
string_pattern.findall(text)


[' ', ' ', ' ', ' ']

2.8

In [None]:
#You’re trying to match a block of text using a regular expression, but you need the match to span multiple lines.
data = '''/* this is a
               This is line one
               This line 2.
               This is line 3 */
... '''
pattern = re.compile(r'/\*((?:.|\n)*?)\*/', re.DOTALL)
pattern.findall(data)

[' this is a\n               This is line one\n               This line 2.\n               This is line 3 ']

2.9

In [None]:
#You’re working with Unicode strings, but need to make sure that all of the strings have the same underlying representation.
import unicodedata as ud
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
print(s1 == s2)
def normalize_unicode(string):
  normalized_str = ud.normalize('NFC', string)
  return normalize_unicode

normalized_s1 = normalize_unicode(s1)
normalized_s2 = normalize_unicode(s2)
print(normalized_s1 == normalized_s2)

False
True


2.10

In [None]:
#You are using regular expressions to process text, but are concerned about the handling of Unicode characters.

pat = arabic = re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')
s = 'straße'
pat.match(s)              
pat.match(s.upper())      
s.upper()                





'STRASSE'

2.11

In [None]:
#You want to strip unwanted characters, such as whitespace, from the beginning, end, or middle of a text string.
str_data = '    --This string is dirty.  = -- '
def clean_string(string):
  string = string.strip()
  string = string.replace('-', '').replace('=', '')
  return string

print(clean_string(str_data))



This string is dirty.   


2.12

In [None]:
#Some bored script kiddie has entered the text “pýtĥöñ” into a form on your web page and you’d like to clean it up somehow.
import unicodedata
import sys
data = s = 'pýtĥöñ\fis\tawesome\r\n'

def clean_unicode(data):
  remap = {ord('\t') : ' ', ord('\f') : ' ', ord('\r') : None, ord('\n'): ' ', }
  data_mapped = data.translate(remap)
  cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))
  data_normalized2 = unicodedata.normalize('NFD', data_mapped)
  data_translated = data_normalized2.translate(cmb_chrs)
  return data_translated

print(clean_unicode(data))

python is awesome 


2.13

In [37]:
#You need to format text with some sort of alignment applied.
text = 'This text will be centered.'
text.ljust(10)
text.rjust(10)
text.center(1000, '*')
print(text)

This text will be centered.


2.14

In [39]:
#You want to combine many small strings together into a larger string.
split_parts = ['Is', 'Chicago', 'Not', 'Chicago?']
together = ' '.join(split_parts)
print(together)

Is Chicago Not Chicago?


2.15

In [41]:
#You want to create a string in which embedded variable names are substituted with a string representation of a variable’s value.
data = 'This {blank1} will be {blank2}.'
print(data.format(blank1='string', blank2='embedded'))

This string will be embedded.


2.16

In [46]:
#You have long strings that you want to reformat so that they fill a user-specified number of columns.
import textwrap
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."
print(textwrap.fill(s, 30))

Look into my eyes, look into
my eyes, the eyes, the eyes,
the eyes, not around the eyes,
don't look around the eyes,
look into my eyes, you're
under.


2.17

In [47]:
#ou want to replace HTML or XML entities such as &entity; or &#code; with their corresponding text. Alternatively, you need to produce text, but escape certain characters (e.g., <, >, or &).
import html
s = 'Elements are written as "<tag>text</tag>".'
print(html.escape(s))

Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.


2.18

In [57]:
#You have a string that you want to parse left to right into a stream of tokens.
#!pip install nltk
#nltk.download('punkt')
import nltk 
sentence = 'This sentence will be tokenized.'
tokenized_sentence = nltk. word_tokenize(sentence)
print(tokenized_sentence)

['This', 'sentence', 'will', 'be', 'tokenized', '.']


2.19

In [61]:
#You need to parse text according to a set of grammar rules and perform actions or build an abstract syntax tree representing the input. The grammar is small, so you’d prefer to just write the parser yourself as opposed to using some kind of framework.

import re
import collections

# Token specification
NUM    = r'(?P<NUM>\d+)'
PLUS   = r'(?P<PLUS>\+)'
MINUS  = r'(?P<MINUS>-)'
TIMES  = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS     = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
DIVIDE, LPAREN, RPAREN, WS]))

# Tokenizer
Token = collections.namedtuple('Token', ['type','value'])

def generate_tokens(text):
  scanner = master_pat.scanner(text)
  for m in iter(scanner.match, None):
    tok = Token(m.lastgroup, m.group())
    if tok.type != 'WS':
      yield tok

# Parser
class ExpressionEvaluator:
  '''
  Implementation of a recursive descent parser.   Each method
  implements a single grammar rule.  Use the ._accept() method
  to test and accept the current lookahead token.  Use the ._expect()
  method to exactly match and discard the next token on the input
  (or raise a SyntaxError if it doesn't match).
  '''

  def parse(self,text):
      self.tokens = generate_tokens(text)
      self.tok = None             # Last symbol consumed
      self.nexttok = None         # Next symbol tokenized
      self._advance()             # Load first lookahead token
      return self.expr()

  def _advance(self):
      'Advance one token ahead'
      self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

  def _accept(self,toktype):
      'Test and consume the next token if it matches toktype'
      if self.nexttok and self.nexttok.type == toktype:
          self._advance()
          return True
      else:
          return False

  def _expect(self,toktype):
      'Consume next token if it matches toktype or raise SyntaxError'
      if not self._accept(toktype):
          raise SyntaxError('Expected ' + toktype)

  # Grammar rules follow

  def expr(self):
      "expression ::= term { ('+'|'-') term }*"

      exprval = self.term()
      while self._accept('PLUS') or self._accept('MINUS'):
          op = self.tok.type
          right = self.term()
          if op == 'PLUS':
              exprval += right
          elif op == 'MINUS':
              exprval -= right
      return exprval

  def term(self):
      "term ::= factor { ('*'|'/') factor }*"

      termval = self.factor()
      while self._accept('TIMES') or self._accept('DIVIDE'):
          op = self.tok.type
          right = self.factor()
          if op == 'TIMES':
              termval *= right
          elif op == 'DIVIDE':
              termval /= right
      return termval

  def factor(self):
      "factor ::= NUM | ( expr )"

      if self._accept('NUM'):
          return int(self.tok.value)
      elif self._accept('LPAREN'):
          exprval = self.expr()
          self._expect('RPAREN')
          return exprval
      else:
          raise SyntaxError('Expected NUMBER or LPAREN')

express_eval = ExpressionEvaluator()
express_eval.parse('2 * 3')

6

2.20

In [85]:
#You want to perform common text operations (e.g., stripping, searching, and replacement) on byte strings.
data = b'String in byte format'
target = b'i'
a = data.decode('ascii')
b = target.decode('ascii')
#stripping
print(data[6:-6]) # wanted to get the two center words used array slicing method

count = 0
#searching
for char in a:
  if char == b:
    count += 1
  
print('found this many', count, 'within string')   
    
#replacing string char
data.replace(b't', b'T')



b' in byte '
found this many 2 within string


b'STring in byTe formaT'