#### Cadenas Unicode de Python 3

In [3]:
def unicode_test(value):
    """Print a character, its Unicode name, and the character looked up by that name.

    Args:
        value: A one-character string to inspect.

    Raises:
        ValueError: Propagated from ``unicodedata.name`` when the character
            has no assigned name.
    """
    from unicodedata import lookup, name as name_of

    char_name = name_of(value)
    # Looking the name up again should give back the original character.
    round_trip = lookup(char_name)
    report = 'value="%s", name="%s", value2="%s"' % (value, char_name, round_trip)
    print(report)

In [4]:
unicode_test('A')

value="A", name="LATIN CAPITAL LETTER A", value2="A"


In [5]:
unicode_test('$')

value="$", name="DOLLAR SIGN", value2="$"


In [6]:
unicode_test('\u00a2')

value="¢", name="CENT SIGN", value2="¢"


In [7]:
unicode_test('\u20ac')

value="€", name="EURO SIGN", value2="€"


In [8]:
unicode_test('\u2603')

value="☃", name="SNOWMAN", value2="☃"


In [9]:
place = 'café'

In [10]:
place

'café'

In [13]:
import unicodedata

In [14]:
unicodedata.name('\u00e9')

'LATIN SMALL LETTER E WITH ACUTE'

In [15]:
unicodedata.lookup('E WITH ACUTE, LATIN SMALL LETTER')

KeyError: "undefined character name 'E WITH ACUTE, LATIN SMALL LETTER'"

In [16]:
unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')

'é'

In [17]:
place = 'caf\u00e9'

In [18]:
place

'café'

In [19]:
place = 'caf\N{LATIN SMALL LETTER E WITH ACUTE}'

In [21]:
place

'café'

In [22]:
u_umlaut = '\N{LATIN SMALL LETTER U WITH DIAERESIS}'

In [23]:
u_umlaut

'ü'

In [24]:
drink = 'Gew' + u_umlaut + 'rztraminer'

In [25]:
print('Now I can finally have my', drink, 'in a', place)

Now I can finally have my Gewürztraminer in a café


In [26]:
len('$')

1

In [27]:
len('\U0001f47b')

1

In [28]:
chr(233)

'é'

In [29]:
chr(0xe9)

'é'

In [30]:
chr(0x1fc6)

'ῆ'

#### Codificar

In [31]:
snowman = '\u2603'

In [32]:
len(snowman)

1

In [33]:
ds = snowman.encode('utf-8')

In [34]:
len(ds)

3

In [35]:
ds

b'\xe2\x98\x83'

In [36]:
ds = snowman.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode character '\u2603' in position 0: ordinal not in range(128)

In [37]:
snowman.encode('ascii', 'ignore')

b''

In [38]:
snowman.encode('ascii', 'replace')

b'?'

In [39]:
snowman.encode('ascii', 'backslashreplace')

b'\\u2603'

In [40]:
snowman.encode('ascii', 'xmlcharrefreplace')

b'&#9731;'

#### Decodificar

In [1]:
place = 'caf\u00e9'

In [2]:
place

'café'

In [3]:
type(place)

str

In [4]:
place_bytes = place.encode('utf-8')

In [5]:
place_bytes

b'caf\xc3\xa9'

In [6]:
type(place_bytes)

bytes

In [7]:
place2 = place_bytes.decode('utf-8')

In [8]:
place2

'café'

In [9]:
place2 = place_bytes.decode('ascii')

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)

In [10]:
place4 = place_bytes.decode('latin-1')

In [11]:
place4

'cafÃ©'

In [12]:
place5 = place_bytes.decode('windows-1252')

In [13]:
place5

'cafÃ©'

#### Entidades HTML

In [14]:
import html

In [15]:
html.unescape('&egrave;')

'è'

In [16]:
html.unescape('&#233;')

'é'

In [17]:
html.unescape('&#xe9;')

'é'

In [18]:
from html.entities import html5

In [19]:
html5['egrave']

'è'

In [20]:
html5['egrave;']

'è'

In [21]:
char = '\u00e9'

In [22]:
dec_value = ord(char)

In [24]:
html.entities.codepoint2name[dec_value]

'eacute'

In [25]:
place = 'caf\u00e9'

In [26]:
byte_value = place.encode('ascii', 'xmlcharrefreplace')

In [27]:
byte_value

b'caf&#233;'

In [28]:
byte_value.decode()

'caf&#233;'

#### Normalización

In [1]:
eacute1 = 'é'

In [2]:
eacute2 = '\u00e9'

In [3]:
eacute3 = '\N{LATIN SMALL LETTER E WITH ACUTE}'

In [4]:
eacute4 = chr(233)

In [5]:
eacute5 = chr(0xe9)

In [6]:
eacute1, eacute2, eacute3, eacute4, eacute5

('é', 'é', 'é', 'é', 'é')

In [7]:
eacute1 == eacute2 == eacute3 == eacute4 == eacute5

True

In [8]:
import unicodedata

In [9]:
unicodedata.name(eacute1)

'LATIN SMALL LETTER E WITH ACUTE'

In [10]:
ord(eacute1)

233

In [11]:
0xe9

233

In [12]:
eacute_combined1 = "e\u0301"

In [13]:
eacute_combined2 = "e\N{COMBINING ACUTE ACCENT}"

In [14]:
eacute_combined3 = "e" + "\u0301"

In [15]:
eacute_combined1, eacute_combined2, eacute_combined3

('é', 'é', 'é')

In [16]:
eacute_combined1 == eacute_combined2 == eacute_combined3

True

In [17]:
len(eacute_combined1)

2

In [18]:
eacute1 == eacute_combined1

False

In [19]:
eacute_normalized = unicodedata.normalize('NFC', eacute_combined1)

In [20]:
len(eacute_normalized)

1

In [21]:
eacute_normalized == eacute1

True

In [22]:
unicodedata.name(eacute_normalized)

'LATIN SMALL LETTER E WITH ACUTE'

#### Cadenas de texto: expresiones regulares

In [23]:
import re

In [24]:
result = re.match('You', 'Young Frankenstein')

In [25]:
result

<re.Match object; span=(0, 3), match='You'>

In [26]:
youpattern = re.compile('You')

In [27]:
result = youpattern.match('Young Frankenstein')

In [28]:
print(result)

<re.Match object; span=(0, 3), match='You'>


##### Encontrar coincidencia inicial exacta con match()

In [29]:
import re

In [None]:
source = 'Young Frankenstein'

In [31]:
m = re.match('You', source)

In [32]:
if m:
    print(m.group())

You


In [33]:
m = re.match('^You', source)

In [34]:
if m:
    print(m.group())

You


In [35]:
m = re.match('Frank', source)

In [36]:
if m:
    print(m.group())

In [37]:
if m := re.match('Frank', source):
    print(m.group())

In [38]:
if m := re.search('Frank', source):
    print(m.group())

Frank


In [39]:
if m := re.match('.*Frank', source):
    print(m.group())

Young Frank


##### Encontrar la primera coincidencia con search()

In [40]:
import re

In [41]:
source = 'Young Frankenstein'

In [42]:
# search() scans the whole string, so 'Frank' is found even though it is not at the start.
if m := re.search('Frank', source):
    print(m.group())

Frank


##### Buscar todas las coincidencias con findall()

In [43]:
import re

In [44]:
source = 'Young Frankenstein'

In [45]:
m = re.findall('n', source)

In [46]:
m

['n', 'n', 'n', 'n']

In [47]:
print('Found', len(m), 'matches')

Found 4 matches


In [48]:
m = re.findall('n.', source)

In [49]:
m

['ng', 'nk', 'ns']

In [50]:
m = re.findall('n.?', source)

In [51]:
m

['ng', 'nk', 'ns', 'n']

##### Dividir en las coincidencias con split()

In [52]:
import re

In [53]:
source = 'Young Frankenstein'

In [54]:
m = re.split('n', source)

In [55]:
m

['You', 'g Fra', 'ke', 'stei', '']

##### Reemplazar en coincidencias con sub()

In [56]:
import re

In [57]:
source = 'Young Frankenstein'

In [58]:
m = re.sub('n', '?', source)

In [59]:
m

'You?g Fra?ke?stei?'

#### Patrones: Caracteres especiales

In [60]:
import string

In [61]:
printable = string.printable

In [62]:
len(printable)

100

In [63]:
printable[0:50]

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN'

In [64]:
printable[50:]

'OPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [65]:
# Raw string: \d must reach the regex engine untouched; in a normal literal it is an invalid escape.
re.findall(r'\d', printable)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [66]:
# Raw string: \w must reach the regex engine untouched; in a normal literal it is an invalid escape.
re.findall(r'\w', printable)

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '_']

In [67]:
# Raw string: \s must reach the regex engine untouched; in a normal literal it is an invalid escape.
re.findall(r'\s', printable)

[' ', '\t', '\n', '\r', '\x0b', '\x0c']

In [68]:
x = 'abc' + '-/*' + '\u00ea' + '\u0115'

In [69]:
x

'abc-/*êĕ'

In [70]:
# Raw string: \w must reach the regex engine untouched; in a normal literal it is an invalid escape.
re.findall(r'\w', x)

['a', 'b', 'c', 'ê', 'ĕ']

#### Patrones: uso de especificadores

In [10]:
import re

In [11]:
source = '''I wish I may, I wish I might
Have a dish of fish tonight.'''

In [12]:
source

'I wish I may, I wish I might\nHave a dish of fish tonight.'

In [13]:
re.findall('wish', source)

['wish', 'wish']

In [14]:
re.findall('^wish', source)

[]

In [15]:
re.findall('^I wish', source)

['I wish']

In [16]:
re.findall('fish$', source)

[]

In [17]:
re.findall('fish tonight.$', source)

['fish tonight.']

In [18]:
re.findall('[wf]ish', source)

['wish', 'wish', 'fish']

In [19]:
re.findall('[wsh]+', source)

['w', 'sh', 'w', 'sh', 'h', 'sh', 'sh', 'h']

In [20]:
re.findall('I (?=wish)', source)

['I ', 'I ']

In [21]:
re.findall('(?<=I) wish', source)

[' wish', ' wish']

In [22]:
re.findall('\bfish', source)

[]

In [23]:
re.findall(r'\bfish', source)

['fish']

#### Patrones: Especificación de la salida de match()

In [29]:
import re

In [30]:
m = re.search(r'(. dish\b).*(\bfish)', source)

In [31]:
m.group()

'a dish of fish'

In [32]:
m.groups()

('a dish', 'fish')

In [33]:
m = re.search(r'(?P<DISH>. dish\b).*(?P<FISH>\bfish)', source)

In [34]:
m.group()

'a dish of fish'

In [35]:
m.groups()

('a dish', 'fish')

In [36]:
m.group('DISH')

'a dish'

In [37]:
m.group('FISH')

'fish'

### Datos binarios

In [7]:
blist = [1, 2, 3, 255]

In [8]:
the_bytes = bytes(blist)

In [9]:
the_bytes

b'\x01\x02\x03\xff'

In [10]:
the_byte_array = bytearray(blist)

In [11]:
the_byte_array

bytearray(b'\x01\x02\x03\xff')

In [12]:
b'\x61'

b'a'

In [13]:
b'\x01abc\xff'

b'\x01abc\xff'

In [14]:
blist = [1, 2, 3, 255]

In [15]:
the_bytes = bytes(blist)

In [16]:
the_bytes[1] = 127

TypeError: 'bytes' object does not support item assignment

In [17]:
blist = [1, 2, 3, 255]

In [18]:
the_byte_array = bytearray(blist)

In [20]:
the_byte_array

bytearray(b'\x01\x02\x03\xff')

In [21]:
the_byte_array[1] = 127

In [22]:
the_byte_array

bytearray(b'\x01\x7f\x03\xff')

In [23]:
the_bytes = bytes(range(0, 256))

In [24]:
the_byte_array = bytearray(range(0, 256))

In [25]:
the_bytes

b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'

#### Convertir datos binarios con estructuras

In [26]:
import struct

In [27]:
valid_png_header = b'\x89PNG\r\n\x1a\n'

In [28]:
data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR' + b'\x00\x00\x00\x9a\x00\x00\x00\x8d\x08\x02\x00\x00\x00\xc0'

In [29]:
# Compare the first 8 bytes against the expected PNG signature.
if data[:8] != valid_png_header:
    print('Not a valid PNG')
else:
    # '>LL' reads bytes 16-23 as two big-endian unsigned 32-bit integers.
    width, height = struct.unpack('>LL', data[16:24])
    print('Valid PNG, width', width, 'height', height)

Valid PNG, width 154 height 141


In [30]:
data[16:20]

b'\x00\x00\x00\x9a'

In [32]:
data[20:24]

b'\x00\x00\x00\x8d'

In [33]:
0x9a

154

In [34]:
0x8d

141

In [35]:
struct.pack('>L', 154)

b'\x00\x00\x00\x9a'

In [36]:
struct.pack('>L', 141)

b'\x00\x00\x00\x8d'

In [37]:
struct.unpack('>2L', data[16:24])

(154, 141)

In [38]:
struct.unpack('>16x2L6x', data)

(154, 141)