#### Cadenas Unicode de Python 3

In [3]:
def unicode_test(value):
    """Round-trip a single character through its Unicode name and print it.

    The character's name is obtained with ``unicodedata.name`` and then
    converted back to a character with ``unicodedata.lookup``. The original
    character, its name and the recovered character are printed on one line.

    Args:
        value: The character passed to ``unicodedata.name``.
    """
    from unicodedata import lookup, name as name_of

    char_name = name_of(value)
    # Going name -> character should give back the character we started with.
    recovered = lookup(char_name)
    template = 'value="%s", name="%s", value2="%s"'
    print(template % (value, char_name, recovered))

In [4]:
unicode_test('A')

value="A", name="LATIN CAPITAL LETTER A", value2="A"


In [5]:
unicode_test('$')

value="$", name="DOLLAR SIGN", value2="$"


In [6]:
unicode_test('\u00a2')

value="¢", name="CENT SIGN", value2="¢"


In [7]:
unicode_test('\u20ac')

value="€", name="EURO SIGN", value2="€"


In [8]:
unicode_test('\u2603')

value="☃", name="SNOWMAN", value2="☃"


In [9]:
place = 'café'

In [10]:
place

'café'

In [13]:
import unicodedata

In [14]:
unicodedata.name('\u00e9')

'LATIN SMALL LETTER E WITH ACUTE'

In [15]:
unicodedata.lookup('E WITH ACUTE, LATIN SMALL LETTER')

KeyError: "undefined character name 'E WITH ACUTE, LATIN SMALL LETTER'"

In [16]:
unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')

'é'

In [17]:
place = 'caf\u00e9'

In [18]:
place

'café'

In [19]:
place = 'caf\N{LATIN SMALL LETTER E WITH ACUTE}'

In [21]:
place

'café'

In [22]:
u_umlaut = '\N{LATIN SMALL LETTER U WITH DIAERESIS}'

In [23]:
u_umlaut

'ü'

In [24]:
drink = 'Gew' + u_umlaut + 'rztraminer'

In [25]:
print('Now I can finally have my', drink, 'in a', place)

Now I can finally have my Gewürztraminer in a café


In [26]:
len('$')

1

In [27]:
len('\U0001f47b')

1

In [28]:
chr(233)

'é'

In [29]:
chr(0xe9)

'é'

In [30]:
chr(0x1fc6)

'ῆ'

#### Codificar

In [31]:
snowman = '\u2603'

In [32]:
len(snowman)

1

In [33]:
ds = snowman.encode('utf-8')

In [34]:
len(ds)

3

In [35]:
ds

b'\xe2\x98\x83'

In [36]:
ds = snowman.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode character '\u2603' in position 0: ordinal not in range(128)

In [37]:
snowman.encode('ascii', 'ignore')

b''

In [38]:
snowman.encode('ascii', 'replace')

b'?'

In [39]:
snowman.encode('ascii', 'backslashreplace')

b'\\u2603'

In [40]:
snowman.encode('ascii', 'xmlcharrefreplace')

b'&#9731;'

#### Decodificar

In [1]:
place = 'caf\u00e9'

In [2]:
place

'café'

In [3]:
type(place)

str

In [4]:
place_bytes = place.encode('utf-8')

In [5]:
place_bytes

b'caf\xc3\xa9'

In [6]:
type(place_bytes)

bytes

In [7]:
place2 = place_bytes.decode('utf-8')

In [8]:
place2

'café'

In [9]:
place2 = place_bytes.decode('ascii')

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)

In [10]:
place4 = place_bytes.decode('latin-1')

In [11]:
place4

'cafÃ©'

In [12]:
place5 = place_bytes.decode('windows-1252')

In [13]:
place5

'cafÃ©'

#### Entidades HTML

In [14]:
import html

In [15]:
html.unescape('&egrave;')

'è'

In [16]:
html.unescape('&#233;')

'é'

In [17]:
html.unescape('&#xe9;')

'é'

In [18]:
from html.entities import html5

In [19]:
html5['egrave']

'è'

In [20]:
html5['egrave;']

'è'

In [21]:
char = '\u00e9'

In [22]:
dec_value = ord(char)

In [24]:
html.entities.codepoint2name[dec_value]

'eacute'

In [25]:
place = 'caf\u00e9'

In [26]:
byte_value = place.encode('ascii', 'xmlcharrefreplace')

In [27]:
byte_value

b'caf&#233;'

In [28]:
byte_value.decode()

'caf&#233;'

#### Normalización

In [1]:
eacute1 = 'é'

In [2]:
eacute2 = '\u00e9'

In [3]:
eacute3 = '\N{LATIN SMALL LETTER E WITH ACUTE}'

In [4]:
eacute4 = chr(233)

In [5]:
eacute5 = chr(0xe9)

In [6]:
eacute1, eacute2, eacute3, eacute4, eacute5

('é', 'é', 'é', 'é', 'é')

In [7]:
eacute1 == eacute2 == eacute3 == eacute4 == eacute5

True

In [8]:
import unicodedata

In [9]:
unicodedata.name(eacute1)

'LATIN SMALL LETTER E WITH ACUTE'

In [10]:
ord(eacute1)

233

In [11]:
0xe9

233

In [12]:
eacute_combined1 = "e\u0301"

In [13]:
eacute_combined2 = "e\N{COMBINING ACUTE ACCENT}"

In [14]:
eacute_combined3 = "e" + "\u0301"

In [15]:
eacute_combined1, eacute_combined2, eacute_combined3

('é', 'é', 'é')

In [16]:
eacute_combined1 == eacute_combined2 == eacute_combined3

True

In [17]:
len(eacute_combined1)

2

In [18]:
eacute1 == eacute_combined1

False

In [19]:
eacute_normalized = unicodedata.normalize('NFC', eacute_combined1)

In [20]:
len(eacute_normalized)

1

In [21]:
eacute_normalized == eacute1

True

In [22]:
unicodedata.name(eacute_normalized)

'LATIN SMALL LETTER E WITH ACUTE'