## 4.1 문자 문제

In [1]:
# A str is a sequence of Unicode code points, so len() counts characters, not bytes.
s = 'café'
len(s)

4

In [2]:
# Encode the str to bytes with UTF-8; the repr below shows 'é' stored as the two bytes \xc3\xa9.
b = s.encode('utf-8')
b

b'caf\xc3\xa9'

In [3]:
len(b)

5

In [4]:
b.decode('utf-8')

'café'

---

## 4.2 바이트에 대한 기본 지식

In [5]:
# Build bytes directly from a str by naming the codec. 'utf-8-sig' writes the
# 3-byte signature (BOM) \xef\xbb\xbf in front of the UTF-8 encoded text.
cafe = bytes('café', encoding='utf-8-sig')
cafe

b'\xef\xbb\xbfcaf\xc3\xa9'

In [6]:
cafe[0]

239

In [7]:
cafe[:1]

b'\xef'

In [8]:
# bytearray() copies the bytes object into a mutable sequence with the same content.
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'\xef\xbb\xbfcaf\xc3\xa9')

In [9]:
cafe_arr[-1:]

bytearray(b'\xa9')

---

In [10]:
import array

# Typecode 'h' stores each item as a signed short integer.
numbers = array.array('h', range(-2, 3))

# bytes() copies the array's raw memory; the output shows two bytes per item.
octets = bytes(numbers)
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

---

## 4.3 기본 인코더

In [13]:
# Encode the same text with each codec to compare the resulting byte sequences.
for codec in ('latin_1', 'utf_8', 'utf_16', 'utf_8_sig'):
    encoded = 'caffè latte'.encode(codec)
    print(codec, encoded, sep='\t')

latin_1	b'caff\xe8 latte'
utf_8	b'caff\xc3\xa8 latte'
utf_16	b'\xff\xfec\x00a\x00f\x00f\x00\xe8\x00 \x00l\x00a\x00t\x00t\x00e\x00'
utf_8_sig	b'\xef\xbb\xbfcaff\xc3\xa8 latte'


---

## 4.4 인코딩 / 디코딩 문제 이해하기

In [14]:
# Sample text containing several non-ASCII characters, reused by the encoding cells below.
coffee = "“Herr Voß: • ½ cup of Œtker™ caffè latte"
coffee.encode('utf-8')

b'\xe2\x80\x9cHerr Vo\xc3\x9f: \xe2\x80\xa2 \xc2\xbd cup of \xc5\x92tker\xe2\x84\xa2 caff\xc3\xa8 latte'

In [15]:
coffee.encode('utf-16')

b'\xff\xfe\x1c H\x00e\x00r\x00r\x00 \x00V\x00o\x00\xdf\x00:\x00 \x00"  \x00\xbd\x00 \x00c\x00u\x00p\x00 \x00o\x00f\x00 \x00R\x01t\x00k\x00e\x00r\x00"! \x00c\x00a\x00f\x00f\x00\xe8\x00 \x00l\x00a\x00t\x00t\x00e\x00'

In [16]:
coffee.encode('utf-8-sig')

b'\xef\xbb\xbf\xe2\x80\x9cHerr Vo\xc3\x9f: \xe2\x80\xa2 \xc2\xbd cup of \xc5\x92tker\xe2\x84\xa2 caff\xc3\xa8 latte'

In [17]:
coffee.encode('cp437', errors='replace')

b'?Herr Vo\xe1: ? \xab cup of ?tker? caff\x8a latte'

In [20]:
list(coffee.encode('utf-8-sig'))[:10]

[239, 187, 191, 226, 128, 156, 72, 101, 114, 114]

---

## 4.6 제대로 비교하기 위해 유니코드 정규화하기

In [21]:
# s1 uses a single 'é' character; s2 spells it as 'e' followed by U+0301
# (combining acute accent). Both strings look the same when displayed.
s1 = "café"
s2 = 'cafe\u0301'
s1, s2

('café', 'café')

In [22]:
len(s1), len(s2)

(4, 5)

In [23]:
s1 == s2

False

---

In [24]:
from unicodedata import normalize

# Composed form (one 'é' character) versus decomposed form ('e' + U+0301):
# the lengths differ even though the text looks identical.
s1 = "café"
s2 = 'cafe\u0301'
len(s1), len(s2)

(4, 5)

In [26]:
len(normalize('NFC', s1)), len(normalize('NFC', s2))

(4, 4)

In [27]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)

In [28]:
normalize('NFC', s1) == normalize('NFC', s2)

True

---

In [29]:
# NFKC also replaces compatibility characters: the one-character fraction is
# expanded to the three-character sequence shown in the output.
half = "½"
normalize('NFKC', half)

'1⁄2'

---

## 4.7 유니코드 텍스트 정렬하기

In [34]:
import locale

# Collation rules are provided by the operating system, so the requested
# locale may not be installed on the machine running this notebook.
try:
    my_locale = locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
except locale.Error:
    # Keep the current collation locale instead of aborting; the sort in the
    # next cell then will not follow Brazilian Portuguese rules.
    my_locale = locale.setlocale(locale.LC_COLLATE)
    print('warning: locale pt_BR.UTF-8 is not installed; keeping', my_locale)
print(my_locale)

pt_BR.UTF-8


In [35]:
# Fruit names with accented letters, used to demonstrate locale-aware sorting.
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']

# locale.strxfrm converts each name into a key that compares according to the
# active LC_COLLATE setting; sort a copy so the original list keeps its order.
sorted_fruits = list(fruits)
sorted_fruits.sort(key=locale.strxfrm)
print(sorted_fruits)

['açaí', 'acerola', 'atemoia', 'cajá', 'caju']


---

## 4.8 유니코드 데이터베이스

In [36]:
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'


def _mark(condition, label):
    """Return *label* when *condition* is truthy, otherwise the '-' placeholder."""
    return label if condition else '-'


# Print one tab-separated row per character of the sample.
for char in sample:
    row = (
        f'U+{ord(char):04x}',                       # code point in U+0000 format
        char.center(6),                             # character centered in a str of length 6
        _mark(re_digit.match(char), 're_dig'),      # re_dig if the character matches r'\d'
        _mark(char.isdigit(), 'isdig'),             # isdig if char.isdigit() is true
        _mark(char.isnumeric(), 'isnum'),           # isnum if char.isnumeric() is true
        f'{unicodedata.numeric(char):5.2f}',        # numeric value, width 5 with 2 decimal places
        unicodedata.name(char),                     # Unicode character name
    )
    print(*row, sep='\t')

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


---

## 4.9 이중 모드 str 및 bytes API

In [37]:
import re

# The first two patterns are compiled from str, so they match Unicode text.
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')

# The last two patterns are compiled from bytes, so they search byte strings.
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Text to search: Tamil digits followed by ASCII digits and superscript threes.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

# The same text as a UTF-8 byte string, for the bytes patterns.
text_bytes = text_str.encode('utf_8')

print(f'Text\n  {text_str!r}')
# Run each str/bytes pattern pair against the matching representation of the text.
for heading, str_pattern, bytes_pattern in (
    ('Numbers', re_numbers_str, re_numbers_bytes),
    ('Words', re_words_str, re_words_bytes),
):
    print(heading)
    print('  str  :', str_pattern.findall(text_str))
    print('  bytes:', bytes_pattern.findall(text_bytes))

Text
  'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']
