## **L'encodage**

In [1]:
# fixer le codage des caractères dans l'entête de script
# -*- coding: utf-8 -*-

# L'encodage - afficher tous les codages qui existes
# Importer les bibliothèques sys et encodings
import sys
import encodings

# get encoding sets
codages = encodings.aliases.aliases.values()

# éliminer les doubles
codages = set(codages)

# trier les éléments de la liste
codages = sorted(codages)

# commande join permet de mettre un séparateur entre les éléments 
print("\n".join(codages))

ascii
base64_codec
big5
big5hkscs
bz2_codec
cp037
cp1026
cp1125
cp1140
cp1250
cp1251
cp1252
cp1253
cp1254
cp1255
cp1256
cp1257
cp1258
cp273
cp424
cp437
cp500
cp775
cp850
cp852
cp855
cp857
cp858
cp860
cp861
cp862
cp863
cp864
cp865
cp866
cp869
cp932
cp949
cp950
euc_jis_2004
euc_jisx0213
euc_jp
euc_kr
gb18030
gb2312
gbk
hex_codec
hp_roman8
hz
iso2022_jp
iso2022_jp_1
iso2022_jp_2
iso2022_jp_2004
iso2022_jp_3
iso2022_jp_ext
iso2022_kr
iso8859_10
iso8859_11
iso8859_13
iso8859_14
iso8859_15
iso8859_16
iso8859_2
iso8859_3
iso8859_4
iso8859_5
iso8859_6
iso8859_7
iso8859_8
iso8859_9
johab
koi8_r
kz1048
latin_1
mac_cyrillic
mac_greek
mac_iceland
mac_latin2
mac_roman
mac_turkish
mbcs
ptcp154
quopri_codec
rot_13
shift_jis
shift_jis_2004
shift_jisx0213
tactis
tis_620
utf_16
utf_16_be
utf_16_le
utf_32
utf_32_be
utf_32_le
utf_7
utf_8
uu_codec
zlib_codec


In [2]:
# deux fonctions de Python concernent le codage Unicode des caractères : chr et ord
print(chr(97))
print(chr(237))
print(chr(8364))
print(ord('A'))
print(ord('é'))
print(ord('€'))

a
í
€
65
233
8364


In [3]:
# en Python 3 il existe deux types de chaine de caractère : le type str (unicode) et le type bytes(octets)
ch = 'Bonjour'
print(type(ch)) # str 
bt = b'Bonsoir'
print(type(bt)) # bytes 

<class 'str'>
<class 'bytes'>


In [4]:
# SyntaxError: bytes can only contain ASCII literal characters.
bt_2 = b'مرحبا'

SyntaxError: ignored

In [6]:
# solution 1 – fonction bytes pour convertir le str en bytes
ch_2 = 'مرحبا'
bt_2 = bytes(ch_2, 'utf-8')
print(type(bt_2))

<class 'bytes'>


In [7]:
# solution 2 - encoder str en bytes – les # caractères en bytes sont affichés comme s’ils sont des caractères # encodés en ASCII
before = "Hello €"
after = before.encode("utf-8")
print(after, type(after))

b'Hello \xe2\x82\xac' <class 'bytes'>


In [9]:
# reconvertir le bytes vers le bon encodage str utf-8 – pas d’erreur
print(after.decode("utf-8"))

Hello €


In [10]:
# essayer de décoder bytes avec l’encodage ascii - UnicodeDecodeError
print(after.decode("ascii"))

UnicodeDecodeError: ignored

In [11]:
# exemple 2 - encoder de l’ascii vers bytes en remplaçant les 
# caracatères non ascii en caractères ascii aléatoires
before = "This is the euro symbol: €"
after = before.encode("ascii", errors = "replace")
print(after)
# reconvertir/décoder vers le str utf-8
print(after.decode("ascii")) # perte du caractère € dans la chaine originale, remplacé par le ?, car non représenté dans l’ascii

b'This is the euro symbol: ?'
This is the euro symbol: ?


In [12]:
# afficher des formules chimiques
print("The chemical formula of water is H\u2082O.Water dissociates into H\u207A and OH\u207B")

The chemical formula of water is H₂O.Water dissociates into H⁺ and OH⁻


In [13]:
# afficher les caractères arabe en unicode dans l’intervalle
for i in range(0x0600, 0x06ff):
    print(chr(i))

؀
؁
؂
؃
؄
؅
؆
؇
؈
؉
؊
؋
،
؍
؎
؏
ؐ
ؑ
ؒ
ؓ
ؔ
ؕ
ؖ
ؗ
ؘ
ؙ
ؚ
؛
؜
؝
؞
؟
ؠ
ء
آ
أ
ؤ
إ
ئ
ا
ب
ة
ت
ث
ج
ح
خ
د
ذ
ر
ز
س
ش
ص
ض
ط
ظ
ع
غ
ػ
ؼ
ؽ
ؾ
ؿ
ـ
ف
ق
ك
ل
م
ن
ه
و
ى
ي
ً
ٌ
ٍ
َ
ُ
ِ
ّ
ْ
ٓ
ٔ
ٕ
ٖ
ٗ
٘
ٙ
ٚ
ٛ
ٜ
ٝ
ٞ
ٟ
٠
١
٢
٣
٤
٥
٦
٧
٨
٩
٪
٫
٬
٭
ٮ
ٯ
ٰ
ٱ
ٲ
ٳ
ٴ
ٵ
ٶ
ٷ
ٸ
ٹ
ٺ
ٻ
ټ
ٽ
پ
ٿ
ڀ
ځ
ڂ
ڃ
ڄ
څ
چ
ڇ
ڈ
ډ
ڊ
ڋ
ڌ
ڍ
ڎ
ڏ
ڐ
ڑ
ڒ
ړ
ڔ
ڕ
ږ
ڗ
ژ
ڙ
ښ
ڛ
ڜ
ڝ
ڞ
ڟ
ڠ
ڡ
ڢ
ڣ
ڤ
ڥ
ڦ
ڧ
ڨ
ک
ڪ
ګ
ڬ
ڭ
ڮ
گ
ڰ
ڱ
ڲ
ڳ
ڴ
ڵ
ڶ
ڷ
ڸ
ڹ
ں
ڻ
ڼ
ڽ
ھ
ڿ
ۀ
ہ
ۂ
ۃ
ۄ
ۅ
ۆ
ۇ
ۈ
ۉ
ۊ
ۋ
ی
ۍ
ێ
ۏ
ې
ۑ
ے
ۓ
۔
ە
ۖ
ۗ
ۘ
ۙ
ۚ
ۛ
ۜ
۝
۞
۟
۠
ۡ
ۢ
ۣ
ۤ
ۥ
ۦ
ۧ
ۨ
۩
۪
۫
۬
ۭ
ۮ
ۯ
۰
۱
۲
۳
۴
۵
۶
۷
۸
۹
ۺ
ۻ
ۼ
۽
۾


In [14]:
# afficher les caractères tifinagh en unicode dans l’intervalle
for i in range(0x2d30,0x2d6f):
    print(chr(i))

ⴰ
ⴱ
ⴲ
ⴳ
ⴴ
ⴵ
ⴶ
ⴷ
ⴸ
ⴹ
ⴺ
ⴻ
ⴼ
ⴽ
ⴾ
ⴿ
ⵀ
ⵁ
ⵂ
ⵃ
ⵄ
ⵅ
ⵆ
ⵇ
ⵈ
ⵉ
ⵊ
ⵋ
ⵌ
ⵍ
ⵎ
ⵏ
ⵐ
ⵑ
ⵒ
ⵓ
ⵔ
ⵕ
ⵖ
ⵗ
ⵘ
ⵙ
ⵚ
ⵛ
ⵜ
ⵝ
ⵞ
ⵟ
ⵠ
ⵡ
ⵢ
ⵣ
ⵤ
ⵥ
ⵦ
ⵧ
⵨
⵩
⵪
⵫
⵬
⵭
⵮


In [15]:
# afficher les caractères arabe en unicode avec leurs noms
import unicodedata
for i in range(0x0627,0x06ff):
    try:
        print(i, unicodedata.name(chr(i)))
    except:
        name ="no name"

1575 ARABIC LETTER ALEF
1576 ARABIC LETTER BEH
1577 ARABIC LETTER TEH MARBUTA
1578 ARABIC LETTER TEH
1579 ARABIC LETTER THEH
1580 ARABIC LETTER JEEM
1581 ARABIC LETTER HAH
1582 ARABIC LETTER KHAH
1583 ARABIC LETTER DAL
1584 ARABIC LETTER THAL
1585 ARABIC LETTER REH
1586 ARABIC LETTER ZAIN
1587 ARABIC LETTER SEEN
1588 ARABIC LETTER SHEEN
1589 ARABIC LETTER SAD
1590 ARABIC LETTER DAD
1591 ARABIC LETTER TAH
1592 ARABIC LETTER ZAH
1593 ARABIC LETTER AIN
1594 ARABIC LETTER GHAIN
1595 ARABIC LETTER KEHEH WITH TWO DOTS ABOVE
1596 ARABIC LETTER KEHEH WITH THREE DOTS BELOW
1597 ARABIC LETTER FARSI YEH WITH INVERTED V
1598 ARABIC LETTER FARSI YEH WITH TWO DOTS ABOVE
1599 ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
1600 ARABIC TATWEEL
1601 ARABIC LETTER FEH
1602 ARABIC LETTER QAF
1603 ARABIC LETTER KAF
1604 ARABIC LETTER LAM
1605 ARABIC LETTER MEEM
1606 ARABIC LETTER NOON
1607 ARABIC LETTER HEH
1608 ARABIC LETTER WAW
1609 ARABIC LETTER ALEF MAKSURA
1610 ARABIC LETTER YEH
1611 ARABIC FATHATAN
16

## **Segmentation - Tokenization**

In [16]:
# 1 - Tester la fonction tokenize() simple - Split text into words
def tokenize(text):
    # diviser la ligne par les espaces
    list_word = text.split(" ")
    return list_word

text = "I'm Very Hungry, I want to eat something."
tokens = tokenize(text);
print(tokens)    

["I'm", 'Very', 'Hungry,', 'I', 'want', 'to', 'eat', 'something.']


In [17]:
# 2 - Plusieurs implémentation de la fonction tokenize() en utilisant le module re (regular expression)
import re

# tokenize par des exp-reg simples
def tokenize_regex_punct(text):
    tokens = re.split("[.,:; ]+", text)
    return tokens

# tokenize par des exp-reg simples, en gardant la ponctuation
def tokenize_regex_punct_keep(text):
    tokens = re.split("([.,:; ]+)", text)
    return tokens

# tokenize par des expression régulière
def tokenize_regex(text):
    tokens = re.split("\W+", text)
    return tokens

# tokenize par des expression régulière, en gardant la ponctuation
def tokenize_regex_keep_punct(text):
    tokens = re.split("(\W+)", text)
    return tokens

In [18]:
text = "I'm Very Hungry, I want to eat something. United Kingdom."

In [19]:
tokens = tokenize_regex_punct(text);
print(tokens)

["I'm", 'Very', 'Hungry', 'I', 'want', 'to', 'eat', 'something', 'United', 'Kingdom', '']


In [20]:
tokens = tokenize_regex_punct_keep(text);
print(tokens) 

["I'm", ' ', 'Very', ' ', 'Hungry', ', ', 'I', ' ', 'want', ' ', 'to', ' ', 'eat', ' ', 'something', '. ', 'United', ' ', 'Kingdom', '.', '']


In [21]:
tokens = tokenize_regex(text);
print(tokens) 

['I', 'm', 'Very', 'Hungry', 'I', 'want', 'to', 'eat', 'something', 'United', 'Kingdom', '']


In [22]:
tokens = tokenize_regex_keep_punct(text);
print(tokens)

['I', "'", 'm', ' ', 'Very', ' ', 'Hungry', ', ', 'I', ' ', 'want', ' ', 'to', ' ', 'eat', ' ', 'something', '. ', 'United', ' ', 'Kingdom', '.', '']
