# TME6 : Classification de lettres manuscrites

## Format des données

In [1]:
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

# old version = python 2
# data = pkl.load(file("ressources/lettres.pkl","rb"))
# new :
with open('TME6_lettres.pkl', 'rb') as f:
    data = pkl.load(f, encoding='latin1')
X = np.array(data.get('letters')) # récupération des données sur les lettres
Y = np.array(data.get('labels')) # récupération des étiquettes associées 

In [2]:
X[0]

array([ 36.214493, 347.719116, 322.088898, 312.230957, 314.851013,
       315.487213, 313.556702, 326.534973, 141.288971, 167.606689,
       199.321594, 217.911087, 226.443298, 235.002472, 252.354492,
       270.045654, 291.665161, 350.934723,  17.892815,  20.281025,
        28.207161,  43.883423,  53.459026])

In [52]:
Y

array(['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b',
       'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c',
       'c', 'c', 'c', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'd',
       'd', 'd', 'd', 'd', 'd', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e',
       'e', 'e', 'e', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f',
       'f', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'h',
       'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'i', 'i', 'i',
       'i', 'i', 'i', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'j', 'j',
       'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'k', 'k', 'k', 'k', 'k',
       'k', 'l', 'l', 'l', 'l', 'l', 'l', 'l', 'l', 'l', 'l', 'm', 'm',
       'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o',
       'o', 'o', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'q',
       'q', 'q', 'q', 'q', 'q', 'q', 'q', 'q', 'q', 'r', 'r', 'r

In [33]:
# affichage d'une lettre
def tracerLettre(let):
    a = -let*np.pi/180; # conversion en rad
    coord = np.array([[0, 0]]); # point initial
    for i in range(len(a)):
        x = np.array([[1, 0]]);
        rot = np.array([[np.cos(a[i]), -np.sin(a[i])],[ np.sin(a[i]),np.cos(a[i])]])
        xr = x.dot(rot) # application de la rotation
        coord = np.vstack((coord,xr+coord[-1,:]))
    plt.figure()
    plt.plot(coord[:,0],coord[:,1])
    plt.savefig("exlettre.png")
    return

In [32]:
tracerLettre(X[0])

## Apprentissage d'un modèle CM (max de vraisemblance)

### 1. Discrétisation

In [5]:
def discretise(X,d):
    intervalle = 360/d
    Xd = []
    for x in X:
        Xd.append(np.floor(x/intervalle))
    return np.array(Xd)

In [6]:
d = 3
Xd = discretise(X,d)
Xd[0]

array([0., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 0., 0., 0., 0., 0.])

### 2. Regrouper les indices des signaux par classe (pour faciliter l'apprentissage)

In [7]:
def groupByLabel(y):
    index = []
    for i in np.unique(y): # pour toutes les classes
        ind, = np.where(y==i)
        index.append(ind)
    return index

In [8]:
indParClasse = groupByLabel(Y)
indParClasse

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
 array([22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]),
 array([33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]),
 array([44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]),
 array([55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]),
 array([66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]),
 array([77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87]),
 array([88, 89, 90, 91, 92, 93, 94, 95, 96, 97]),
 array([ 98,  99, 100, 101, 102, 103, 104, 105, 106, 107]),
 array([108, 109, 110, 111, 112, 113, 114, 115, 116, 117]),
 array([118, 119, 120, 121, 122, 123, 124, 125, 126, 127]),
 array([128, 129, 130, 131, 132, 133, 134, 135, 136, 137]),
 array([138, 139, 140, 141, 142, 143, 144, 145, 146, 147]),
 array([148, 149, 150, 151, 152, 153, 154, 155, 156, 157]),
 array([158, 159, 160, 161, 162, 163, 164, 165, 166, 167]),
 array([168, 169, 170, 171, 172, 173, 174, 175, 176, 177]),
 array([178, 179, 180, 181, 182, 183, 

### 3. Apprendre les modèles CM

In [163]:
def learnMarkovModel(Xc, d):
    A = np.ones((d,d))
    Pi = np.ones(d)
    for x in Xc:
        Pi[int(x[0])] += 1
        for i in range(len(x)-1):
            A[int(x[i])][int(x[i+1])] += 1
    A = A/np.maximum(A.sum(1).reshape(d,1),1) # normalisation
    Pi = Pi/Pi.sum()
    return Pi,A

In [50]:
d = 3
Xd = discretise(X,d)
index = groupByLabel(Y)
Xc = Xd[index[0]]
learnMarkovModel(Xc,d)

(array([0.36363636, 0.        , 0.63636364]),
 array([[0.84444444, 0.06666667, 0.08888889],
        [0.        , 0.83333333, 0.16666667],
        [0.11382114, 0.06504065, 0.82113821]]))

### 4. Stocker les modèles dans une liste 

In [35]:
d = 3     # paramètre de discrétisation
Xd = discretise(X,d)  # application de la discrétisation
index = groupByLabel(Y)  # groupement des signaux par classe
models = []
for cl in range(len(np.unique(Y))): # parcours de toutes les classes et optimisation des modèles
    models.append(learnMarkovModel(Xd[index[cl]], d))

## Test (affectation dans les classes sur critère MV)

### 1. (log)Probabilité d'une séquence dans un modèle

In [189]:
def probaSequence(s,Pi,A):
    ll = np.log(Pi[int(s[0])])
    for i in range(len(s)-1):
        ll += np.log(A[int(s[i])][int(s[i+1])])
    return ll

In [190]:
np.array([probaSequence(Xd[0], models[cl][0], models[cl][1]) for cl in range(len(np.unique(Y)))])

array([-51.15111929, -71.49240593, -55.61105131, -54.80391078,
       -47.3983488 , -66.33841105, -66.13185435, -64.76456372,
       -67.38815483, -68.85064341, -65.35853317, -67.18964465,
       -61.58983381, -61.17414325, -55.54049973, -68.76307287,
       -63.64333695, -68.52324453, -62.8097373 , -67.73337739,
       -58.78327693, -70.19722024, -63.65070301, -59.9505533 ,
       -64.1080762 , -63.51830056])

Ce signal n'est pas bien classé.
Les -inf viennent du fait que l'on tente de calculer des log(0) (indéfini).

### 2. Application de la méthode précédente pour tous les signaux et tous les modèles de lettres

In [191]:
proba_3_etats = np.array([[probaSequence(Xd[i], models[cl][0], models[cl][1]) for i in range(len(Xd))]for cl in range(len(np.unique(Y)))])

In [192]:
d = 20     # paramètre de discrétisation
Xd = discretise(X,d)  # application de la discrétisation
index = groupByLabel(Y)  # groupement des signaux par classe
models = []
for cl in range(len(np.unique(Y))): # parcours de toutes les classes et optimisation des modèles
    models.append(learnMarkovModel(Xd[index[cl]], d))

In [193]:
proba_20_etats = np.array([[probaSequence(Xd[i], models[cl][0], models[cl][1]) for i in range(len(Xd))]for cl in range(len(np.unique(Y)))])

### 3. Evaluation des performances

In [194]:
Ynum = np.zeros(Y.shape)
for num,char in enumerate(np.unique(Y)):
    Ynum[Y==char] = num
pred_3_etats = proba_3_etats.argmax(0) # max colonne par colonne
print(np.where(pred_3_etats != Ynum, 0.,1.).mean())
pred_20_etats = proba_20_etats.argmax(0) # max colonne par colonne
print(np.where(pred_20_etats != Ynum, 0.,1.).mean())

0.7649253731343284
0.8395522388059702


## Biais d'évaluation, notion de sur-apprentissage

In [195]:
# separation app/test, pc=ratio de points en apprentissage
def separeTrainTest(y, pc):
    indTrain = []
    indTest = []
    for i in np.unique(y): # pour toutes les classes
        ind, = np.where(y==i)
        n = len(ind)
        indTrain.append(ind[np.random.permutation(n)][:int(np.floor(pc*n))])
        indTest.append(np.setdiff1d(ind, indTrain[-1]))
    return indTrain, indTest
# exemple d'utilisation
itrain,itest = separeTrainTest(Y,0.8)

In [196]:
itrain

[array([ 2,  9,  6,  8,  5,  4,  1, 10]),
 array([18, 20, 21, 14, 12, 17, 13, 19]),
 array([32, 23, 31, 28, 29, 22, 26, 27]),
 array([42, 33, 36, 39, 38, 35, 40, 37]),
 array([51, 50, 46, 54, 48, 52, 49, 44]),
 array([65, 56, 59, 58, 57, 55, 60, 62]),
 array([71, 74, 73, 66, 76, 75, 68, 70]),
 array([80, 83, 87, 86, 77, 81, 79, 82]),
 array([92, 93, 91, 96, 94, 95, 97, 88]),
 array([107, 106, 104, 105,  99, 102, 101,  98]),
 array([117, 116, 110, 109, 114, 113, 112, 111]),
 array([119, 127, 123, 120, 125, 121, 126, 124]),
 array([133, 132, 129, 135, 130, 137, 131, 134]),
 array([139, 141, 147, 143, 142, 145, 140, 146]),
 array([155, 157, 149, 156, 151, 154, 150, 153]),
 array([167, 163, 164, 165, 166, 160, 158, 159]),
 array([174, 173, 170, 175, 177, 176, 169, 172]),
 array([186, 179, 180, 182, 187, 184, 178, 183]),
 array([188, 192, 194, 190, 197, 193, 189, 195]),
 array([204, 200, 206, 207, 199, 202, 198, 201]),
 array([212, 209, 211, 213, 216, 208, 210, 217]),
 array([218, 226, 227,

In [197]:
itest

[array([0, 3, 7]),
 array([11, 15, 16]),
 array([24, 25, 30]),
 array([34, 41, 43]),
 array([45, 47, 53]),
 array([61, 63, 64]),
 array([67, 69, 72]),
 array([78, 84, 85]),
 array([89, 90]),
 array([100, 103]),
 array([108, 115]),
 array([118, 122]),
 array([128, 136]),
 array([138, 144]),
 array([148, 152]),
 array([161, 162]),
 array([168, 171]),
 array([181, 185]),
 array([191, 196]),
 array([203, 205]),
 array([214, 215]),
 array([223, 224]),
 array([233, 235]),
 array([238, 247]),
 array([251, 253]),
 array([262, 267])]

In [198]:
d = 3     # paramètre de discrétisation
Xd = discretise(X,d)  # application de la discrétisation
index = groupByLabel(Y)  # groupement des signaux par classe
models = []
for cl in range(len(np.unique(Y))): # parcours de toutes les classes et optimisation des modèles
    models.append(learnMarkovModel(Xd[itrain[cl]], d))

In [199]:
ia = []
for i in itrain:
    ia += i.tolist()    
it = []
for i in itest:
    it += i.tolist()

In [200]:
proba_3_etats = np.array([[probaSequence(Xd[i], models[cl][0], models[cl][1]) for i in it]for cl in range(len(np.unique(Y)))])

In [201]:
d = 20     # paramètre de discrétisation
Xd = discretise(X,d)  # application de la discrétisation
index = groupByLabel(Y)  # groupement des signaux par classe
models = []
for cl in range(len(np.unique(Y))): # parcours de toutes les classes et optimisation des modèles
    models.append(learnMarkovModel(Xd[itrain[cl]], d))

In [202]:
proba_20_etats = np.array([[probaSequence(Xd[i], models[cl][0], models[cl][1]) for i in it]for cl in range(len(np.unique(Y)))])

In [203]:
pred_3_etats = proba_3_etats.argmax(0)

In [204]:
pred_3_etats

array([25,  3, 25,  1,  1,  1,  4,  2,  4,  3,  0,  0,  4,  4,  4, 16, 16,
       16, 18, 18, 18, 12, 12,  7,  8,  8,  9,  9,  8, 10,  8,  8, 12, 12,
       13, 12,  4, 14, 15, 15, 16, 16, 12, 20, 10, 18,  8,  8, 20, 20, 20,
       20, 22, 22,  8,  8,  3, 24, 25, 25])

In [205]:
pred_20_etats = proba_20_etats.argmax(0)

In [206]:
pred_20_etats

array([ 0, 16,  0, 15,  1,  1,  4, 14,  4,  0,  3,  3,  4, 14,  4, 16, 14,
       16,  6,  6,  2, 12, 12, 22, 17, 13,  7,  9, 12, 12, 12, 11, 12, 12,
       12, 12, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 10, 11, 22, 22, 12,
       21, 22, 22, 23, 23, 24, 24, 25, 25])

In [207]:
for i in range(len(pred_20_etats)):
    print('abcdefghijklmnopqrstuvwxyz'[pred_20_etats[i]],Y[it])

a ['a' 'a' 'a' 'b' 'b' 'b' 'c' 'c' 'c' 'd' 'd' 'd' 'e' 'e' 'e' 'f' 'f' 'f'
 'g' 'g' 'g' 'h' 'h' 'h' 'i' 'i' 'j' 'j' 'k' 'k' 'l' 'l' 'm' 'm' 'n' 'n'
 'o' 'o' 'p' 'p' 'q' 'q' 'r' 'r' 's' 's' 't' 't' 'u' 'u' 'v' 'v' 'w' 'w'
 'x' 'x' 'y' 'y' 'z' 'z']
q ['a' 'a' 'a' 'b' 'b' 'b' 'c' 'c' 'c' 'd' 'd' 'd' 'e' 'e' 'e' 'f' 'f' 'f'
 'g' 'g' 'g' 'h' 'h' 'h' 'i' 'i' 'j' 'j' 'k' 'k' 'l' 'l' 'm' 'm' 'n' 'n'
 'o' 'o' 'p' 'p' 'q' 'q' 'r' 'r' 's' 's' 't' 't' 'u' 'u' 'v' 'v' 'w' 'w'
 'x' 'x' 'y' 'y' 'z' 'z']
a ['a' 'a' 'a' 'b' 'b' 'b' 'c' 'c' 'c' 'd' 'd' 'd' 'e' 'e' 'e' 'f' 'f' 'f'
 'g' 'g' 'g' 'h' 'h' 'h' 'i' 'i' 'j' 'j' 'k' 'k' 'l' 'l' 'm' 'm' 'n' 'n'
 'o' 'o' 'p' 'p' 'q' 'q' 'r' 'r' 's' 's' 't' 't' 'u' 'u' 'v' 'v' 'w' 'w'
 'x' 'x' 'y' 'y' 'z' 'z']
p ['a' 'a' 'a' 'b' 'b' 'b' 'c' 'c' 'c' 'd' 'd' 'd' 'e' 'e' 'e' 'f' 'f' 'f'
 'g' 'g' 'g' 'h' 'h' 'h' 'i' 'i' 'j' 'j' 'k' 'k' 'l' 'l' 'm' 'm' 'n' 'n'
 'o' 'o' 'p' 'p' 'q' 'q' 'r' 'r' 's' 's' 't' 't' 'u' 'u' 'v' 'v' 'w' 'w'
 'x' 'x' 'y' 'y' 'z' 'z']
b ['a' 'a' '

In [208]:
Y[it]

array(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd', 'e',
       'e', 'e', 'f', 'f', 'f', 'g', 'g', 'g', 'h', 'h', 'h', 'i', 'i',
       'j', 'j', 'k', 'k', 'l', 'l', 'm', 'm', 'n', 'n', 'o', 'o', 'p',
       'p', 'q', 'q', 'r', 'r', 's', 's', 't', 't', 'u', 'u', 'v', 'v',
       'w', 'w', 'x', 'x', 'y', 'y', 'z', 'z'], dtype='<U1')

In [209]:
p = 0
for i in range(len(pred_3_etats)):
    if 'abcdefghijklmnopqrstuvwxyz'[pred_3_etats[i]] == Y[it][i]:
        p+=1
print(p/len(pred_3_etats))

0.5


In [210]:
p = 0
for i in range(len(pred_20_etats)):
    if 'abcdefghijklmnopqrstuvwxyz'[pred_20_etats[i]] == Y[it][i]:
        p+=1
print(p/len(pred_20_etats))

0.55


In [211]:
conf = np.zeros((26,26))
for i in range(len(pred_20_etats)):
    conf[pred_20_etats[i]]['abcdefghijklmnopqrstuvwxyz'.index(Y[it][i])] += 1
    
plt.figure()
plt.imshow(conf, interpolation='nearest')
plt.colorbar()
plt.xticks(np.arange(26),np.unique(Y))
plt.yticks(np.arange(26),np.unique(Y))
plt.xlabel(u'Vérité terrain')
plt.ylabel(u'Prédiction')
plt.savefig("mat_conf_lettres.png")