# Decision theory project

## Tasks 3: Supervised classification using evidential KNN (application to plastics sorting problem)

### a. Identify the utility function.

For this question, we keep the same utility function than the Task 2.

In [2]:
import pandas as pd
import numpy as np

u = [[1,0.6,0,0],
     [0.6,1,0,0],
     [0.5,0.5,0.8,0.4],
     [0.5,0.5,0.4,0.8]]

# list of the classes
classe = ['ABS', 'HiPS', 'PE', 'PP']

utility_matrix = pd.DataFrame(u, columns=classe)

In [3]:
utility_matrix.head()

Unnamed: 0,ABS,HiPS,PE,PP
0,1.0,0.6,0.0,0.0
1,0.6,1.0,0.0,0.0
2,0.5,0.5,0.8,0.4
3,0.5,0.5,0.4,0.8


### b. Implement eknn.


In [4]:
# Create train and test datasets
from sklearn.model_selection import train_test_split

data = pd.read_csv("./PlasticsTrain.csv", sep=";")

y = data["class"]
X = data.drop(["class", "line", "column", "object"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

Then, the dist_matrix is created : it permits to know all the Euclidean distances between X_i and X_j

In [5]:
from scipy.spatial.distance import pdist, squareform

# Distances
newDF = pd.concat([X_train, X_test])

# Convertir toutes les colonnes en float
newDF = newDF.replace(',', '.', regex=True)
newDF = newDF.astype(float)

distances = pdist(newDF.values, metric='euclidean')
dist_matrix = squareform(distances)

In [6]:
'''
    fit_gamma permits to get the list of the coefficients [gamma_1, ..., gamma_n]
    coefficients are fitted with the training dataset
'''
def fit_gamma(train_label, dist, classes):
    gamma = []
    nbC = len(classes)
    for i in range(nbC):
        C_indices = [t for t in range(len(train_label)) if (train_label[t] == classes[i])]
        C_dist = []
        for ci in range(len(C_indices)):
            for cj in range(ci + 1, len(C_indices)):
                C_dist.append(dist[C_indices[ci]][C_indices[cj]])
        gamma.append(1 / np.mean(C_dist))
    return gamma

In [7]:
# Observing the result
coeff_gamma = fit_gamma(y_train.values, dist_matrix, classe)
print(coeff_gamma)

[0.38275251219058143, 0.37140315464252055, 0.21394188893720167, 0.19131978610289185]


The parameter phi = [phi_1, ..., phi_n] is calculated thanks to the choice : phi_i(d) = alpha exp(-gamma *d ^ 2).

In [8]:
#we fix alpha = 0.95
def phi(gamma, d, alpha=0.95):
    return [alpha*np.exp(-gamma[i]*d**2) for i in range(len(gamma))]

In [9]:
''' 
    voisins is a function which returns the index of the k nearest neighbours from train dataset of a vector X. 
'''

def voisins(index, train_size, y, k):
    index_sort = np.argsort(dist_matrix[index])
    indice_voisins = []

    iteration = 1
    while len(indice_voisins) < k:
        # only index from the training set
        if(index_sort[iteration]<train_size):
            indice_voisins.append(index_sort[iteration])    
        iteration += 1

    distance_voisins = dist_matrix[index, indice_voisins]
    classes = y[indice_voisins]
    
    return indice_voisins, distance_voisins, classes

In [10]:
'''
    m_j compute the piece of evidence and combine the mass functions mj(.|x) using Dempster s rule
    It returns a list associated with each class
'''

def m_j(index, train_size, y, k):
    #collect the information of the neighbours
    indice_voisins, distance_voisins, classes = voisins(index, train_size, y, k)

    # init coeff_ABS 
    coeff_ABS, coeff_HIPS, coeff_PE, coeff_PP = 1, 1, 1, 1

    # compute the numerator of the coefficients of each classes
    for i in range(len(indice_voisins)) :
        current_class = str(classes.iloc[i])
        if current_class == 'ABS':
            coeff_ABS *= (1- phi(coeff_gamma, distance_voisins[i])[0])
        elif current_class == 'HiPS':
            coeff_HIPS *= (1- phi(coeff_gamma, distance_voisins[i])[1])
        elif current_class == 'PE':
            coeff_PE *= (1- phi(coeff_gamma, distance_voisins[i])[2])
        elif current_class == 'PP':
            coeff_PP *= (1- phi(coeff_gamma, distance_voisins[i])[3])
    
    # calcul the fourth m_i(A|X) = [A=theta_i, A!=theta_i]
    m_ABS=[1-coeff_ABS, coeff_ABS]
    m_HIPS=[1-coeff_HIPS, coeff_HIPS]
    m_PE=[1-coeff_PE, coeff_PE]
    m_PP=[1-coeff_PP, coeff_PP]

    m_j =  np.array([m_ABS, m_HIPS, m_PE, m_PP])

    # normalization coeff to compute m 
    product = np.prod(m_j)
    sum = 0
    for i in range(4):
        sum += m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1]
    n_factor = product + sum

    # calcul m = [m({ABS}), m({HIPS}), m({PE}), m({PP})]
    m = np.zeros(4)
    for i in range(4):
        m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor

    return m

Then, the variables permit computing eknn

In [11]:
'''
    EKNN uses the pignistic probability and predicts the class.
    It returns the prediction and the real value
'''
def eknn(index, train_size, y, k):

    # Considering k neightbours
    index_voisins, _ , _ = voisins(index, train_size, y, k)
    
    # Compute pignistic probability (BetP)
    betp = np.zeros(4)  
    
    #add each mass associated at each neightbour and divide by the size (k)
    for j in range(len(index_voisins)):
        betp += m_j(index_voisins[j], train_size, y, k)
    betp /= len(index_voisins)
    
    indice_classe_pred = np.argsort(betp)[0]
    
    classe_pred = classe[indice_classe_pred]
    
    return classe_pred, y[index]

In [15]:
pred, real = eknn(10000, len(X_train), y, 7)

In [12]:
#Try the EKNN with 10 neightbours

true = 0
false = 0
k = 10
for i in range(len(X_train), len(data)):
    pred, expected = eknn(i, len(X_train), y, 10)
    if pred == expected :
        true += 1
    else :
        false += 1

In [13]:
print(true/ (len(data)-len(X_train)))

0.9561189663578742


The accuracy is around 93-95% with 10 neighbours

In [14]:
X = [1, 2, 3, 4, 5, 10, 15, 20, 25, 35, 50]
accuracy= np.zeros(len(X))

for k in range(len(X)):
    true = 0
    false = 0
    k = X[k]
    for i in range(len(X_train), len(data)):
        pred, expected = eknn(i, len(X_train), y, k)
        if pred == expected :
            true += 1
        else :
            false += 1
    accuracy[k] = true / (len(data) - len(X_train))

IndexError: index 15 is out of bounds for axis 0 with size 11

Observing the evolution of accuracy with differents value of k

In [None]:
import matplotlib.pyplot as plt

plt.figure()
plt.xlabel("Evolution of k")
plt.ylabel("Accuracy")
plt.title("Evolution of accuracy according to k")
plt.plot()

### c. Implement the expected utility model with eknn.

In [208]:
def expected_utility_eknn(index, train_size, y, k, utility_matrix):
    # Considering k neighbors
    index_voisins, _, _ = voisins(index, train_size, y, k)

    # Compute pignistic probability (BetP)
    betp = np.zeros(4)

    # Add each mass associated with each neighbor and divide by the size (k)
    for j in range(len(index_voisins)):
        betp += m_j(index_voisins[j], train_size, y, k)
    betp /= len(index_voisins)

    # Use the utility matrix to compute expected utility
    expected_utilities = np.transpose(np.dot(utility_matrix.values, np.transpose(betp)))
    print(expected_utilities.shape)
    # Find the index of the class with the maximum expected utility
    indice_classe_pred = np.argmax(expected_utilities)
    
    classe_pred = classe[indice_classe_pred]

    return classe_pred, y[index]

In [209]:
#Try the EKNN with 10 neightbours

true_utility = 0
bad_prediction = []
not_predict = []
k = 10

true_utility = 0
for i in range(len(X_train), len(data)):
    pred, expected = expected_utility_eknn(i, len(X_train), y, 10, utility_matrix)
    if pred == expected :
        true_utility += 1
    else :
        bad_prediction.append(pred)
        not_predict.append(expected)

(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)
(4,)


In [210]:
print(true_utility/ (len(data)-len(X_train)))

0.0009751340809361287
