# Decision theory project

In [3]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split

In [81]:
# Create train and test datasets
data = pd.read_csv("./PlasticsTrain.csv", sep=";")

y = data["class"]
X = data.drop(["class", "line", "column", "object"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

classe = ['ABS', 'HIPS', 'PE', 'PP']

In [82]:
# Distances
newDF = pd.concat([X_train, X_test])

# Convertir toutes les colonnes en float
newDF = newDF.replace(',', '.', regex=True)
newDF = newDF.astype(float)

distances = pdist(newDF.values, metric='euclidean')
dist_matrix = squareform(distances)

In [84]:
def fit_gamma(train_label, dist, classes):
    gamma = []
    nbC = len(classes)
    for i in range(nbC):
        C_indices = [t for t in range(len(train_label)) if (train_label.iloc[t] == classes[i])]
        C_dist = []
        for ci in range(len(C_indices)):
            for cj in range(ci + 1, len(C_indices)):
                C_dist.append(dist[C_indices[ci]][C_indices[cj]])
        gamma.append(1 / np.mean(C_dist) if len(C_dist) > 0 else np.nan)
    return gamma

In [86]:
label = X.columns

coeff_gamma = fit_gamma(y_train, dist_matrix, classe)

In [87]:
#We have the coefficients gamma_i for each class
coeff_gamma

[0.3807849377993026, nan, 0.21556149380000975, 0.19077490914257633]

In [88]:
#define coefficient phi and we fix alpha = 0.95
def phi(gamma, d, alpha=0.95):
    return [alpha*np.exp(-gamma[i]*d**2) for i in range(len(gamma))]

In [89]:
phi(coeff_gamma, 4)

[0.0021466381770633503, nan, 0.03018901401298609, 0.04488319454188748]

In [90]:
def voisins(indice, train_size, y):
    indice_triee = np.argsort(dist_matrix[indice])
    indice_voisins = []
    compteur = 1
    while len(indice_voisins) < 5:
        if(indice_triee[compteur]<train_size):
            indice_voisins.append(indice_triee[compteur])
            
        compteur += 1
    distance_voisins = dist_matrix[indice, indice_voisins]
    classes = y[indice_voisins]
    return indice_voisins, distance_voisins, classes

In [91]:
a, b, c = voisins(10000, len(X_train), y)

In [92]:
print(c)

2706    HiPS
4397    HiPS
7384      PE
7908      PP
7824      PP
Name: class, dtype: object


In [54]:
y.unique()

array(['ABS', 'HiPS', 'PE', 'PP'], dtype=object)

In [93]:
def m_j(indice, train_size, y):
    indice_voisins, distance_voisins, classes = voisins(indice, train_size, y)
    coeff_ABS, coeff_HIPS, coeff_PE, coeff_PP = 1, 1, 1, 1

    #calcul des coefficients
    for i in range(len(indice_voisins)) :
        current_class = str(classes.iloc[i])
        if current_class == 'ABS':
            coeff_ABS *= (1- phi(coeff_gamma, distance_voisins[i])[0])
        elif current_class == 'HIPS':
            coeff_HIPS *= (1- phi(coeff_gamma, distance_voisins[i])[1])
        elif current_class == 'PE':
            coeff_PE *= (1- phi(coeff_gamma, distance_voisins[i])[2])
        elif current_class == 'PP':
            coeff_PP *= (1- phi(coeff_gamma, distance_voisins[i])[3])
    
    # calcul the fourth m_i(A|X) = [A=theta_i, A!=theta_i]
    m_ABS=[1-coeff_ABS, coeff_ABS]
    m_HIPS=[1-coeff_HIPS, coeff_HIPS]
    m_PE=[1-coeff_PE, coeff_PE]
    m_PP=[1-coeff_PP, coeff_PP]

    m_j =  np.array([m_ABS, m_HIPS,m_PE, m_PP])

    # normalization coeff 
    product = np.prod(m_j)
    sum = 0
    for i in range(4):
        sum += m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1]
    n_factor = product + sum

    # calcul m = [m({ABS}), m({HIPS}), m({PE}), m({PP})]
    m = np.zeros(4)
    for i in range(4):
        m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor

    return m

In [94]:
a = m_j(10000, len(X_train), y)

In [95]:
a

array([0.       , 0.       , 0.0933047, 0.9066953])

In [96]:
def eknn(indice, train_size, y):
    label = y.unique()

    #on considère k voisins
    indice_voisins, _ , _ = voisins(indice, train_size, y)
    
    # Calcul des masses m_i(A|X) pour chaque voisin
    masses = [m_j(ind, train_size, y) for ind in indice_voisins]
    
    # Calcul de la probabilité pignistique (BetP)
    betp = np.zeros(4)  
    
    for i in range(4):
        for j in range(len(indice_voisins)):
            betp[i] += masses[j][i]
    
    betp /= len(indice_voisins)
    
    indice_classe_pred = np.argsort(betp)[-1]
    
    classe_pred = label[indice_classe_pred]
    
    return classe_pred, y[indice]

In [97]:
true = 0
false = 0

for i in range(len(X_train), len(data)):
    pred, expected = eknn(i, len(X_train), y)
    if pred == expected :
        true += 1
    else :
        false += 1
print(true, false)


  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i][0]*m_j[(i+1)%4][1]*m_j[(i+2)%4][1]*m_j[(i+3)%4][1] ) / n_factor
  m[i] = ( m_j[i

74 1977
