In [1]:
import numbers
from sklearn.preprocessing import minmax_scale
from numpy.random import *
from random import *
import numpy as np
import pandas as pd
from math import *
from Metriques import *
from Outils import *
from statistics import *

## Extraction des données

L'indice de Rand (Rand Index) correspond à la proportion des paires d'éléments qui sont conjointement groupées ou conjointement séparées dans les deux partitions comparées.
Aide : https://fr.wikipedia.org/wiki/Indice_de_Rand#:~:text=L'indice%20de%20Rand%20est,'accord)%20entre%20deux%20partitions.

## Fonction KPrototypes

In [2]:
def preparation_donnees(Df):
    """
    Fill the missing values of every numerical column with that column's mean.

    A column is treated as numerical when its pandas dtype is numeric
    (booleans excluded). Missing values in the other columns are left as is.

    Parameters
    ----------
    Df : pandas.DataFrame
        Table to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``Df``, returned so the call can be chained.
    """
    # Rely on the column dtype rather than on the value found in the first
    # row: a NaN in the first row of a text column is a float and would
    # otherwise make that column look numerical.
    numerical = Df.select_dtypes(include="number").columns

    for k in numerical:
        missing = Df[k].isna()
        if missing.any():
            # A boolean mask addresses the rows themselves, so the fill is
            # correct whatever the index labels are (a positional counter
            # used as a label would append new rows on a non-default index).
            Df.loc[missing, k] = Df[k].mean()
    return Df

In [3]:
def silhouette(cluster,gamma):
    """
    Silhouette score of a clustering of mixed numerical/categorical points.

    For each point, ``a`` is its mean ``gamma_distance`` to the other points
    of its own cluster and ``b`` is the smallest mean distance to the points
    of another cluster; the point scores ``(b - a) / max(a, b)``. Scores are
    averaged inside each cluster, then across clusters.

    Parameters
    ----------
    cluster : mapping or sequence of pandas.DataFrame
        Clusters indexed ``0 .. len(cluster) - 1``, all sharing the same
        columns.
    gamma : float
        Weight of the categorical dissimilarity in ``gamma_distance``.

    Returns
    -------
    float
        Average of the per-cluster mean silhouettes over non-empty clusters.

    Raises
    ------
    ValueError
        If fewer than two clusters contain points.
    """
    groups = [cluster[c] for c in range(len(cluster))]
    # Empty clusters carry no point to score and no distance to average.
    populated = [g for g in groups if len(g) > 0]
    if len(populated) < 2:
        raise ValueError("silhouette needs at least two non-empty clusters")

    # Column kinds are read from the first point of the first non-empty cluster.
    reference = populated[0].iloc[0]
    numerical = []
    categorical = []
    for k in populated[0]:
        if isinstance(reference[k], numbers.Real):
            numerical.append(k)
        else:
            categorical.append(k)

    def total_distance(point, group):
        # Sum of the distances from `point` to every row of `group`.
        return sum(
            gamma_distance(point, group.iloc[j], gamma, numerical, categorical)
            for j in range(len(group))
        )

    sc = 0
    for c, own in enumerate(populated):
        size = len(own)
        sc_ = 0
        # A singleton has no intra-cluster distance: its silhouette is 0 by
        # convention, which also avoids dividing by size - 1 == 0.
        if size > 1:
            for i in range(size):
                point = own.iloc[i]
                # The point itself is part of the sum (at distance 0, assuming
                # the dissimilarity of a point with itself is 0), hence size - 1.
                ai = total_distance(point, own)/(size-1)
                bi = min(
                    total_distance(point, other)/len(other)
                    for oc, other in enumerate(populated)
                    if oc != c
                )
                scale = max(ai, bi)
                # All points identical: leave the contribution at 0 instead of 0/0.
                if scale > 0:
                    sc_ += (bi-ai)/scale
        sc += sc_/size
    s = sc/len(populated)
    return s

In [4]:
def gamma_distance(pointA,pointB,gamma,numericalKeys,categoricalKeys) :
    """
    Mixed distance between two points.

    Adds the Euclidean norm of the difference of the numerical attributes to
    ``gamma`` times the ``dissimilarity`` of the categorical attributes.
    """
    numeric_a = np.array(pointA[numericalKeys])
    numeric_b = np.array(pointB[numericalKeys])
    numeric_part = np.linalg.norm(numeric_a - numeric_b)

    categorical_part = dissimilarity(pointB[categoricalKeys], pointA[categoricalKeys])

    return numeric_part + gamma * categorical_part

In [5]:
# NOTE(review): this initializer was reported as not working with KPrototype.
# Stacking the two parts row-wise is the likely cause and is fixed below — confirm.
def Initial_Centers_KProt(K,data,num_keys,cat_keys):
    """
    Build K initial prototypes for mixed data.

    The numerical part comes from ``random_centers`` and the categorical part
    from ``k_modes_center``; both are joined column-wise so that each of the
    K rows holds every attribute.

    Parameters
    ----------
    K : int
        Number of prototypes requested.
    data : pandas.DataFrame
        Data set containing the ``num_keys`` and ``cat_keys`` columns.
    num_keys, cat_keys : list
        Names of the numerical and categorical columns.

    Returns
    -------
    pandas.DataFrame
        One row per prototype, numerical columns first.
    """
    num_array = np.array(data[num_keys])
    num_centers = random_centers(K,num_array)
    cat_centers = k_modes_center(K,data[cat_keys])
    df_num = pd.DataFrame(num_centers,columns=num_keys)
    # presumably k_modes_center returns K rows of categorical values
    # (DataFrame or array-like) — TODO confirm against Outils.
    # The index is reset so both halves line up on 0..K-1 when joined.
    df_cat = pd.DataFrame(cat_centers,columns=cat_keys).reset_index(drop=True)
    # axis=1 puts the two halves side by side; the default axis=0 would
    # produce 2K half-empty rows.
    return pd.concat([df_num,df_cat],axis=1)

In [6]:
def KPrototype(K,data,max_iter,gamma):
    """
    Cluster mixed numerical/categorical data into K groups (k-prototypes).

    Numerical columns are min-max scaled per column. The distance between a
    point and a center is the Euclidean distance on the scaled numerical
    attributes plus ``gamma`` times the ``dissimilarity`` of the categorical
    attributes. Initial centers are K rows drawn with ``sample``.

    Parameters
    ----------
    K : int
        Number of clusters.
    data : pandas.DataFrame
        Points to cluster; a column is numerical when the value in its first
        row is a real number, categorical otherwise.
    max_iter : int
        Maximum number of assignment/update rounds (at least 1).
    gamma : float
        Weight of the categorical part of the distance.

    Returns
    -------
    tuple
        ``(centers, cluster, label)``: the last computed centers, a dict
        mapping each cluster id to the DataFrame of its (scaled) points, and
        the list of cluster ids in the row order of ``data``.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Split the columns into numerical and categorical ones
    first_row = data.iloc[0]
    number_keys = []
    categorical_keys = []

    for k in data :
        if isinstance(first_row[k],numbers.Real) :
            number_keys.append(k)
        else:
            categorical_keys.append(k)

    n_rows = data.shape[0]

    # Build the two aligned halves of the working data set.
    # Both use a fresh 0..n-1 index so the column-wise concat cannot misalign
    # when `data` carries a non-default index.
    categorical = data[categorical_keys].reset_index(drop=True)
    if number_keys:
        # Rows are samples and columns are features, so the default axis=0
        # rescales every feature to [0, 1] independently.
        scaled = minmax_scale(data[number_keys].to_numpy(dtype=float))
    else:
        scaled = np.empty((n_rows, 0))
    numerical = pd.DataFrame(scaled,columns=number_keys)

    dataset = pd.concat([numerical,categorical],axis=1)
    centers = dataset.iloc[sample(range(0,n_rows),K)]

    numeric_points = numerical.to_numpy()

    # `iteration` is kept separate from every inner loop variable so that
    # the max_iter bound is actually enforced.
    iteration = 0
    not_centers_same = True

    while (iteration!=max_iter and not_centers_same):
        iteration += 1

        # The centers do not change during the assignment step: extract
        # their two halves once per round.
        center_num = [np.array(centers.iloc[c][number_keys],dtype=float) for c in range(K)]
        center_cat = [centers.iloc[c][categorical_keys] for c in range(K)]

        # Assign every point to its closest center
        label = []
        for d in range(n_rows):
            point_cat = categorical.iloc[d]
            value_clusters = []
            for c in range(K):
                distance = np.linalg.norm(numeric_points[d]-center_num[c])+gamma*dissimilarity(center_cat[c],point_cat)
                value_clusters.append((distance,c))

            # Ties on the distance go to the smallest cluster id
            label.append(min(value_clusters)[1])

        # Rebuild the clusters in one pass from the labels; rows are
        # renumbered 0..m-1 inside each cluster.
        members = dict((c,[]) for c in range(K))
        for d, c in enumerate(label):
            members[c].append(d)
        cluster = dict((c,dataset.iloc[members[c]].reset_index(drop=True)) for c in range(K))

        # Recompute the centers
        new_centers = [new_centroid_KProt(cluster[c],number_keys,categorical_keys) for c in range(K)]
        new_centers = pd.concat(new_centers,axis=0)

        if (np.array_equal(new_centers,centers)):
            not_centers_same = False
        else:
            centers = new_centers

    return (new_centers,cluster,label)

## Utilisation

### Bank Data

In [7]:
# Load the bank data set (semicolon-separated CSV) from the local data folder.
Df = pd.read_csv("Données/bank.csv",sep=";")

In [8]:
# X holds every column except 'y'; Y keeps 'y' as the reference labels used for scoring below.
X = Df.drop(['y'],axis=1)
Y = Df['y']

In [9]:
# Replace missing numerical values with the column mean (X is also modified in place).
X = preparation_donnees(X)

In [9]:
# Preview the first rows of the loaded table.
Df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [10]:
# Run the 2-cluster KPrototype (gamma=1, max_iter=10) under four seeds and
# score each labelling against the reference labels Y.
rd = []
acc = []
adj_rd = []
for exp in range(4):
    seed(exp)  # reseed before each run
    print(exp)  # progress trace
    _,_,label= KPrototype(2,X,10,1)
    acc.append(Accuracy(Y,label))
    rd.append(rand_index(Y,label))
    adj_rd.append(Adjusted_Rand_Index(Y,label))

0
1
2
3


In [11]:
# Mean and population standard deviation of the adjusted Rand index over the runs.
mean(adj_rd),pstdev(adj_rd)

(0.011654129536352843, 0.00851973053291839)

In [12]:
# Mean and population standard deviation of the accuracy over the runs.
mean(acc),pstdev(acc)

(0.5185799601857997, 0.09534578427821283)

In [13]:
# Mean and population standard deviation of the Rand index over the runs.
mean(rd),pstdev(rd)

(0.5187656227673023, 0.010822249037896043)

### Gamma

La fonction Silhouette mettant trop de temps à s'exécuter à cause du trop grand nombre de données (45k), nous nous passerons de ces résultats.

In [10]:
# Fix the random seed once for the gamma comparison that follows.
# NOTE(review): presumably random.seed given the star-import order — confirm.
seed(0)

In [11]:
# Keep only the clusters of a 2-cluster run with gamma=1.5.
_,cluster,_ = KPrototype(2,X,10,1.5)

In [12]:
# Silhouette of those clusters, measured with the same gamma=1.5.
silhouette(cluster,1.5)

0.1161787714562687

In [13]:
# Same run with gamma=1 (the seed is not reset, so the initial centers differ).
_,cluster,_ = KPrototype(2,X,10,1)

In [14]:
# Silhouette of those clusters, measured with the same gamma=1.
silhouette(cluster,1)

0.1679826123115655

In [15]:
# Same run with gamma=0.5.
_,cluster,_ = KPrototype(2,X,10,0.5)

In [16]:
# Silhouette of those clusters, measured with the same gamma=0.5.
silhouette(cluster,0.5)

0.18982927093579638

### Credit Approval

In [7]:
# Load the credit-approval data set (comma-separated CSV); this rebinds Df.
Df = pd.read_csv("Données/credit-approval.csv",sep=",")

In [8]:
# Features are every column except 'class'; missing numerical values are
# then replaced with the column mean.
X = Df.drop(['class'],axis=1)
X = preparation_donnees(X)

In [9]:
# Reference labels used to score the clusterings below.
Y = Df['class']

In [17]:
# List the distinct values of the 'class' column.
set(Df['class'])

{'+', '-'}

In [19]:
# Run the 2-cluster KPrototype (gamma=1, max_iter=10) under three seeds and
# score each labelling against the reference labels Y.
rd = []
acc = []
adj_rd = []
for exp in range(3):
    seed(exp)  # reseed before each run
    print(exp)  # progress trace
    _,_,label= KPrototype(2,X,10,1)
    acc.append(Accuracy(Y,label))
    rd.append(rand_index(Y,label))
    adj_rd.append(Adjusted_Rand_Index(Y,label))

0
1
2


In [20]:
# Mean and population standard deviation of the Rand index over the runs.
mean(rd),pstdev(rd)

(0.6221913716581478, 0.08756824519689782)

In [21]:
# Mean and population standard deviation of the accuracy over the runs.
mean(acc),pstdev(acc)

(0.6884057971014492, 0.1608499792228928)

In [22]:
# Mean and population standard deviation of the adjusted Rand index over the runs.
mean(adj_rd),pstdev(adj_rd)

(0.2433363694216435, 0.1765612897621576)

**Remarque : quand les données ne sont pas normalisées, les clusters ne sont pas équilibrés : tous les points se retrouvent dans un même cluster.**
- Revoir: choix des points aléatoire au début

### Divers gamma

In [26]:
# Fix the random seed once for the gamma comparison that follows.
# NOTE(review): presumably random.seed given the star-import order — confirm.
seed(0)

In [27]:
# Keep only the clusters of a 2-cluster run with gamma=1.5.
_,cluster,_ = KPrototype(2,X,10,1.5)

In [28]:
# Silhouette of those clusters, measured with the same gamma=1.5.
silhouette(cluster,1.5)

0.22697005919140606

In [29]:
# Same run with gamma=1 (the seed is not reset, so the initial centers differ).
_,cluster,_ = KPrototype(2,X,10,1)

In [30]:
# Silhouette of those clusters, measured with the same gamma=1.
silhouette(cluster,1)

0.2201687527840766

In [10]:
# Same run with gamma=0.5.
_,cluster,_ = KPrototype(2,X,10,0.5)

In [11]:
# Silhouette of those clusters, measured with the same gamma=0.5.
silhouette(cluster,0.5)

0.2222980848530644

## Vote

In [13]:
# Load the house-votes data set and split it into features X and reference labels Y.
# NOTE(review): a column literally named 'republican' suggests the CSV has no
# header row and its first record was consumed as the header — confirm, and
# pass header=None with explicit names if so.
Df = pd.read_csv("Données/house-votes-84.csv",sep=",")
X = Df.drop(['republican'],axis=1)
Y = Df['republican']

In [14]:
# Replace missing numerical values with the column mean (X is also modified in place).
X = preparation_donnees(X)

In [15]:
# Fix the random seed before the run below.
seed(1)

In [16]:
# 2-cluster run with gamma=1.5 on the vote data.
# NOTE(review): the recorded output of this cell is an UnboundLocalError, so
# no result was obtained on this data set.
_,cluster,_ = KPrototype(2,X,10,1.5)

UnboundLocalError: local variable 'numerical' referenced before assignment

Notre fonction ne marche pas lorsque le tableau est rempli uniquement d'attributs catégoriels ou uniquement d'attributs numériques, car KModes et KMeans sont spécialisés pour ces cas.