In [1]:
import numpy as np
import scipy as sp
import pylab as pl
import pandas as pd
import torch
import math
import ot
import ot.plot
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split, StratifiedShuffleSplit
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from scipy.stats import entropy
from sklearn.cluster import SpectralClustering
#Other approaches
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.datasets import load_wine
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
import warnings
warnings.filterwarnings("ignore")
import random
import statistics

In [2]:
def OptimalTransportPropagation(XTrain, XTest, metric, reg, kClass, yTrain, yTest, alpha, a=None, b =None):
    yPredicted = pd.Series([])
    nbTrain = yTrain.size
    nbTest = yTest.size
    nbIter=0
    while yPredicted.size < nbTest:
        #Compute the cost matrix
        M = ot.dist(x1=XTrain,x2=XTest,metric=metric)
        #M /= M.max()
        #M /= M.max()
        # Define a and b
        if(a==None):
            a = np.repeat((1/M.shape[0]),M.shape[0])
            
        if(b==None):
            b = np.repeat((1/M.shape[1]),M.shape[1])
            
        # Apply the sinkhorn optimal transport algorithm
        T = ot.bregman.sinkhorn_knopp(a=a, b=b, M=M, reg=reg)
        #if tensor  keepdim=True instead of keepdims
        P = T / T.sum(0,keepdims=True)
        df = pd.DataFrame(P,yTrain)
        df['key'] = yTrain.values
        df = df.groupby('key').aggregate('sum')
        U = torch.tensor(df.values).t()
        yOutput = torch.argmax(U,1)
        entropyScore = 1 - (entropy(U,axis=1)/math.log(kClass.size))
        growerAlpha = np.where(entropyScore>=alpha)[0]
        lowerAlpha = np.where(entropyScore<alpha)[0]
        yTrain = yTrain.append(yTest.iloc[growerAlpha], ignore_index=True)
        yTest = pd.Series(yTest.values[lowerAlpha])
        yPredicted = yPredicted.append(pd.Series(yOutput[growerAlpha]), ignore_index=True)
        XTrain  = XTrain.append(XTest.iloc[growerAlpha], ignore_index=True)
        XTest['Score'] = pd.Series(entropyScore, index=XTest.index)
        XTest = XTest[XTest['Score']<alpha]
        if(XTest.size>0):
            XTest = XTest.drop(['Score'], axis=1)
        a=b=None
        nbIter +=1
        if(nbIter==10):
            alpha/=1.5
            nbIter=0

    XTest = XTrain.iloc[nbTrain:]   
    yTest = yTrain.iloc[nbTrain:]
    XTrain = XTrain.iloc[:nbTrain]   
    yTrain = yTrain.iloc[:nbTrain]
    return XTrain, XTest, yTrain, yTest, yPredicted

In [3]:
def read_data(dataName):
    dataFile = "../data/"+ dataName+".csv"
    data = pd.read_csv(dataFile, sep=",", header=None)
    if(dataName=="Iris"):
        X = data.loc[:,:3] 
        y = data.loc[:,4]
        reg = 0.016
        alpha = 0.9
        L = [1172] 
    if(dataName=="Wine"):
        X = data.loc[:,:12] 
        y = data.loc[:,13]
        reg = 0.016
        alpha = 0.7
        #alpha = 0.7,0.7,0.9
        L = [189]
    if(dataName=="Heart"):
        X = data.loc[:,:12] 
        y = data.loc[:,13]
        reg = 0.1
        alpha = 0.9
        L = [133]
    if(dataName=="Breast"):
        X = data.loc[:,:29] 
        y = data.loc[:,30]
        reg = 0.016
        alpha = 0.999
        #alpha = 0.999,0.99,0.9
        L = [120]
    if(dataName=="Digits"):
        X = data.loc[:,:63] 
        y = data.loc[:,64]
        reg = 0.01
        alpha = 0.99999999999999999
        L = [173]
    if(dataName=="WDBC"):
        X = data.loc[:,:30] 
        y = data.loc[:,30]
        reg = 0.016
        alpha = 0.9
        L = [0]
    if(dataName=="MNIST"):
        X = data.loc[:,:783] 
        y = data.loc[:,784]
        reg = 0.9
        alpha = 0.7
        #alpha = 0.7,0.7,0.9
        L = [0]
    if(dataName=="Isolet"):
        X = data.loc[:,:616] 
        y = data.loc[:,617]
        reg = 0.1
        alpha = 0.7
        L = [1]
    if(dataName=="Ionosphere"):
        X = data.loc[:,:33] 
        y = data.loc[:,34]
        reg = 0.1
        alpha = 0.9
        L = [40]
    if(dataName=="Statlog"):
        X = data.loc[:,:35] 
        y = data.loc[:,36]
        reg = 0.01
        alpha = 0.9
        L = [1]
    if(dataName=="Waveform"):
        X = data.loc[:,:20] 
        y = data.loc[:,21]
        reg = 0.016
        alpha=0.9
        L = [0]
    if(dataName=="Dermatology"):
        X = data.loc[:,:33] 
        y = data.loc[:,34]
        reg = 0.01
        alpha=0.9
        L = [0]
    print("\nThe input data withOut labels: \n")
    print(X.head())
    if(dataName!="Iris"):
        X = pd.DataFrame(min_max_scaler.fit_transform(X))
    print("\nLabels of the input data are: ", np.unique(y))
    kClass = np.unique(y)
    print("\nkClass = ", kClass.size)
    return X,y,kClass

In [4]:
def choose_parametre(dataName,i):
    if(dataName=="Iris"):
        reg = 0.016
        alpha = 0.9
        L = [1172] 
    if(dataName=="Wine"):
        reg = 0.016
        alpha = 0.7
        if(i==0 or i==1):
            alpha = 0.7
        if(i==2):
            alpha = 0.7
        L = [189]
    if(dataName=="Heart"):
        reg = 0.1
        alpha = 0.9
        L = [133] 
    if(dataName=="Breast"):
        reg = 0.016
        alpha=0
        if(i==0):
            alpha = 0.999
        if(i==1):
            alpha = 0.99
        if(i==2):
            alpha = 0.9
        L = [120]
    if(dataName=="Digits"):
        reg = 0.01
        alpha = 0.99999999999999999
        L = [173]
    if(dataName=="WDBC"):
        reg = 0.016
        alpha = 0.9
        L = [0]
    if(dataName=="MNIST"):
        reg = 0.9
        alpha=0
        if(i==0 or i==1):
            alpha = 0.7
        if(i==2):
            alpha = 0.9
        L = [0]
    if(dataName=="Isolet"):
        reg = 0.1
        alpha = 0.7
        L = [1]
    if(dataName=="Statlog"):
        reg = 0.01
        alpha = 0.9
        L = [1]
    if(dataName=="Ionosphere"):
        reg = 0.1
        alpha = 0.9
        L = [40]
    if(dataName=="Dermatology"):
        reg = 0.01
        alpha = 0.9
        L = [0]
    if(dataName=="Waveform"):
        reg = 0.016
        alpha=0
        if(i==0):
            alpha = 0.9
        if(i==1 or i==2):
            alpha = 0.9
        L = [0]
    return reg,alpha,L

In [5]:
def experimentation(dataName):
    T=[0.85,0.75,0.65]
    X,y,kClass=read_data(dataName) 
    __,__,L=choose_parametre(dataName,i=-1)
    for  k in range (9):   
        num = random.randint(0,20)
        L.append(num)
    AccuracyOTP1=np.zeros(len(T))
    AccuracyLP1=np.zeros(len(T))
    AccuracyLS1=np.zeros(len(T))
    NmiOTP1=np.zeros(len(T))
    NmiLP1=np.zeros(len(T))
    NmiLS1=np.zeros(len(T))
    AriOTP1=np.zeros(len(T))
    AriLP1=np.zeros(len(T))
    AriLS1=np.zeros(len(T))
    for i in range (len(T)): 
        reg,alpha,__=choose_parametre(dataName,i)
        AccuracyOTP=np.zeros(len(L))
        AccuracyLP=np.zeros(len(L))
        AccuracyLS=np.zeros(len(L))
        NmiOTP=np.zeros(len(L))
        NmiLP=np.zeros(len(L))
        NmiLS=np.zeros(len(L))
        AriOTP=np.zeros(len(L))
        AriLP=np.zeros(len(L))
        AriLS=np.zeros(len(L))
        for j in range (len(L)):
            XTr, XTt, yTr, yTt = train_test_split(X, y, test_size=T[i], random_state=L[j], stratify=y)
            XTr, XTt, yTr, yTt, yPrd = OptimalTransportPropagation(XTrain=XTr, XTest=XTt, metric = "sqeuclidean",reg = reg, kClass=kClass, yTrain=yTr,yTest=yTt, alpha=alpha)
            AccuracyOTP[j]=round(accuracy_score(yTt, yPrd, normalize=True),4)
            NmiOTP[j]=round(normalized_mutual_info_score(yTt, yPrd),4)
            AriOTP[j]=round(adjusted_rand_score(yTt, yPrd),4)
            XTr, XTt, yTr, yTt = train_test_split(X, y, test_size=T[i], random_state=L[j], stratify=y)
            label_prop_model = LabelPropagation()
            label_prop_model.fit(XTr, yTr)
            yPrd = label_prop_model.predict(XTt)
            AccuracyLP[j]=round(accuracy_score(yTt, yPrd, normalize=True),4)
            NmiLP[j]=round(normalized_mutual_info_score(yTt, yPrd),4)
            AriLP[j]=round(adjusted_rand_score(yTt, yPrd),4)
            XTr, XTt, yTr, yTt = train_test_split(X, y, test_size=T[i], random_state=L[j], stratify=y)
            label_prop_model = LabelSpreading()
            label_prop_model.fit(XTr, yTr)
            yPrd = label_prop_model.predict(XTt)
            AccuracyLS[j]=round(accuracy_score(yTt, yPrd, normalize=True),4)
            NmiLS[j]=round(normalized_mutual_info_score(yTt, yPrd),4)
            AriLS[j]=round(adjusted_rand_score(yTt, yPrd),4) 
        AccuracyOTP1[i]=statistics.mean(AccuracyOTP)
        AccuracyLP1[i]=statistics.mean(AccuracyLP)
        AccuracyLS1[i]=statistics.mean(AccuracyLS)
        NmiOTP1[i]=statistics.mean(NmiOTP)
        NmiLP1[i]=statistics.mean(NmiLP)
        NmiLS1[i]=statistics.mean(NmiLS)
        AriOTP1[i]=statistics.mean(AriOTP)
        AriLP1[i]=statistics.mean(AriLP)
        AriLS1[i]=statistics.mean(AriLS)
    Acc=[AccuracyOTP1,AccuracyLP1,AccuracyLS1]
    Accuracy=pd.DataFrame(data=np.transpose(Acc),columns=["OTP","LP","LS"], index=["T=0.85","T=0.75","T=0.65"])
    Nmi=[NmiOTP1,NmiLP1,NmiLS1]
    NMI=pd.DataFrame(data=np.transpose(Nmi),columns=["OTP","LP","LS"], index=["T=0.85","T=0.75","T=0.65"])
    Ari=[AriOTP1,AriLP1,AriLS1]
    ARI=pd.DataFrame(data=np.transpose(Ari),columns=["OTP","LP","LS"], index=["T=0.85","T=0.75","T=0.65"])
    results=pd.concat([Accuracy,NMI,ARI],axis=1)
    return results

In [6]:
results=experimentation("Wine")


The input data withOut labels: 

      0     1     2     3    4     5     6     7     8     9     10    11  \
0  14.23  1.71  2.43  15.6  127  2.80  3.06  0.28  2.29  5.64  1.04  3.92   
1  13.20  1.78  2.14  11.2  100  2.65  2.76  0.26  1.28  4.38  1.05  3.40   
2  13.16  2.36  2.67  18.6  101  2.80  3.24  0.30  2.81  5.68  1.03  3.17   
3  14.37  1.95  2.50  16.8  113  3.85  3.49  0.24  2.18  7.80  0.86  3.45   
4  13.24  2.59  2.87  21.0  118  2.80  2.69  0.39  1.82  4.32  1.04  2.93   

     12  
0  1065  
1  1050  
2  1185  
3  1480  
4   735  

Labels of the input data are:  [0 1 2]

kClass =  3


In [328]:
print(results)

            OTP       LP       LS      OTP       LP       LS      OTP  \
T=0.85  0.87960  0.83009  0.83109  0.46762  0.35020  0.35358  0.57239   
T=0.75  0.88713  0.84394  0.84621  0.50001  0.38483  0.39119  0.59279   
T=0.65  0.89784  0.84586  0.84760  0.53830  0.39721  0.40144  0.62814   

             LP       LS  
T=0.85  0.42213  0.42485  
T=0.75  0.46067  0.46733  
T=0.65  0.46507  0.47028  
