In [12]:
import ot
import numpy as np
import pandas as pd
import sys, os ,warnings
import math, random, statistics
from scipy.stats import entropy
from sklearn import preprocessing
from contextlib import contextmanager
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [13]:
@contextmanager
def suppress_stdout():
    """Context manager that discards everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at a handle opened on
    ``os.devnull``. The previous ``sys.stdout`` object is put back when the
    body finishes, including when it raises, and the null-device handle is
    closed afterwards.
    """
    saved_stream = sys.stdout
    sink = open(os.devnull, "w")
    try:
        sys.stdout = sink
        yield
    finally:
        # Restore first, then release the handle, so stdout never points at a closed file.
        sys.stdout = saved_stream
        sink.close()

# Install a blanket "ignore" filter: no Python warning is shown from this point on.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# in this notebook — confirm that is intended.
warnings.filterwarnings("ignore")

# Module-level MinMaxScaler instance; Read_Data calls fit_transform on it for the Digits features.
min_max_scaler = preprocessing.MinMaxScaler()

# Optimal Transport Propagation (OTP) algorithm

In [14]:
def OptimalTransportPropagation(X_L, X_U, Y_L, epsilon, alpha, metric, kClass, a=None, b=None):
    """Pseudo-label ``X_U`` by iterating optimal-transport label propagation.

    Each round:

    1. builds the cost matrix between the currently labelled samples and the
       remaining unlabelled ones with ``ot.dist``;
    2. computes a transport plan with ``ot.bregman.sinkhorn_knopp`` and
       normalises every column so it sums to 1;
    3. sums the column mass per class of the labelled samples, giving one
       class-score vector per unlabelled sample;
    4. turns the entropy of that vector into a certainty score
       ``1 - H / log(kClass.size)``;
    5. moves every sample whose certainty reaches the threshold into the
       labelled set together with its arg-max class.

    If no sample reaches ``alpha`` in a round, the threshold is lowered for
    that round only to the best certainty available, so at least one sample is
    labelled per round. The loop ends when no unlabelled sample is left.

    Parameters
    ----------
    X_L : pandas.DataFrame
        Features of the labelled samples.
    X_U : pandas.DataFrame
        Features of the unlabelled samples; its index identifies the samples
        in the returned objects.
    Y_L : pandas.Series
        Labels of ``X_L`` (same order as its rows).
    epsilon : float
        Regularisation value forwarded as ``reg`` to Sinkhorn-Knopp.
    alpha : float
        Certainty threshold a sample must reach to be pseudo-labelled.
    metric : str
        Metric name forwarded to ``ot.dist``.
    kClass : numpy.ndarray
        Array of classes; only its ``size`` is used, to normalise the entropy.
    a, b : array-like, optional
        Marginals over labelled / unlabelled samples for the first round.
        Uniform marginals are used when omitted, and always from the second
        round on because the set sizes change.

    Returns
    -------
    tuple
        ``(X_L, X_U, Y_L, Y_U_predicted)``: the original labelled features and
        labels, and the unlabelled features with their predicted labels, both
        indexed by the original ``X_U`` index and sorted by it.

    Raises
    ------
    RuntimeError
        If a round yields no usable certainty score (all NaN), which would
        otherwise loop forever.
    """
    nbLabelled = X_L.shape[0]
    y_true_index = []
    alpha_0 = alpha
    while len(X_U) > 0:
        M = ot.dist(X_L, X_U, metric=metric)
        # Identity checks: `a == None` on a numpy array is element-wise and
        # cannot be used as a condition.
        if a is None:
            a = np.repeat(1 / M.shape[0], M.shape[0])
        if b is None:
            b = np.repeat(1 / M.shape[1], M.shape[1])
        with suppress_stdout():
            T = ot.bregman.sinkhorn_knopp(a=a, b=b, M=M, reg=epsilon)
        # Column-normalise: each unlabelled sample gets a distribution over labelled samples.
        P = T / T.sum(0, keepdims=True)
        # Sum that distribution per class of the labelled samples (rows = sorted classes).
        class_mass = pd.DataFrame(P).groupby(Y_L.to_numpy()).sum()
        U = class_mass.to_numpy().T
        # Map the arg-max position back to the real class value; positions only
        # coincide with labels when the classes are exactly 0..k-1 and all present.
        class_labels = class_mass.index.to_numpy()
        y_pseudo = class_labels[np.argmax(U, 1)]
        Certainty_scores = 1 - (entropy(U, axis=1) / math.log(kClass.size))
        # nanmax keeps the fallback threshold well defined when some scores are NaN.
        alpha = min(alpha, np.nanmax(Certainty_scores))
        selected = Certainty_scores >= alpha
        if not selected.any():
            raise RuntimeError(
                "OptimalTransportPropagation: no unlabelled sample received a valid "
                "certainty score (all NaN); check epsilon and the input data."
            )
        Y_L = pd.concat([Y_L, pd.Series(y_pseudo[selected])], ignore_index=True)
        y_true_index.extend(X_U.index[selected].to_list())
        X_L = pd.concat([X_L, X_U[selected]], ignore_index=True)
        # Keep everything not selected, including NaN-scored samples, for the next round.
        X_U = X_U[~selected]
        # Marginals must be rebuilt because both sets changed size; the threshold
        # relaxation only applies to the round that needed it.
        a = b = None
        alpha = alpha_0
    # Rows appended after the first nbLabelled ones are the pseudo-labelled samples,
    # in labelling order; re-attach their original X_U index and sort by it.
    X_U = X_L.iloc[nbLabelled:].reset_index(drop=True)
    X_U.index = y_true_index
    Y_U_predicted = Y_L.iloc[nbLabelled:].reset_index(drop=True)
    Y_U_predicted.index = y_true_index
    Y_U_predicted = Y_U_predicted.sort_index()
    X_L = X_L.iloc[:nbLabelled]
    Y_L = Y_L.iloc[:nbLabelled]
    X_U = X_U.sort_index()
    return X_L, X_U, Y_L, Y_U_predicted

# Load Data: Iris and Digits Datasets

In [15]:
def Read_Data(dataName):
    """Load one of the supported datasets from ``./data/<dataName>.csv``.

    The CSV is read without a header row. For ``"Iris"`` columns 0-3 are the
    features and column 4 the label. For ``"Digits"`` columns 0-63 are the
    features (rescaled with the module-level ``min_max_scaler``) and column 64
    the label.

    Parameters
    ----------
    dataName : str
        Either ``"Iris"`` or ``"Digits"``.

    Returns
    -------
    tuple
        ``(X, Y, kClass, reg, alpha)``: features, labels, the unique label
        values, and the regularisation / certainty-threshold settings
        associated with the dataset.

    Raises
    ------
    ValueError
        If ``dataName`` is not a supported dataset name.
    """
    # Validate before touching the file system, so a typo gives a clear error
    # instead of an UnboundLocalError further down.
    if dataName not in ("Iris", "Digits"):
        raise ValueError(
            "Unknown dataset %r: expected 'Iris' or 'Digits'." % (dataName,)
        )
    dataFile = "./data/" + dataName + ".csv"
    data = pd.read_csv(dataFile, sep=",", header=None)
    if dataName == "Iris":
        X = data.loc[:, :3]
        Y = data.loc[:, 4]
        reg = 0.016
    else:  # "Digits"
        X = pd.DataFrame(min_max_scaler.fit_transform(data.loc[:, :63]))
        Y = data.loc[:, 64]
        reg = 0.01
    alpha = 0.9
    kClass = np.unique(Y)
    return X, Y, kClass, reg, alpha

# Training

In [16]:
def Training(dataName, test_sizes=(0.85, 0.75, 0.65), n_runs=10, seed=None):
    """Evaluate OptimalTransportPropagation on a dataset over repeated splits.

    For every entry of ``test_sizes`` the data is split ``n_runs`` times with
    a stratified ``train_test_split``. The training part plays the labelled
    set and the test part the unlabelled set. Accuracy, NMI and ARI of the
    predicted labels are rounded to 4 decimals per run and averaged per size.

    Parameters
    ----------
    dataName : str
        Dataset name forwarded to ``Read_Data``.
    test_sizes : sequence of float, optional
        Fractions of the data treated as unlabelled; the row labels of the
        result report the complementary labelled percentage.
    n_runs : int, optional
        Number of random splits per size (the same split seeds are reused for
        every size).
    seed : int, optional
        Seed for drawing the split seeds. With the default ``None`` they are
        drawn from the global ``random`` module state, so results differ
        between calls; pass an integer for reproducible results.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``test_sizes`` with columns ``OTP_ACC``,
        ``OTP_NMI`` and ``OTP_ARI``.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1, got %r" % (n_runs,))
    X, Y, kClass, reg, alpha = Read_Data(dataName)
    # A dedicated generator makes the split seeds reproducible without touching
    # the global random state; the global module is used when no seed is given.
    rng = random if seed is None else random.Random(seed)
    states = [rng.randint(0, 2000) for _ in range(n_runs)]
    AccuracyOTP, NmiOTP, AriOTP = [], [], []
    row_labels = []
    for size in test_sizes:
        Accuracy_list, Nmi_list, Ari_list = [], [], []
        for split_seed in states:
            # reset_index returns fresh objects, avoiding in-place edits of the split outputs.
            X_L, X_U, Y_L, Y_U = (
                part.reset_index(drop=True)
                for part in train_test_split(
                    X, Y, test_size=size, random_state=split_seed, stratify=Y
                )
            )
            X_L, X_U_, Y_L, Y_U_Pred = OptimalTransportPropagation(
                X_L, X_U, Y_L, epsilon=reg, alpha=alpha,
                metric="sqeuclidean", kClass=kClass,
            )
            Accuracy_list.append(round(accuracy_score(Y_U, Y_U_Pred, normalize=True), 4))
            Nmi_list.append(round(normalized_mutual_info_score(Y_U, Y_U_Pred), 4))
            Ari_list.append(round(adjusted_rand_score(Y_U, Y_U_Pred), 4))
        AccuracyOTP.append(statistics.mean(Accuracy_list))
        NmiOTP.append(statistics.mean(Nmi_list))
        AriOTP.append(statistics.mean(Ari_list))
        # Labelled share is the complement of the test fraction, e.g. 0.85 -> "15%".
        row_labels.append("Amount_Prior_Information = {:.0%}".format(1 - size))
    results = pd.DataFrame(
        {"OTP_ACC": AccuracyOTP, "OTP_NMI": NmiOTP, "OTP_ARI": AriOTP},
        index=row_labels,
    )
    return results

# Experiments

# Iris 

In [158]:
# Run the repeated-split evaluation on the Iris dataset; Training returns a DataFrame of mean metrics.
results = Training("Iris")

In [159]:
# Bare expression so the notebook renders the Iris results table as the cell output.
results

Unnamed: 0,OTP_ACC,OTP_NMI,OTP_ARI
Amount_Prior_Information = 15%,0.9609,0.865,0.887
Amount_Prior_Information = 25%,0.9735,0.8995,0.9214
Amount_Prior_Information = 35%,0.9694,0.9052,0.9097


# Digits 

In [161]:
# Run the same evaluation on the Digits dataset; this rebinds `results`, replacing the Iris table.
results=Training("Digits")

In [162]:
# Bare expression so the notebook renders the Digits results table as the cell output.
results

Unnamed: 0,OTP_ACC,OTP_NMI,OTP_ARI
Amount_Prior_Information = 15%,0.9398,0.8759,0.8735
Amount_Prior_Information = 25%,0.9503,0.8978,0.8947
Amount_Prior_Information = 35%,0.9718,0.9399,0.9391
