In [1]:
import copy
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import autograd.numpy as np
from sklearn import datasets, preprocessing
import pandas as pd
from pymanopt.solvers import TrustRegions
from manopt_dr.core import gen_ldr
from manopt_dr.predefined_func_generator import *
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from scipy.interpolate import interp1d

In [2]:
# Generate data

"""
Class Distribution: (out of 214 total instances)
    -- 163 Window glass (building windows and vehicle windows)
       -- 87 float processed  
          -- 70 building windows
          -- 17 vehicle windows
       -- 76 non-float processed
          -- 76 building windows
          -- 0 vehicle windows
    -- 51 Non-window glass
       -- 13 containers
       -- 9 tableware
       -- 29 headlamps
7 labels
"""

X = pd.read_csv("glass.csv", header = None,usecols=[ 1,2,3,4,5,6,7,8,9],delim_whitespace=False)
y = pd.read_csv("glass.csv", header = None,usecols=[ 10],delim_whitespace=False)


n_samples, n_features = X.shape
X = X.values.tolist()
y = y.values.tolist()
mapped = []
for j in y:
    mapped.append(' '.join(map(str, j)) )
y = mapped
d = dict([(b,a) for a,b in enumerate(sorted(set(y)))])
y = [d[i] for i in y]
n_components = 2
X = preprocessing.scale(X)


In [3]:
# generalized cPCA

GCPCA = gen_ldr(gen_cost_gcpca, gen_default_proj)
label_set = [1,2,3,4,5,6,7]


def return_partition(my_list):
    filtered=[]
    for l in range(1,len(my_list)):
        for c in itertools.combinations(my_list,l):
            filtered.append(c)
    return filtered

partitions = return_partition(label_set)

def print_groups(groups):
    s = ""
    for g in range(len(groups)):
        if g == len(groups) - 1:
            s += str(groups[g])
        else:
            s += str(groups[g]) + ","
    return s

def print_groups_index(index):
    groups =  partitions[index]
    s = ""
    for g in range(len(groups)):
        if g == len(groups) - 1:
            s += str(groups[g])
        else:
            s += str(groups[g]) + ","
    return s

#for p in partitions:
#    print(print_groups(p))
#print(len(partitions))




In [4]:
def Trials(g1,g2):    

#    print(par[0])
#    print(par[1])
    y_tg = copy.deepcopy(y)
    y_tg = np.asarray(y_tg)

    for i in g1:
        y_tg[y_tg == i] = -1

    y_bg = copy.deepcopy(y)
    y_bg = np.asarray(y_bg)

    for i in g2:
        y_bg[y_bg == i] = -1

    gcpca = GCPCA(n_components=n_components).fit(X, y_tg, y_bg)
    Z = gcpca.transform(X)
    cost = gcpca.get_final_cost()
    contrastiveness = 1 / cost

    # Plot
    

    tg_x = [Z[i][0] for i in range(len(y)) if y[i] in g1]
    tg_y = [Z[i][1] for i in range(len(y)) if y[i] in g1]
    bg_x = [Z[i][0] for i in range(len(y)) if y[i] in g2]
    bg_y = [Z[i][1] for i in range(len(y)) if y[i] in g2]
    tg_x,tg_y,bg_x, bg_y = bg_x, bg_y, tg_x, tg_y

    #plt.scatter(tg_x, tg_y, marker='^',label = "target", color='r')
    #plt.scatter(bg_x, bg_y, marker='o',label = "background", color='black')
    return contrastiveness

In [5]:
map_dataset = []
for i in range(len(partitions)):
    row = []
    rec = [l for l in label_set if l not in partitions[i]]
    c = Trials(partitions[i],rec)
    row = [ list(partitions[i]), rec ,c]
    map_dataset.append(row)

columns = ["T : {" + print_groups(x[1]) +  "} B : {" + print_groups(x[0]) + "}" for x in map_dataset]
values = [x[2] for x in map_dataset]


    

In [6]:
import csv
print("Contrastiveness")
row_list = [["Target and Background set", "Contrastiveness"]]
for i in range(len(columns)):
    row_list.append([columns[i] ,values[i]])

with open('glass_combinations.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(row_list)

Contrastiveness


In [7]:
def search_maximum(tg_base,bg_base):
    total = np.concatenate((tg_base , bg_base))
    rec = [l for l in label_set if l not in total]
    c = -1
    bg_base = np.concatenate((bg_base,np.array(rec)))
    #print(bg_base)
    while len(rec) != 0:
        e = -1
        for r in rec:
            tg = np.sort(np.append(np.array(copy.deepcopy(tg_base)),r))
            bg = np.delete(np.array(copy.deepcopy(bg_base)),np.where(np.array(copy.deepcopy(bg_base)) == r))
            c1 = Trials(bg,tg)
            if c1 > c:
                e = r
                c = c1
            #print(c1,tg,bg)
        if e == -1:
            return c, tg_base, bg_base
        rec.remove(e)
        tg_base = np.sort(np.append(tg_base,e))
        bg_base = np.delete(bg_base,np.where(bg_base == e))
    return c, tg_base, bg_base

def check_accuracy(tg_base,bg_base):
    ans = []
    for i in map_dataset:
        #print(i)
        if all(x in i[1] for x in tg_base) and all(x in i[0] for x in bg_base):
            row = [ i[1], i[0] ,i[2]]  
            ans.append(row)
    #print(ans)
    max_c = -1
    l1 = []
    l2 = []
    for i in ans:
        if max_c < i[2]:
            max_c = i[2]
            l1 = i[0]
            l2 = i[1]
    #print(tg_base, bg_base)
    c,t,b = search_maximum(tg_base, bg_base)
    print("The starting target set:" + print_groups(tg_base))
    print("The starting background set:" + print_groups(bg_base))    
    print("The output of algorithm:")
    print(list(t),list(b),c)
    print("The actual output")
    print(l1,l2,max_c)
    print("Their differences: " + str(abs(max_c - c)))
    print()

check_accuracy(np.array([1]),np.array([2]))
check_accuracy(np.array([5]),np.array([2,3]))
check_accuracy(np.array([1,2]),np.array([3,7]))

"""    
for i in range(len(partitions)):
    if len(partitions[i]) < len(label_set)-1:
        rec = [l for l in label_set if l not in partitions[i]]
        res = np.random.choice(rec, 1)
        check_accuracy(np.array(partitions[i]),np.array(res))
"""

The starting target set:1
The starting background set:2
The output of algorithm:
[1, 3, 4, 5] [2, 6, 7] 87.9108912274696
The actual output
[1, 3, 4, 5, 6] [2, 7] 87.91089122746759
Their differences: 2.0179413695586845e-12

The starting target set:5
The starting background set:2,3
The output of algorithm:
[1, 4, 5] [2, 3, 6, 7] 30.859658005447642
The actual output
[1, 4, 5, 6, 7] [2, 3] 30.859658005447617
Their differences: 2.4868995751603507e-14

The starting target set:1,2
The starting background set:3,7
The output of algorithm:
[1, 2, 4, 5] [3, 7, 6] 32.89700391535255
The actual output
[1, 2, 4, 5, 6] [3, 7] 32.89700391540545
Their differences: 5.289990667733946e-11



'    \nfor i in range(len(partitions)):\n    if len(partitions[i]) < len(label_set)-1:\n        rec = [l for l in label_set if l not in partitions[i]]\n        res = np.random.choice(rec, 1)\n        check_accuracy(np.array(partitions[i]),np.array(res))\n'

In [8]:
dataset = datasets.load_wine()

X = dataset.data
y = dataset.target
n_samples, n_features = X.shape
n_components = 2
X = preprocessing.scale(X)
label_set = [0,1,2]
partitions = return_partition(label_set)

In [9]:
map_dataset = []
for i in range(len(partitions)):
    row = []
    rec = [l for l in label_set if l not in partitions[i]]
    c = Trials(partitions[i],rec)
    row = [ list(partitions[i]), rec ,c]
    map_dataset.append(row)

columns = ["T : {" + print_groups(x[1]) +  "} B : {" + print_groups(x[0]) + "}" for x in map_dataset]
values = [x[2] for x in map_dataset]


In [10]:
import csv
print("Contrastiveness")
row_list = [["Target and Background set", "Contrastiveness"]]
for i in range(len(columns)):
    row_list.append([columns[i] ,values[i]])

with open('wine_combinations.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(row_list)

Contrastiveness


In [11]:
for i in range(len(partitions)):
    if len(partitions[i]) < len(label_set) - 1:
        rec = [l for l in label_set if l not in partitions[i]]
        res = np.random.choice(rec, 1)
        check_accuracy(np.array(partitions[i]),np.array(res))

The starting target set:0
The starting background set:1
The output of algorithm:
[0, 2] [1] 6.472079454534599
The actual output
[0, 2] [1] 6.472079454534487
Their differences: 1.1191048088221578e-13

The starting target set:1
The starting background set:2
The output of algorithm:
[0, 1] [2] 34.05370575578451
The actual output
[0, 1] [2] 34.053705755784534
Their differences: 2.1316282072803006e-14

The starting target set:2
The starting background set:0
The output of algorithm:
[1, 2] [0] 21.215761318817975
The actual output
[1, 2] [0] 21.2157613188183
Their differences: 3.232969447708456e-13

