In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import timeit
from mlmodel import *
import pickle
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
import utils

In [2]:
# Seed NumPy's global RNG first so every later np.random call is repeatable.
np.random.seed(0)

# Experiment size: how many tasks to build and how many rows to sample per task.
num_task = 50
num_data = 100

# Input / output locations, relative to the notebook's working directory.
path_attributes = '../ZSTL_Data/Animals_with_Attributes2/predicate-matrix-binary.txt'
path_destination = '../ZSTL_Data/Animals_with_Attributes2/splitedTask/'
path_feature = '../ZSTL_Data/Animals_with_Attributes2/Features/ResNet101/AwA2-features.txt'
path_labels = '../ZSTL_Data/Animals_with_Attributes2/Features/ResNet101/AwA2-labels.txt'

# Both files are read as space-separated text without a header row.
data_labels = pd.read_csv(path_labels, header=None, sep=" ")
data_labels.columns = ["label"]
data_feature = pd.read_csv(path_feature, header=None, sep=" ")

In [3]:
# Column-wise concat: the 'label' column first, then the feature columns.
data = pd.concat((data_labels, data_feature), axis="columns")
print(data)

label         0         1         2         3         4         5  \
0          1  0.127028  3.236108  0.935148  0.144205  1.114897  1.502288   
1          1  0.000000  2.466911  0.026454  0.075211  1.159094  3.066645   
2          1  0.383341  1.011904  0.000000  0.054472  0.343532  0.917366   
3          1  0.117190  1.225786  0.001932  0.000000  3.135732  0.061605   
4          1  0.274902  0.337544  0.084937  0.000000  1.788061  0.143165   
...      ...       ...       ...       ...       ...       ...       ...   
37317     38  0.498370  1.883775  0.000000  0.212900  0.195262  0.201184   
37318     38  0.043884  0.309244  0.012275  0.173839  0.893198  0.183430   
37319     38  0.016755  1.105690  0.103399  0.384196  0.469869  0.512281   
37320     38  0.121401  1.050093  0.006921  0.545237  0.884461  0.738782   
37321     38  0.243461  1.255479  0.048471  0.660457  0.147349  1.431781   

              6         7         8  ...      2038      2039      2040  \
0      0.410044  0.0

In [4]:
def generate_compressed_data(data, num_task, num_data):
    '''
    Rebuild the label/feature table as a fresh DataFrame.

    data: dataframe with labels (col 0) and features (col 1:)
    num_task, num_data: accepted for the call signature but not used here.

    The PCA step is commented out, so the features pass through unchanged
    and the timer only covers that pass-through and its print.

    Returns a DataFrame whose first column is named 'label' and whose
    remaining columns are named 0..n_features-1. All values go through a
    single NumPy array.
    '''
    values = data.to_numpy()
    labels_row = np.atleast_2d(values[:, 0])   # shape (1, n_rows)
    features = values[:, 1:]
    print(features.shape)

    t_begin = timeit.default_timer()
    # pca = PCA(n_components=1024)
    # data_x_compress = pca.fit_transform(data_x)
    compressed = features
    print('data_x_compress ',compressed.shape, compressed)
    t_end = timeit.default_timer()
    print('Time:', t_end - t_begin)

    # Transpose the label row into a column and glue it in front of the features.
    column_names = ['label'] + list(range(compressed.shape[1]))
    data_compressed = pd.DataFrame(
        np.hstack((labels_row.T, compressed)),
        columns=column_names,
    )
    print(data_compressed)
    return data_compressed

# Build the label-first table used by the later cells.
data_compressed = generate_compressed_data(data=data, num_task=num_task, num_data=num_data)

(37322, 2048)
data_x_compress  (37322, 2048) [[1.27028410e-01 3.23610830e+00 9.35147520e-01 ... 6.72237160e-01
  3.22649500e-02 4.07616200e-01]
 [0.00000000e+00 2.46691060e+00 2.64539600e-02 ... 1.12520206e+00
  6.84306000e-03 2.56229610e-01]
 [3.83341340e-01 1.01190424e+00 0.00000000e+00 ... 2.25265384e+00
  2.95226900e-02 3.31381500e-02]
 ...
 [1.67553000e-02 1.10568988e+00 1.03398520e-01 ... 1.88486860e-01
  8.19240000e-04 3.53541340e-01]
 [1.21401340e-01 1.05009258e+00 6.92100000e-03 ... 5.05729500e-02
  0.00000000e+00 2.25771200e-02]
 [2.43460600e-01 1.25547945e+00 4.84714400e-02 ... 1.23365400e-01
  1.57233700e-02 6.21905400e-02]]
Time: 0.003762139999992087
       label         0         1         2         3         4         5  \
0        1.0  0.127028  3.236108  0.935148  0.144205  1.114897  1.502288   
1        1.0  0.000000  2.466911  0.026454  0.075211  1.159094  3.066645   
2        1.0  0.383341  1.011904  0.000000  0.054472  0.343532  0.917366   
3        1.0  0.117190  

In [5]:
def generate_split(splits, num_data):
    """Randomly partition the positions 0..num_data-1 into train and test.

    Args:
        splits: mapping that must contain 'train', the number of positions
            to draw for training. Any 'test' entry is NOT read: the test
            split is always "everything that was not drawn for training".
        num_data: total number of positions to partition.

    Returns:
        (train_indx, test_indx): two lists. train_indx holds the randomly
        drawn positions (in draw order); test_indx holds the remaining
        positions in ascending order.

    Raises:
        ValueError: propagated from np.random.choice when splits['train']
            exceeds num_data (sampling without replacement).

    Draws from NumPy's global RNG, so results depend on np.random.seed.
    """
    indx = list(range(num_data))
    train_indx = list(np.random.choice(indx, size=splits['train'], replace=False))

    # Set lookup keeps the complement filter linear instead of quadratic.
    drawn = set(train_indx)
    test_indx = [x for x in indx if x not in drawn]
    return train_indx, test_indx

def _build_binary_task_rows(task_rows, other_rows, positions, task_id):
    """Stack selected task rows over selected comparison rows for one task.

    Takes the rows at `positions` from both frames, appends a constant
    column of ones on the right, and rewrites column 0 to 1.0 where the
    original label equals `task_id` and 0.0 elsewhere.
    """
    stacked = np.vstack((task_rows.iloc[positions].to_numpy(),
                         other_rows.iloc[positions].to_numpy()))
    stacked = np.hstack((stacked, np.ones((len(positions) * 2, 1))))
    stacked[:, 0] = np.where(stacked[:, 0] == task_id, 1.0, 0.0)
    return stacked


def task_data_split(data, num_task, num_data, splits, random_state=1):
    """Build one binary (one-vs-rest style) dataset per task id 1..num_task.

    For every task id the function samples `num_data` rows whose label
    equals the id and `num_data` rows from the whole table, splits the row
    positions with `generate_split`, and builds train/test arrays.

    Args:
        data: DataFrame with a 'label' column first, then feature columns.
        num_task: number of tasks; task ids run from 1 to num_task.
        num_data: rows sampled per task and per comparison sample.
        splits: passed through to `generate_split`.
        random_state: currently unused (sampling is seeded with the
            0-based task index); kept for backward compatibility.

    Returns:
        (task_train_byID, task_test_byID): dicts keyed by task id. Each
        value is a 2-D array laid out as
        [binary label | features... | constant 1.0 column].
    """
    task_train_byID = {}
    task_test_byID = {}

    for task_id in range(1, num_task + 1):
        seed = task_id - 1  # same per-task seed as the 0-based loop index
        task_data = data.loc[data['label'].eq(task_id)]

        sampled_task_data = task_data.sample(n=num_data, random_state=seed)
        # NOTE: the comparison sample is drawn from the WHOLE table, so it
        # can also contain rows of this task. They are labelled 1.0 like
        # any other row of the task, which is why the positive count can
        # exceed num_data.
        sampled_data = data.sample(n=num_data, random_state=seed)

        train_indx, test_indx = generate_split(splits, num_data)

        task_train_byID[task_id] = _build_binary_task_rows(
            sampled_task_data, sampled_data, train_indx, task_id)
        print(task_train_byID[task_id].shape, np.sum(task_train_byID[task_id][:,0]))

        task_test_byID[task_id] = _build_binary_task_rows(
            sampled_task_data, sampled_data, test_indx, task_id)
        print(task_test_byID[task_id].shape, np.sum(task_test_byID[task_id][:,0]))

    print('tot task ', len(task_train_byID))
    return task_train_byID, task_test_byID


# Train on all 100 sampled positions per task; with num_data = 100 that
# leaves every task's test split empty.
splits = {'train': 100, 'test': 0}
task_train_byID, task_test_byID = task_data_split(
    data_compressed, num_task, num_data, splits
)

(200, 2050) 104.0
(0, 2050) 0.0
(200, 2050) 101.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 102.0
(0, 2050) 0.0
(200, 2050) 102.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 104.0
(0, 2050) 0.0
(200, 2050) 103.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 101.0
(0, 2050) 0.0
(200, 2050) 102.0
(0, 2050) 0.0
(200, 2050) 103.0
(0, 2050) 0.0
(200, 2050) 103.0
(0, 2050) 0.0
(200, 2050) 101.0
(0, 2050) 0.0
(200, 2050) 101.0
(0, 2050) 0.0
(200, 2050) 101.0
(0, 2050) 0.0
(200, 2050) 102.0
(0, 2050) 0.0
(200, 2050) 102.0
(0, 2050) 0.0
(200, 2050) 101.0
(0, 2050) 0.0
(200, 2050) 103.0
(0, 2050) 0.0
(200, 2050) 104.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 102.0
(0, 2050) 0.0
(200, 2050) 104.0
(0, 2050) 0.0
(200, 2050) 100.0
(0, 2050) 0.0
(200, 2050) 102.0
(0, 2050) 0.0
(200, 2050) 101.0
(0, 2050) 0.0
(200, 2050) 104.0
(0, 2050) 0.0
(200, 20

In [6]:
# Functional network holding a single FLinearLayer(1, False).
# NOTE(review): both classes presumably come from `from mlmodel import *`;
# the meaning of the (1, False) arguments is not visible here - confirm.
net = FuncRecursiveNet([FLinearLayer(1, False)])

In [7]:
# Test the coefficients learned by logistic regression on a single task.

def singleTaskTrain(task_train, task_test):
    """Fit a logistic regression for one task and report its accuracy.

    Args:
        task_train: 2-D array; column 0 is the binary target and the
            remaining columns are the features.
        task_test: array with the same layout. May be empty (zero rows)
            or None, in which case accuracy is measured on the training
            rows instead.

    Returns:
        (param, accuracy): `param` is the fitted `coef_` array and
        `accuracy` is the fraction of correct predictions as a float.
        It is measured on `task_test` when that has rows, otherwise on
        `task_train`.

    No intercept is fitted (fit_intercept=False); any bias term has to be
    present as a feature column (task_data_split appends a column of ones).
    """
    X = task_train[:, 1:]
    y = task_train[:, 0]

    clf = LogisticRegression(fit_intercept = False, max_iter=100, C=1.0, random_state=0).fit(X, y)

    # Prefer held-out rows; fall back to the training rows when the test
    # split is empty so the accuracy is still well defined.
    if task_test is not None and len(task_test) > 0:
        X_eval, y_eval = task_test[:, 1:], task_test[:, 0]
    else:
        X_eval, y_eval = X, y

    pred_y_test = clf.predict(X_eval)
    accuracy = float(np.mean(pred_y_test == y_eval))

    param = clf.coef_
    print('pred_y_test ', accuracy, param.shape)

    # Return the measured accuracy rather than a constant, so that callers
    # averaging it get a meaningful number.
    return param, accuracy

# Smoke test on task 1. Note: `weight` receives the whole returned tuple,
# not just the coefficient array.
weight = singleTaskTrain(task_train=task_train_byID[1], task_test=task_test_byID[1])

pred_y_test  1.0 (1, 2049)


In [8]:
# Initialise the network from task 1's feature columns (label column dropped)
# and record the shape of every resulting parameter.
init_param = net.initialize_weights(
    utils.toTensor(task_train_byID[1][:, 1:])
)
p_lst = [param.size() for param in init_param]
print(p_lst)

# Map parameter position -> single-element list holding its shape as a tuple.
shape_record = {}
for i, p in enumerate(p_lst):
    shape_record[i] = [tuple(p)]
print(shape_record)

[torch.Size([1, 2049])]
{0: [(1, 2049)]}


In [9]:

# net_pred_y_test = net(reshape_w, xtest)
# print('net_pred_y_test ', net_pred_y_test.shape)
# net_pred_y_test = torch.sigmoid(net_pred_y_test )
# net_pred_y_test[net_pred_y_test >= 0.5] = 1.
# net_pred_y_test[net_pred_y_test < 0.5] = 0.
# print((utils.toNumpy(net_pred_y_test) == np.atleast_2d(pred_y_test).T).all())


In [10]:
def gen_attr(path_attributes):
    """Read a whitespace-separated attribute matrix, one task per line.

    Args:
        path_attributes: path to a text file in which every non-blank line
            holds the numeric attributes of one task.

    Returns:
        dict mapping the 1-based line number (counting non-blank lines
        only) to a 1-D float array with that line's attributes.

    Raises:
        ValueError: if a token cannot be converted to float.

    The file is read line by line instead of with
    pd.read_csv(..., sep="\\n"), which current pandas versions reject.
    """
    with open(path_attributes) as handle:
        rows = [line.rstrip('\r\n') for line in handle]
    rows = [row for row in rows if row.strip()]  # blank lines carry no task

    # One string per row in a single column, kept only for the printout.
    data_attributes = pd.DataFrame(rows)
    print(data_attributes)

    task_attr_byID = {}
    for task_id, row in enumerate(rows, start=1):
        tokens = row.split()
        print(len(tokens))
        task_attr_byID[task_id] = np.array([float(a) for a in tokens])
        print(task_attr_byID[task_id].shape)
    return task_attr_byID

def gen_attr_zScore(path_attributes):
    """Read the attribute matrix and return one attribute row per task.

    Despite the name, NO z-score normalisation is applied: that step
    (subtract the per-column mean, divide by the per-column standard
    deviation) is disabled, so the raw attribute values are returned.

    Args:
        path_attributes: path to a text file in which every non-blank line
            holds the whitespace-separated numeric attributes of one task.
            All lines must have the same number of values.

    Returns:
        dict mapping the 1-based row number to a 1-D float array (a row of
        the stacked attribute matrix).

    Raises:
        ValueError: if a token is not numeric, if rows differ in length,
            or if the file contains no rows.

    The file is read line by line instead of with
    pd.read_csv(..., sep="\\n"), which current pandas versions reject.
    """
    with open(path_attributes) as handle:
        rows = [line.rstrip('\r\n') for line in handle]
    rows = [row for row in rows if row.strip()]  # blank lines carry no task

    # One string per row in a single column, kept only for the printout.
    data_attributes = pd.DataFrame(rows)
    print(data_attributes)

    lst = [np.atleast_2d(np.array([float(a) for a in row.split()])) for row in rows]
    attr_mat = np.concatenate(lst, axis=0)
    print('attr_mat ', attr_mat.shape, attr_mat)

    # Disabled normalisation, kept for reference:
    #   attr_z = (attr_mat - np.mean(attr_mat, axis=0)) / np.std(attr_mat, axis=0)

    task_attr_byID = {t + 1: attr_mat[t, :] for t in range(attr_mat.shape[0])}

    print(list(task_attr_byID.keys()))
    return task_attr_byID

# Per-task attribute vectors, keyed by 1-based task id.
task_attr_byID = gen_attr_zScore(path_attributes=path_attributes)

0
0   0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 ...
1   1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 ...
2   1 1 0 0 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 0 0 0 ...
3   0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 ...
4   1 1 0 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 0 1 ...
5   0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 1 ...
6   1 1 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0 0 ...
7   1 0 0 1 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 ...
8   0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 ...
9   1 1 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 1 ...
10  1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 1 ...
11  1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 1 ...
12  1 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 1 ...
13  0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 ...
14  1 0 0 1 0 0 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 0 1 ...
15  0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0 ...
16  1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 ...
17  1 0 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 0 ...
18  0 0 0 0 1 0 0 0 0 0 0 

In [11]:
def ZSTL_train_test_val(num_task, task_train_byID, task_test_byID, task_attr_byID, destination):
    """Train one model per task and pickle the per-task train/test bundles.

    For every task id 1..num_task the function calls `singleTaskTrain` and
    stores, for both the train and the test array, the tuple
    (attributes, weight, features, labels-as-column-vector).

    Args:
        num_task: number of tasks; ids run from 1 to num_task.
        task_train_byID / task_test_byID: dicts of 2-D arrays with the
            label in column 0 and the features in the remaining columns.
        task_attr_byID: dict of per-task attribute arrays.
        destination: string prefix the two output file names are appended
            to (plain concatenation, so a directory needs a trailing
            separator).

    Returns:
        None. Writes 'task_train_data_binary.pickle' and
        'task_test_data_binary.pickle' and prints the mean of the second
        value returned by `singleTaskTrain`. No validation split is
        produced, despite the function name.
    """
    task_train_data, task_test_data = {}, {}
    acc = 0

    for task_id in range(1, num_task + 1):
        train_rows = task_train_byID[task_id]
        test_rows = task_test_byID[task_id]

        weight, a = singleTaskTrain(train_rows, test_rows)
        acc += a

        attributes = task_attr_byID[task_id]
        # The same weight object is shared by the train and test bundles.
        task_train_data[task_id] = (
            attributes, weight, train_rows[:, 1:], np.atleast_2d(train_rows[:, 0]).T)
        task_test_data[task_id] = (
            attributes, weight, test_rows[:, 1:], np.atleast_2d(test_rows[:, 0]).T)

    print(len(task_train_data))
    print(len(task_test_data))

    outputs = (
        ('task_train_data_binary.pickle', task_train_data),
        ('task_test_data_binary.pickle', task_test_data),
    )
    for file_name, payload in outputs:
        with open(destination+file_name, 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('mean acc ', acc/num_task)



# Train every task and write the pickled train/test bundles to path_destination.
ZSTL_train_test_val(
    num_task=num_task,
    task_train_byID=task_train_byID,
    task_test_byID=task_test_byID,
    task_attr_byID=task_attr_byID,
    destination=path_destination,
)

pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
pred_y_test  1.0 (1, 2049)
p