In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import timeit
from mlmodel import *
import pickle
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
import utils
from torchvision import transforms
import torchvision.models as models
from torch import nn
import json
from sklearn.preprocessing import MaxAbsScaler

In [2]:
!ls ../ZSTL_Data/hetrec2011-lastfm-2k/

[31martists.dat[m[m                       [31muser_artists.dat[m[m
[34mextracted_feature[m[m                 [31muser_friends.dat[m[m
[31mreadme.txt[m[m                        [31muser_taggedartists-timestamps.dat[m[m
[31mtags.dat[m[m                          [31muser_taggedartists.dat[m[m


In [3]:
path_user_artist = '../ZSTL_Data/hetrec2011-lastfm-2k/user_artists.dat'
path_artist = '../ZSTL_Data/hetrec2011-lastfm-2k/artists.dat'
path_user_artist_tag = '../ZSTL_Data/hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat'
df_user_artist = pd.read_csv(path_user_artist, sep="\t")
print(df_user_artist, len(df_user_artist.artistID.unique()))
df_user_artist_tag = pd.read_csv(path_user_artist_tag, sep="\t")
print(df_user_artist_tag, len(df_user_artist_tag.userID.unique()),\
    len(df_user_artist_tag.artistID.unique()),len(df_user_artist_tag.tagID.unique()))

df_artist = pd.read_csv(path_artist, sep="\t")
print(df_artist, len(df_artist.id.unique()))
print(df_artist.loc[df_artist.id.eq(14103)])

userID  artistID  weight
0           2        51   13883
1           2        52   11690
2           2        53   11351
3           2        54   10300
4           2        55    8983
...       ...       ...     ...
92829    2100     18726     337
92830    2100     18727     297
92831    2100     18728     281
92832    2100     18729     280
92833    2100     18730     263

[92834 rows x 3 columns] 17632
        userID  artistID  tagID      timestamp
0            2        52     13  1238536800000
1            2        52     15  1238536800000
2            2        52     18  1238536800000
3            2        52     21  1238536800000
4            2        52     41  1238536800000
...        ...       ...    ...            ...
186474    2100     16437      4  1277935200000
186475    2100     16437    292  1272664800000
186476    2100     16437   2087  1277935200000
186477    2100     16437   2801  1272664800000
186478    2100     16437   3335  1277935200000

[186479 rows x 4 columns] 

In [4]:
num_user = len(df_user_artist.userID.unique())
num_artist = len(df_artist.id.unique())
num_tag = len(df_user_artist_tag.tagID.unique())
print(num_user, num_artist, num_tag)

1892 17632 9749


In [5]:
def genUserItem_table(df_user_artist, num_user, num_artist):
    artistID_to_X_row = {}
    userID_to_Y_row = {}

    y = np.zeros((num_user, num_artist))
    print('y shape ', y.shape)
    i = 0
    j = 0
    for r in df_user_artist.iterrows():
        #print(r[0])
        #print(r[1].to_numpy())
        user_artist_count = r[1].to_numpy()
        #print(user_artist_count)
        if user_artist_count[0] not in userID_to_Y_row.keys():
            userID_to_Y_row[user_artist_count[0]] = i
            cur_i = i
            i += 1
        else:
            cur_i = userID_to_Y_row[user_artist_count[0]]

        if user_artist_count[1] not in artistID_to_X_row.keys():
            artistID_to_X_row[user_artist_count[1]] = j
            cur_j = j
            j += 1
        else:
            cur_j = artistID_to_X_row[user_artist_count[1]]
        
        #print(cur_i, cur_j)
        y[cur_i, cur_j] = 1

    sparse = 1 - np.sum(y)/(num_user * num_artist)
    print('sparsity ', sparse, np.sum(y))
    print(y[y>1])
    print('i, j ', i ,j)
    return y, userID_to_Y_row, artistID_to_X_row

y, userID_to_Y_row, artistID_to_X_row = genUserItem_table(df_user_artist, num_user, num_artist)

y shape  (1892, 17632)
sparsity  0.9972171848800758 92834.0
[]
i, j  1892 17632


In [6]:
def gen_X_n_Attr(df_user_artist_tag, userID_to_Y_row, artistID_to_X_row, num_user, num_artist, num_tag):
    tag_to_X_col = {}
    tagged_artist_to_a_col = {}
    xcol = 0
    acol = 0
    num_tag_artist = len(df_user_artist_tag.artistID.unique())

    x = np.zeros((num_artist, num_tag))
    print('x shape ', x.shape)
    a = np.zeros((num_user, num_tag_artist))
    print('a shape ', a.shape)

    not_shown_artist = []

    for r in df_user_artist_tag.iterrows():
        #print(r[0])
        #print(r[1].to_numpy())
        user_artist_tag = r[1].to_numpy()
        
        if user_artist_tag[1] not in tagged_artist_to_a_col:
            tagged_artist_to_a_col[user_artist_tag[1]] = acol
            cur_acol = acol
            acol += 1
        else:
            cur_acol = tagged_artist_to_a_col[user_artist_tag[1]]
        cur_arow = userID_to_Y_row[user_artist_tag[0]]
        a[cur_arow, cur_acol] += 1

            
        if user_artist_tag[2] not in tag_to_X_col:
            tag_to_X_col[user_artist_tag[2]] = xcol
            cur_xcol = xcol
            xcol += 1
        else:
            cur_xcol = tag_to_X_col[user_artist_tag[2]] 

        if user_artist_tag[1] in artistID_to_X_row:
            cur_xrow = artistID_to_X_row[user_artist_tag[1]]
            x[cur_xrow, cur_xcol] += 1
        
        #a = ppp
    ones = np.ones((x.shape[0], 1))
    x = np.concatenate([ones, x], axis=1)
    print('check sum, a', np.sum(a), a.shape)
    print('check sum, x', np.sum(x), x.shape)
    idx = np.argwhere(np.all(a[..., :] == 0, axis=0))
    a = np.delete(a, idx, axis=1)
    idx = np.argwhere(np.all(x[..., :] == 0, axis=0))
    x = np.delete(x, idx, axis=1)
    print('check sum, a', np.sum(a), a.shape)
    print('check sum, x', np.sum(x), x.shape)


    #standarize 
    # transformer = MaxAbsScaler().fit(a)
    # a_MaxAbsScaled = transformer.transform(a)
    # print('a_MaxAbsScaled ', np.sum(np.max(a_MaxAbsScaled, axis=0)), np.sum(np.max(a_MaxAbsScaled, axis=1)))

    a_colSum = np.sum(a, axis=1)
    a_colNormalize = a/np.expand_dims(a_colSum, axis=1)
    print('a_colNormalize ', np.sum(a_colNormalize, axis=0), np.sum(a_colNormalize, axis=1), np.sum(a_colNormalize))

    #normalize


    #print('not shown artist ', len(not_shown_artist), not_shown_artist)
    return x, a, a

print('len ', len(artistID_to_X_row))
x, a, a_normalize = gen_X_n_Attr(df_user_artist_tag, userID_to_Y_row, artistID_to_X_row, num_user, num_artist, num_tag)

len  17632
x shape  (17632, 9749)
a shape  (1892, 12523)
check sum, a 71064.0 (1892, 12523)
check sum, x 202573.0 (17632, 9750)
check sum, a 71064.0 (1892, 12523)
check sum, x 202573.0 (17632, 9719)


In [7]:
#print(x, list(np.sum(x, axis=0)))
temp = list(np.sum(x, axis=0))
print(sum([i == 0  for i in temp]))

#print(a, list(np.sum(a, axis=0)))
temp = list(np.sum(a, axis=0))
print(sum([i == 0  for i in temp]))
print(np.mean(np.sum(a, axis=1)), a)

0
0
37.56025369978858 [[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 1.]]


In [8]:
sum_y = np.sum(y, axis=1)
print('sum y ', np.max(sum_y), np.min(sum_y), np.mean(sum_y), np.median(sum_y))
# for i in sum_y:
#     print(i)

sum y  50.0 1.0 49.06659619450317 50.0


In [10]:
def genCompressedData(userID_to_Y_row, artistID_to_X_row, x, y, a, compressed_size=100):
    compressd_task_byID = {}
    i = 0
    for t_id in userID_to_Y_row.keys():
        #print('t ', t_id, userID_to_Y_row[t_id])
        
        cur_indx = userID_to_Y_row[t_id]
        cur_y = y[cur_indx, :]
        #print('y ', cur_y)
        pos_indx = np.where(cur_y == 1)
        neg_indx = np.where(cur_y == 0)
        #print('pos ', len(pos_indx[0]), pos_indx[0])
        #print('neg ', len(neg_indx[0]), neg_indx[0])
        neg_indx_selected = np.random.choice(neg_indx[0], size=100-len(pos_indx[0]), replace=False)
        #print('neg select ', len(neg_indx_selected), neg_indx_selected)
        data_indx_selected = np.concatenate([pos_indx[0], neg_indx_selected])
        #print('indx selected ', len(indx_selected), indx_selected )

        task_y = cur_y[data_indx_selected]
        task_x = [np.expand_dims(x[i,:], axis=0) for i in data_indx_selected]
        task_x = np.concatenate(task_x, axis=0)
        #print('task_y ', task_y.shape)
        #print('task_x ', task_x.shape)

        clf = LogisticRegression(fit_intercept = False, max_iter=1000,random_state=0).fit(task_x, task_y)
        pred_y = clf.predict(task_x)
        #print('pred_y ', pred_y.shape, 'task_y ', task_y.shape)
        print('acc ', np.sum(pred_y==task_y)/task_y.shape[0])

        param = clf.coef_

        #print('param ', param.shape)

        #compressd_task_byID[str(t_id)] = (a[cur_indx, :], param, task_x, np.atleast_2d(task_y).T)
        compressd_task_byID[t_id] = (cur_indx, param, data_indx_selected)


        # i+= 1
        # if i == 3:
        #     a = pppp

    # with open(destination+'sampled_task_binaryTag.pickle', 'wb') as handle:
    #     pickle.dump(compressd_task_byID, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return 0 

destination = '../ZSTL_Data/hetrec2011-lastfm-2k/extracted_feature/'
genCompressedData(userID_to_Y_row, artistID_to_X_row, x, y, a_normalize, destination)

acc  0.98
acc  0.84
acc  0.95
acc  0.96
acc  0.92
acc  0.97
acc  0.94
acc  0.98
acc  0.99
acc  0.95
acc  1.0
acc  1.0
acc  0.85
acc  0.96
acc  0.89
acc  0.96
acc  0.96
acc  0.93
acc  0.99
acc  0.92
acc  0.97
acc  0.97
acc  0.99
acc  0.94
acc  0.96
acc  0.96
acc  0.99
acc  0.98
acc  0.96
acc  0.96
acc  0.95
acc  0.93
acc  0.97
acc  0.99
acc  0.93
acc  0.99
acc  0.9
acc  0.98
acc  0.98
acc  0.95
acc  1.0
acc  0.99
acc  0.95
acc  0.99
acc  1.0
acc  0.92
acc  0.88
acc  0.99
acc  1.0
acc  0.96
acc  0.93
acc  0.98
acc  0.92
acc  0.88
acc  0.98
acc  0.95
acc  1.0
acc  0.89
acc  0.97
acc  0.98
acc  1.0
acc  0.97
acc  0.95
acc  1.0
acc  0.98
acc  0.95
acc  0.99
acc  0.93
acc  1.0
acc  0.99
acc  0.91
acc  0.99
acc  0.9
acc  0.95
acc  0.93
acc  0.96
acc  0.89
acc  0.75
acc  0.88
acc  0.92
acc  0.99
acc  0.91
acc  0.98
acc  1.0
acc  0.89
acc  0.98
acc  0.98
acc  0.93
acc  0.9
acc  0.99
acc  0.98
acc  0.93
acc  0.94
acc  0.98
acc  0.98
acc  0.98
acc  0.97
acc  0.96
acc  0.97
acc  0.99
acc  0.99
acc

0

In [11]:
def storeTestData(x, y, a, userID_to_Y_row, artistID_to_X_row, destination):
    totData = {}
    totData['x'] = x
    totData['y'] = y
    totData['a'] = a
    totData['userID_to_Y_row'] = userID_to_Y_row
    totData['artistID_to_X_row'] = artistID_to_X_row

    with open(destination+'detailed_data_binaryTag.pickle', 'wb') as handle:
        pickle.dump(totData, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return 0
storeTestData(x, y, a_normalize, userID_to_Y_row, artistID_to_X_row, destination)

0

In [12]:
from torch.utils.data import DataLoader
def genSplits_hectrec(compressed_data, detailed_data, train_size, test_size, support_size, train_batch_size=100):

    task_id = list(compressed_data.keys())
    tot_len = len(task_id)

    support_indx = list(np.random.choice(task_id, size=support_size, replace=False))
    print(len(support_indx))
    temp = [x for x in task_id if x not in support_indx]
    train_indx = list(np.random.choice(temp, size=train_size, replace=False))
    temp = [x for x in temp if x not in train_indx]
    print(len(train_indx))
    test_indx = temp
    print(len(test_indx))

    support_data = utils.Dataset_hetrec([compressed_data[d] for d in support_indx], detailed_data)
    train_data = utils.Dataset_hetrec([compressed_data[d] for d in train_indx], detailed_data)
    test_data = utils.Dataset_hetrec([compressed_data[d] for d in test_indx], detailed_data)

    support_loader = DataLoader(support_data, batch_size=support_size, shuffle=False)
    train_loader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=test_size, shuffle=True)

    return support_loader, train_loader, test_loader


In [13]:

path_data = '../ZSTL_Data/hetrec2011-lastfm-2k/extracted_feature/'
compressed_data = 'sampled_task.pickle'
detailed_data = 'detailed_data.pickle'

with open(path_data+compressed_data, 'rb') as f:
    compressed_dataset = pickle.load(f)

with open(path_data+detailed_data, 'rb') as f:
    detailed_dataset = pickle.load(f)


In [15]:
total_len = len(compressed_dataset)
print('tot_len ', total_len, compressed_dataset[2])
support_size = 150
test_size = int(total_len*0.2)
train_size = int(total_len - support_size - test_size)
support_loader, train_loader, test_loader = genSplits(compressed_dataset, detailed_dataset, train_size, test_size, support_size)

tot_len  1892 (0, array([[-1.72784869,  0.50704983,  0.30139832, ...,  0.        ,
         0.        ,  0.        ]]), array([    0,     1,     2,     3,     4,     5,     6,     7,     8,
           9,    10,    11,    12,    13,    14,    15,    16,    17,
          18,    19,    20,    21,    22,    23,    24,    25,    26,
          27,    28,    29,    30,    31,    32,    33,    34,    35,
          36,    37,    38,    39,    40,    41,    42,    43,    44,
          45,    46,    47,    48,    49, 10556, 14767,  1654,  9267,
        7242,  3435,  5862,  8099,  3307,  3973,  9003,  7807, 17251,
       16162, 14519,  1073,  4214,  1393, 12544,  9680, 10355,  3158,
        2921, 12854,  3927,  6533, 13634, 10059,  3989,  7949, 14261,
        4394,  6692, 15157,  2093,  2457,  3482, 16200, 15700,  8771,
       15223, 17363,  7165,  7816,   926, 16911,   574,   409,  5483,
        7275]))


NameError: name 'genSplits' is not defined

In [None]:
print(type(support_loader))
support_a, support_w, support_x, support_y = next(iter(support_loader))
support_a, support_w, support_x, support_y = support_a.float(), support_w.float(), support_x.float(), support_y.float()
print(support_a.shape, support_w.shape, support_x.shape, support_y.shape)

NameError: name 'np' is not defined