In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import timeit
from src.mlmodel import *
import pickle
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
import src.utils
from torchvision import transforms
import torchvision.models as models
from torch import nn
import json
from sklearn.preprocessing import MaxAbsScaler

In [2]:
# List the contents of the hetrec2011-lastfm-2k data directory that the next cell reads from.
!ls ../ZSTL_Data/hetrec2011-lastfm-2k/

[31martists.dat[m[m                       [31muser_artists.dat[m[m
[34mextracted_feature[m[m                 [31muser_friends.dat[m[m
[31mreadme.txt[m[m                        [31muser_taggedartists-timestamps.dat[m[m
[31mtags.dat[m[m                          [31muser_taggedartists.dat[m[m


In [3]:
# Tab-separated input tables under the hetrec2011-lastfm-2k data directory.
path_user_artist = '../ZSTL_Data/hetrec2011-lastfm-2k/user_artists.dat'
path_user_artist_tag = '../ZSTL_Data/hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat'
path_user_friends = '../ZSTL_Data/hetrec2011-lastfm-2k/user_friends.dat'
path_artist = '../ZSTL_Data/hetrec2011-lastfm-2k/artists.dat'

# User/artist rows, followed by the number of distinct artist ids.
df_user_artist = pd.read_csv(path_user_artist, sep="\t")
print(df_user_artist, len(df_user_artist['artistID'].unique()))

# Tag assignments, followed by the distinct user, artist and tag counts.
df_user_artist_tag = pd.read_csv(path_user_artist_tag, sep="\t")
print(
    df_user_artist_tag,
    len(df_user_artist_tag['userID'].unique()),
    len(df_user_artist_tag['artistID'].unique()),
    len(df_user_artist_tag['tagID'].unique()),
)

# Friend relations, followed by the distinct user and friend counts.
df_user_friends = pd.read_csv(path_user_friends, sep="\t")
print(
    df_user_friends,
    len(df_user_friends['userID'].unique()),
    len(df_user_friends['friendID'].unique()),
)

# Artist table, its distinct id count, and a spot check of the row with id 14103.
df_artist = pd.read_csv(path_artist, sep="\t")
print(df_artist, len(df_artist['id'].unique()))
print(df_artist[df_artist['id'] == 14103])

userID  artistID  weight
0           2        51   13883
1           2        52   11690
2           2        53   11351
3           2        54   10300
4           2        55    8983
...       ...       ...     ...
92829    2100     18726     337
92830    2100     18727     297
92831    2100     18728     281
92832    2100     18729     280
92833    2100     18730     263

[92834 rows x 3 columns] 17632
        userID  artistID  tagID      timestamp
0            2        52     13  1238536800000
1            2        52     15  1238536800000
2            2        52     18  1238536800000
3            2        52     21  1238536800000
4            2        52     41  1238536800000
...        ...       ...    ...            ...
186474    2100     16437      4  1277935200000
186475    2100     16437    292  1272664800000
186476    2100     16437   2087  1277935200000
186477    2100     16437   2801  1272664800000
186478    2100     16437   3335  1277935200000

[186479 rows x 4 columns] 

In [4]:
# Matrix dimensions: distinct users (from the user/artist table), distinct
# artist ids (from the artist table) and distinct tag ids (from the tag table).
num_user = len(df_user_artist['userID'].unique())
num_artist = len(df_artist['id'].unique())
num_tag = len(df_user_artist_tag['tagID'].unique())
print(num_user, num_artist, num_tag)

1892 17632 9749


In [5]:
def genUserItem_table(df_user_artist, num_user, num_artist):
    """Build a binary user-by-artist interaction matrix.

    Rows of ``df_user_artist`` are read positionally: the first value is the
    user id and the second the artist id; any further columns are ignored.
    Each id is given the next free row (users) or column (artists) in order
    of first appearance.

    Parameters
    ----------
    df_user_artist : pandas.DataFrame
        One row per (user, artist) pair.
    num_user, num_artist : int
        Shape of the returned matrix.

    Returns
    -------
    tuple
        ``(y, user_to_row, artist_to_col)`` where ``y`` is a
        ``(num_user, num_artist)`` float array holding 1 for every pair seen,
        and the two dicts map ids to their row / column index in ``y``.
    """
    user_to_row = {}
    artist_to_col = {}

    y = np.zeros((num_user, num_artist))
    print('y shape ', y.shape)

    for _, record in df_user_artist.iterrows():
        fields = record.to_numpy()
        user_key = fields[0]
        artist_key = fields[1]

        # The next free index always equals the number of ids registered so far.
        if user_key in user_to_row:
            row = user_to_row[user_key]
        else:
            row = len(user_to_row)
            user_to_row[user_key] = row

        if artist_key in artist_to_col:
            col = artist_to_col[artist_key]
        else:
            col = len(artist_to_col)
            artist_to_col[artist_key] = col

        y[row, col] = 1

    filled = np.sum(y)
    sparse = 1 - filled/(num_user * num_artist)
    print('sparsity ', sparse, filled)
    # Entries are only ever set to 1, so this prints an empty array.
    print(y[y>1])
    print('i, j ', len(user_to_row), len(artist_to_col))
    return y, user_to_row, artist_to_col

# Build the interaction matrix together with the id -> row/column lookups.
y, userID_to_Y_row, artistID_to_X_row = genUserItem_table(
    df_user_artist=df_user_artist,
    num_user=num_user,
    num_artist=num_artist,
)

y shape  (1892, 17632)
sparsity  0.9972171848800758 92834.0
[]
i, j  1892 17632


In [6]:
def gen_X_n_Attr(df_user_artist_tag, df_user_friends, userID_to_Y_row, artistID_to_X_row, num_user, num_artist, num_tag):
    """Build the user attribute matrix ``a`` and the artist-by-tag count matrix ``x``.

    Both data frames are read positionally, not by column name.

    Parameters
    ----------
    df_user_artist_tag : pandas.DataFrame
        Tag assignments; the first three values of each row are read as
        user id, artist id and tag id. Further columns are ignored.
    df_user_friends : pandas.DataFrame
        Friend relations; the first two values of each row are read as
        user id and friend id.
    userID_to_Y_row : dict
        Maps a user id to its row in ``a``. Every user id appearing in
        ``df_user_friends`` must be a key, otherwise ``KeyError`` is raised.
    artistID_to_X_row : dict
        Maps an artist id to its row in ``x``. Tag assignments for artists
        missing from this mapping are not counted in ``x``.
    num_user, num_artist, num_tag : int
        ``a`` has shape ``(num_user, num_user)`` and ``x`` has shape
        ``(num_artist, num_tag)``.

    Returns
    -------
    tuple
        ``(a, X_tagRecord_byID, tag_to_x_col, x)``:

        * ``a`` -- ``a[r, c]`` is 1 when the user at row ``r`` lists the
          friend assigned to column ``c``. Columns are handed out to friend
          ids in order of first appearance; that friend -> column mapping is
          local to this function and is not returned.
        * ``X_tagRecord_byID`` -- user id -> list of ``(artist id, tag id)``
          tuples in input order (including artists absent from
          ``artistID_to_X_row``).
        * ``tag_to_x_col`` -- tag id -> column index in ``x``, in order of
          first appearance.
        * ``x`` -- number of times each tag was applied to each artist.
    """
    a = np.zeros((num_user, num_user))
    print('a shape ', a.shape)
    x = np.zeros((num_artist, num_tag))

    # Friend attributes. Iterating the frame's array gives the same per-row
    # values as iterrows() without constructing a Series for every row.
    friend_to_a_col = {}
    for user_friend in df_user_friends.to_numpy():
        cur_user = user_friend[0]
        friend = user_friend[1]
        # len() is evaluated before insertion, i.e. it is the next free column.
        cur_acol = friend_to_a_col.setdefault(friend, len(friend_to_a_col))
        a[userID_to_Y_row[cur_user], cur_acol] = 1

    # Per-user tag records and artist/tag counts.
    X_tagRecord_byID = {}
    tag_to_x_col = {}
    for user_artist_tag in df_user_artist_tag.to_numpy():
        cur_user = user_artist_tag[0]
        cur_artist = user_artist_tag[1]
        cur_tag = user_artist_tag[2]

        X_tagRecord_byID.setdefault(cur_user, []).append((cur_artist, cur_tag))

        # A tag gets a column even when its artist is unknown, so x can
        # contain all-zero columns.
        cur_xcol = tag_to_x_col.setdefault(cur_tag, len(tag_to_x_col))

        if cur_artist in artistID_to_X_row:
            x[artistID_to_X_row[cur_artist], cur_xcol] += 1

    return a, X_tagRecord_byID, tag_to_x_col, x

# Report how many artists have a row before building the attribute and tag matrices.
print('len ', len(artistID_to_X_row))
a, X_tagRecord_byID, tag_to_x_col, x = gen_X_n_Attr(
    df_user_artist_tag=df_user_artist_tag,
    df_user_friends=df_user_friends,
    userID_to_Y_row=userID_to_Y_row,
    artistID_to_X_row=artistID_to_X_row,
    num_user=num_user,
    num_artist=num_artist,
    num_tag=num_tag,
)

len  17632
a shape  (1892, 1892)


In [7]:
# Average number of (artist, tag) records per user.
# The total is kept under its own name: accumulating into `num_tag` would
# overwrite the tag-vocabulary size used as the width of `x`, so re-running
# the matrix-building cell afterwards would allocate the wrong shape.
num_tag_records = sum(len(records) for records in X_tagRecord_byID.values())
avg_tag = num_tag_records/len(X_tagRecord_byID)
print('avg_tag ', avg_tag)

# Mean row sum of the friend matrix `a`.
a_meanFriend = np.mean(np.sum(a, axis=1))
print('a_meanFriend ', a_meanFriend)

# Total tag count in `x` and the mean count per artist row.
print('x ', np.sum(x), np.mean(np.sum(x, axis=1)) )
# Number of tag columns of `x` that received no counts at all.
print(np.sum(np.sum(x, axis=0)==0))

avg_tag  98.56183932346723
a_meanFriend  13.44291754756871
x  184941.0 10.48894056261343
31


In [8]:
# Row sums of y (number of artists marked per user) and their summary statistics.
sum_y = y.sum(axis=1)
print('sum y ', sum_y.max(), sum_y.min(), sum_y.mean(), np.median(sum_y))

sum y  50.0 1.0 49.06659619450317 50.0


In [9]:
def genCompressedData(userID_to_Y_row, artistID_to_X_row, X_tagRecord_byID, y, a, destination, compressed_size=100):
    """Sample a fixed-size set of item indices per user and pickle the result.

    For every user, all column indices where the user's row of ``y`` equals 1
    are kept, and indices where it equals 0 are drawn without replacement
    until ``compressed_size`` indices are selected in total. Positive indices
    come first in the stored array.

    Sampling uses NumPy's global random state (``np.random.choice``); seed it
    beforehand if a reproducible file is required.

    Parameters
    ----------
    userID_to_Y_row : dict
        Maps a user id to its row index in ``y``.
    artistID_to_X_row : dict
        Accepted for signature compatibility; not used here.
    X_tagRecord_byID : dict
        Maps a user id to that user's tag records. Must contain every key of
        ``userID_to_Y_row``, otherwise ``KeyError`` is raised.
    y : numpy.ndarray
        2-D interaction matrix with 0/1 entries.
    a : numpy.ndarray
        Accepted for signature compatibility; not used here.
    destination : str
        Prefix (normally a directory path ending in a separator) to which
        ``'sampled_task_0826.pickle'`` is appended.
    compressed_size : int, optional
        Number of indices selected per user (default 100).

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If a user has more positive entries than ``compressed_size``, or
        fewer zero entries than are needed to fill the sample.
    """
    compressd_task_byID = {}
    for t_id, cur_indx in userID_to_Y_row.items():
        cur_y = y[cur_indx, :]
        pos_indx = np.where(cur_y == 1)[0]
        neg_indx = np.where(cur_y == 0)[0]

        # Fill up to the requested size (previously hard-coded to 100,
        # ignoring the compressed_size argument).
        num_neg = compressed_size - len(pos_indx)
        if num_neg < 0:
            raise ValueError(
                'user %r has %d positive items, more than compressed_size=%d'
                % (t_id, len(pos_indx), compressed_size)
            )
        neg_indx_selected = np.random.choice(neg_indx, size=num_neg, replace=False)
        data_indx_selected = np.concatenate([pos_indx, neg_indx_selected])

        cur_tag_record = X_tagRecord_byID[t_id]
        compressd_task_byID[t_id] = (cur_indx, data_indx_selected, cur_tag_record)

    with open(destination+'sampled_task_0826.pickle', 'wb') as handle:
        pickle.dump(compressd_task_byID, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return 0

# Output prefix for the pickled artefacts; the trailing slash matters because
# file names are appended by plain string concatenation.
destination = '../ZSTL_Data/hetrec2011-lastfm-2k/extracted_feature/'
genCompressedData(
    userID_to_Y_row=userID_to_Y_row,
    artistID_to_X_row=artistID_to_X_row,
    X_tagRecord_byID=X_tagRecord_byID,
    y=y,
    a=a,
    destination=destination,
)

0

In [10]:
def storeTestData( y, a, userID_to_Y_row, artistID_to_X_row, tag_to_x_col, destination):
    """Bundle the matrices and id lookups into one dict and pickle it.

    The payload holds ``y``, ``a`` and the three mappings under their own
    names, plus ``'num_artist'`` and ``'num_tag'`` taken from the sizes of
    ``artistID_to_X_row`` and ``tag_to_x_col``. It is written to
    ``destination + 'detailed_data_0826.pickle'``; ``a`` is also printed.

    Returns
    -------
    int
        Always 0.
    """
    print('a ', a)

    bundle = {
        'y': y,
        'a': a,
        'userID_to_Y_row': userID_to_Y_row,
        'artistID_to_X_row': artistID_to_X_row,
        'tag_to_x_col': tag_to_x_col,
        'num_artist': len(artistID_to_X_row.keys()),
        'num_tag': len(tag_to_x_col.keys()),
    }

    out_path = destination+'detailed_data_0826.pickle'
    with open(out_path, 'wb') as sink:
        pickle.dump(bundle, sink, protocol=pickle.HIGHEST_PROTOCOL)

    return 0
# Persist the full matrices and lookups next to the sampled-task file.
storeTestData(
    y=y,
    a=a,
    userID_to_Y_row=userID_to_Y_row,
    artistID_to_X_row=artistID_to_X_row,
    tag_to_x_col=tag_to_x_col,
    destination=destination,
)

a  [[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


0