# Sort Data

## 0. Preliminary

In [2]:
%matplotlib inline

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import os
import sys

from ast import literal_eval

""
sns.set_context("notebook", font_scale=1.5)
plt.rcParams['figure.figsize'] = (17, 5)

import platform
if 'Windows' in platform.platform():
    ROOT_PATH = "D:/PycharmProjects/HMAN"
else:
    ROOT_PATH = "/home/xkliu/PycharmProjects/HMAN"
RAW_DATA_PATH = ROOT_PATH  + "/raw_data"
DATA_PATH = ROOT_PATH + "/data"
os.chdir(ROOT_PATH)
sys.path.append("./")

from kddirkit.utils import utils


## 1. Load Data

In [3]:
track_dtype = {'track_id': int, 'album_id':int, 'album_type':str, 'artist_id':int, 'set_split':str, 'set_subset':str, 'track_genre_top':str, 'track_genres':str, 'track_genres_all':str,'track_title':str}
genres_converters = {'track_genres': literal_eval, 'track_genres_all': literal_eval}

In [4]:
small_data = pd.read_csv(RAW_DATA_PATH+'/small_data.csv',  converters=genres_converters)
small_data_train= pd.read_csv(RAW_DATA_PATH+'/small_data_train.csv', converters=genres_converters)
small_data_test= pd.read_csv(RAW_DATA_PATH+'/small_data_test.csv', converters=genres_converters)
small_data_val= pd.read_csv(RAW_DATA_PATH+'/small_data_val.csv',converters=genres_converters)

In [5]:
medium_data= pd.read_csv(RAW_DATA_PATH+'/medium_data.csv',  converters=genres_converters)
medium_data_train= pd.read_csv(RAW_DATA_PATH+'/medium_data_train.csv',  converters=genres_converters)
medium_data_test= pd.read_csv(RAW_DATA_PATH+'/medium_data_test.csv',  converters=genres_converters)
medium_data_val= pd.read_csv(RAW_DATA_PATH+'/medium_data_val.csv',converters=genres_converters)

In [6]:
large_data= pd.read_csv(RAW_DATA_PATH+'/large_data.csv',  converters=genres_converters)
large_data_train= pd.read_csv(RAW_DATA_PATH+'/large_data_train.csv',  converters=genres_converters)
large_data_test= pd.read_csv(RAW_DATA_PATH+'/large_data_test.csv',  converters=genres_converters)
large_data_val= pd.read_csv(RAW_DATA_PATH+'/large_data_val.csv',  converters=genres_converters)

In [7]:
small_data

Unnamed: 0,track_id,album_id,album_type,artist_id,set_split,set_subset,track_genre_top,track_genres,track_genres_all,track_title
0,2,1,Album,1,training,small,Hip-Hop,[21],[21],Food
1,5,1,Album,1,training,small,Hip-Hop,[21],[21],This World
2,10,6,Album,6,training,small,Pop,[10],[10],Freeway
3,140,61,Album,54,training,small,Folk,[17],[17],Queen Of The Wires
4,141,60,Album,54,training,small,Folk,[17],[17],Ohio
...,...,...,...,...,...,...,...,...,...,...
7995,154308,22780,Album,23208,test,small,Hip-Hop,"[21, 539, 811]","[811, 539, 21]",MIA
7996,154309,22780,Album,23208,test,small,Hip-Hop,"[21, 539, 811]","[811, 539, 21]",A1 Symphony
7997,154413,22789,Live Performance,24252,training,small,Pop,[76],"[10, 76]",Do Easy
7998,154414,22789,Live Performance,24252,training,small,Pop,[76],"[10, 76]",Dead Can Dance (uncensored)


In track_genres column, genres_all's sequence is turbulent.

## 2. Sort Data

In [8]:
# use_data = 'small_data'
# if use_data == 'small_data':
#     data = small_data
#     data_train = small_data_train
#     data_test = small_data_test
#     data_val = small_data_val
# elif use_data == 'medium_data':
#     data = small_data
#     data_train = medium_data_train
#     data_test = medium_data_test
#     data_val = medium_data_val
# elif use_data == 'large_data':
#     data = large_data
#     data_train = large_data_train
#     data_test = large_data_test
#     data_val = large_data_val

In [9]:
genre_id2id = {}
genre_str2id = {}

In [10]:
def init_genre():
    # reading relation ids...
#     global genre_id2id
#     print('reading genre id2id...')
#     f = open(RAW_DATA_PATH + "/fma_metadata/genre_id2id.csv","r")
#     total = (int)(f.readline().strip())
#     for i in range(total):
#         content = f.readline().strip().split(',')
#         genre_id2id[content[0]] = int(content[1])
#     f.close()
    
    global genre_str2id
    print('reading genre str2ids...')
    f = open(RAW_DATA_PATH + "/fma_metadata/genre_str2id.csv","r")
    total = (int)(f.readline().strip())
    for i in range(total):
        content = f.readline().strip().split(',')
        genre_str2id[content[0]] = int(content[1])
    f.close()
init_genre()

reading genre str2ids...


In [11]:
genre_str2id

{'NA': 0,
 'Avant-Garde': 1,
 'International': 2,
 'Blues': 3,
 'Jazz': 4,
 'Classical': 5,
 'Novelty': 6,
 'Comedy': 7,
 'Old-Time / Historic': 8,
 'Country': 9,
 'Pop': 10,
 'Disco': 11,
 'Rock': 12,
 'Easy Listening': 13,
 'Soul-RnB': 14,
 'Electronic': 15,
 'Sound Effects': 16,
 'Folk': 17,
 'Soundtrack': 18,
 'Funk': 19,
 'Spoken': 20,
 'Hip-Hop': 21,
 'Audio Collage': 22,
 'Punk': 25,
 'Post-Rock': 26,
 'Lo-Fi': 27,
 'Field Recordings': 30,
 'Metal': 31,
 'Noise': 32,
 'Psych-Folk': 33,
 'Krautrock': 36,
 'Jazz: Vocal': 37,
 'Experimental': 38,
 'Electroacoustic': 41,
 'Ambient Electronic': 42,
 'Radio Art': 43,
 'Loud-Rock': 45,
 'Latin America': 46,
 'Drone': 47,
 'Free-Folk': 49,
 'Noise-Rock': 53,
 'Psych-Rock': 58,
 'Bluegrass': 63,
 'Electro-Punk': 64,
 'Radio': 65,
 'Indie-Rock': 66,
 'Industrial': 70,
 'No Wave': 71,
 'Free-Jazz': 74,
 'Experimental Pop': 76,
 'French': 77,
 'Reggae - Dub': 79,
 'Afrobeat': 81,
 'Nerdcore': 83,
 'Garage': 85,
 'Indian': 86,
 'New Wave': 8

In [12]:
for indexs in large_data.index:
    origin_data = large_data.loc[indexs].values[0:-1]
origin_data
origin_data[0]
",".join(str(x) for x in origin_data.tolist()) +'\n'

'155314,22940,Live Performance,24357,training,medium,Rock,[25],[25, 12]\n'

In [13]:
# ss = ""
# for s in origin_data:
#     ss += str(s)
# ss

In [14]:
def sort_files(name):
    data = {}
    if name == 'small_data':
        data = small_data
    elif name == 'medium_data':
        data = medium_data
    elif name == 'large_data':
        data = large_data
    hash = {}
    s = 0
    for indexs in data.index:
        origin_data = data.loc[indexs].values[0:-1]
        album_id = data.loc[indexs].values[1]
        artist_id =  data.loc[indexs].values[3]
#         print(album_id, artist_id, data.loc[indexs].values[7])
        genre_name = data.loc[indexs].values[6]
        if  genre_name in genre_str2id:
            genre_id = genre_str2id[genre_name]
        else:
            genre_id = genre_str2id['NA']
#         if data.loc[indexs].values[8]:
#             genre_id =  data.loc[indexs].values[8][-1] 
#         else:
#             genre_id = 0
        s = s + 1
        id = str(album_id)+"#"+str(artist_id)+"#"+str(genre_id)
        if not id in hash:
            hash[id] = []
        hash[id].append("-----".join(str(x) for x in origin_data.tolist()) +'\n')
    f = open(RAW_DATA_PATH + '/' + name + "_sort.txt", "w", encoding="utf-8")
    f.write("%d\n"%(s))
    for i in hash:
        for j in hash[i]:
            f.write(j)
    f.close()

In [15]:
sort_files('small_data')
sort_files('medium_data')
sort_files('large_data')

### 2.1 Sort Training Data 

In [16]:
def sort_train_files(name):
    data = {}
    if name == 'small_data_train':
        data = small_data_train
    elif name == 'medium_data_train':
        data = medium_data_train
    elif name == 'large_data_train':
        data = large_data_train
    hash = {}
    s = 0
    for indexs in data.index:
        origin_data = data.loc[indexs].values[0:-1]
        album_id = data.loc[indexs].values[1]
        artist_id =  data.loc[indexs].values[3]
#         print(album_id, artist_id, data.loc[indexs].values[7])
        genre_name = data.loc[indexs].values[6]
        if  genre_name in genre_str2id:
            genre_id = genre_str2id[genre_name]
        else:
            genre_id = genre_str2id['NA']
#         if data.loc[indexs].values[8]:
#             genre_id =  data.loc[indexs].values[8][-1] 
#         else:
#             genre_id = 0
        s = s + 1
        id = str(album_id)+"#"+str(artist_id)+"#"+str(genre_id)
        if not id in hash:
            hash[id] = []
        hash[id].append("-----".join(str(x) for x in origin_data.tolist()) +'\n')
    f = open(RAW_DATA_PATH + '/' + name + "_sort.txt", "w")
    f.write("%d\n"%(s))
    for i in hash:
        for j in hash[i]:
            f.write(j)
    f.close()

In [17]:
sort_train_files('small_data_train')
sort_train_files('medium_data_train')
sort_train_files('large_data_train')

### 2.2 Sort Validation Data 

In [18]:
def sort_val_files(name):
    data = {}
    if name == 'small_data_val':
        data = small_data_val
    elif name == 'medium_data_val':
        data = medium_data_val
    elif name == 'large_data_val':
        data = large_data_val
    hash = {}
    s = 0
    for indexs in data.index:
        origin_data = data.loc[indexs].values[0:-1]
        album_id = data.loc[indexs].values[1]
        artist_id =  data.loc[indexs].values[3]
#         print(album_id, artist_id, data.loc[indexs].values[7])
        genre_name = data.loc[indexs].values[6]
        if  genre_name in genre_str2id:
            genre_id = genre_str2id[genre_name]
        else:
            genre_id = genre_str2id['NA']
#         if data.loc[indexs].values[8]:
#             genre_id =  data.loc[indexs].values[8][-1] 
#         else:
#             genre_id = 0
        s = s + 1
        id = str(album_id)+"#"+str(artist_id)+"#"+str(genre_id)
        if not id in hash:
            hash[id] = []
        hash[id].append("-----".join(str(x) for x in origin_data.tolist()) +'\n')
    f = open(RAW_DATA_PATH + '/' + name + "_sort.txt", "w")
    f.write("%d\n"%(s))
    for i in hash:
        for j in hash[i]:
            f.write(j)
    f.close()

In [19]:
sort_val_files('small_data_val')
sort_val_files('medium_data_val')
sort_val_files('large_data_val')

### 2.3 Sort Testing Data

In [20]:
def sort_test_files(name):
    data = {}
    if name == 'small_data_test':
        data = small_data_test
    elif name == 'medium_data_test':
        data = medium_data_test
    elif name == 'large_data_test':
        data = large_data_test
    hash = {}
    s = 0
    for indexs in data.index:
        origin_data = data.loc[indexs].values[0:-1]
        album_id = data.loc[indexs].values[1]
        artist_id =  data.loc[indexs].values[3]
#         print(album_id, artist_id, data.loc[indexs].values[7])
        genre_name = data.loc[indexs].values[6]
        if  genre_name in genre_str2id:
            genre_id = genre_str2id[genre_name]
        else:
            genre_id = genre_str2id['NA']
#         if data.loc[indexs].values[8]:
#             genre_id =  data.loc[indexs].values[8][-1] 
#         else:
#             genre_id = 0
        s = s + 1
        id = str(album_id)+"#"+str(artist_id)
        if not id in hash:
            hash[id] = []
        hash[id].append("-----".join(str(x) for x in origin_data.tolist()) +'\n')
    f = open(RAW_DATA_PATH + '/' + name + "_sort.txt", "w")
    f.write("%d\n"%(s))
    for i in hash:
        for j in hash[i]:
            f.write(j)
    f.close()

In [21]:
sort_test_files('small_data_test')
sort_test_files('medium_data_test')
sort_test_files('large_data_test')

## 3. Init Files

In [22]:
def init_files(name):
    print('reading ' + name +' data...')
    f = open(RAW_DATA_PATH + '/'+ name + '.txt','r')
    total = (int)(f.readline().strip())
    print(total)
    sen_len = np.zeros((total), dtype = np.int32)
    sen_label = np.zeros((total), dtype = np.int32)
    sen_label_bottom = np.zeros((total), dtype = np.int32)
    instance_scope = []
    instance_triple = []
    for s in range(total):
        content = f.readline().strip().split('-----')
        album_id = content[1]
        artist_id = content[3]
#         print(content)
        genre_name = content[6]
        if  genre_name in genre_str2id:
            genre_id = genre_str2id[genre_name]
        else:
            genre_id = genre_str2id['NA']
        genre_id_bottom = literal_eval(content[8])
        if genre_id_bottom:
            genre_id_bottom =  genre_id_bottom[-1] 
        else:
            genre_id_bottom = 0
        sen_label[s] = genre_id
        sen_label_bottom[s] = genre_id_bottom
        tup = (album_id,artist_id,genre_id)
        if instance_triple == [] or instance_triple[len(instance_triple) - 1] != tup:
            instance_triple.append(tup)
            instance_scope.append([s,s])
        instance_scope[len(instance_triple) - 1][1] = s
#         if (s+1) % 100 == 0:
#             sys.stdout.write(str(s)+'\r')
#             sys.stdout.flush()
    return np.array(instance_triple), np.array(instance_scope), sen_label, sen_label_bottom

In [23]:
%time
small_instance_triple, small_instance_scope, small_label, small_label_bottom = init_files("small_data_sort")
medium_instance_triple, medium_instance_scope, medium_label, medium_label_bottom = init_files("medium_data_sort")
large_instance_triple, large_instance_scope, large_label, large_label_bottom = init_files("large_data_sort")

CPU times: total: 0 ns
Wall time: 0 ns
reading small_data_sort data...
8000
reading medium_data_sort data...
25000
reading large_data_sort data...
106574


In [24]:
np.save(DATA_PATH+'/' + 'small_instance_triple', small_instance_triple)
np.save(DATA_PATH+'/' + 'small_instance_scope', small_instance_scope)
np.save(DATA_PATH+'/' + 'small_label', small_label)
np.save(DATA_PATH+'/' + 'small_label_bottom', small_label_bottom)
np.save(DATA_PATH+'/' + 'medium_instance_triple', medium_instance_triple)
np.save(DATA_PATH+'/' + 'medium_instance_scope', medium_instance_scope)
np.save(DATA_PATH+'/' + 'medium_label', medium_label)
np.save(DATA_PATH+'/' + 'medium_label_bottom', medium_label_bottom)
np.save(DATA_PATH+'/' + 'large_instance_triple', large_instance_triple)
np.save(DATA_PATH+'/' + 'large_instance_scope', large_instance_scope)
np.save(DATA_PATH+'/' + 'large_label', large_label)
np.save(DATA_PATH+'/' + 'large_label_bottom', large_label_bottom)

### 3.1 Init Training Data

In [25]:
# def init_train_files(name):
#     print('reading ' + name +' data...')
#     f = open(RAW_DATA_PATH + '/'+ name + '.txt','r')
#     total = (int)(f.readline().strip())
#     print(total)
#     sen_len = np.zeros((total), dtype = np.int32)
#     sen_label = np.zeros((total), dtype = np.int32)
#     instance_scope = []
#     instance_triple = []
#     for s in range(total):
#         content = f.readline().strip().split('-----')
#         album_id = content[1]
#         artist_id = content[3]
# #         print(content)'
#         genre_name = content[6]
#         if  genre_name in genre_str2id:
#             genre_id = genre_str2id[genre_name]
#         else:
# #             print(genre_name)
#             genre_id = genre_str2id['NA']
#         genre_id = literal_eval(content[7])
#         if genre_id:
#             genre_id =  genre_id[0] 
#         else:
#             genre_id = 0
#         sen_label[s] = genre_id
#         tup = (album_id,artist_id,genre_id)
#         if instance_triple == [] or instance_triple[len(instance_triple) - 1] != tup:
#             instance_triple.append(tup)
#             instance_scope.append([s,s])
#         instance_scope[len(instance_triple) - 1][1] = s
# #         if (s+1) % 100 == 0:
# #             sys.stdout.write(str(s)+'\r')
# #             sys.stdout.flush()
#     return np.array(instance_triple), np.array(instance_scope), sen_label

In [26]:
%time
small_instance_triple_train, small_instance_scope_train, small_label_train, small_label_bottom_train = init_files("small_data_train_sort")
medium_instance_triple_train, medium_instance_scope_train, medium_label_train, medium_label_bottom_train = init_files("medium_data_train_sort")
large_instance_triple_train, large_instance_scope_train, large_label_train, large_label_bottom_train = init_files("large_data_train_sort")

CPU times: total: 0 ns
Wall time: 0 ns
reading small_data_train_sort data...
6400
reading medium_data_train_sort data...
19922
reading large_data_train_sort data...
84353


In [27]:
# large_instance_triple_train, large_instance_scope_train, large_label_train = init_train_files("large_data_train_sort")

In [28]:
np.save(DATA_PATH+'/' + 'small_train_instance_triple', small_instance_triple_train)
np.save(DATA_PATH+'/' + 'small_train_instance_scope', small_instance_scope_train)
np.save(DATA_PATH+'/' + 'small_train_label', small_label_train)
np.save(DATA_PATH+'/' + 'small_label_train_bottom', small_label_bottom_train)
np.save(DATA_PATH+'/' + 'medium_train_instance_triple', medium_instance_triple_train)
np.save(DATA_PATH+'/' + 'medium_train_instance_scope', medium_instance_scope_train)
np.save(DATA_PATH+'/' + 'medium_train_label', medium_label_train)
np.save(DATA_PATH+'/' + 'medium_label_bottom_train', medium_label_bottom_train)
np.save(DATA_PATH+'/' + 'large_train_instance_triple', large_instance_triple_train)
np.save(DATA_PATH+'/' + 'large_train_instance_scope', large_instance_scope_train)
np.save(DATA_PATH+'/' + 'large_train_label', large_label_train)
np.save(DATA_PATH+'/' + 'large_label_bottom_train', large_label_bottom_train)

### 3.2 Init Validation Data

In [29]:
# def init_val_files(name):
#     print('reading ' + name +' data...')
#     f = open(RAW_DATA_PATH + '/'+ name + '.txt','r')
#     total = (int)(f.readline().strip())
#     print(total)
#     sen_len = np.zeros((total), dtype = np.int32)
#     sen_label = np.zeros((total), dtype = np.int32)
#     instance_scope = []
#     instance_triple = []
#     for s in range(total):
#         content = f.readline().strip().split('-----')
#         album_id = content[1]
#         artist_id = content[3]
# #         print(content)
#         genre_name = content[6]
#         if  genre_name in genre_str2id:
#             genre_id = genre_str2id[genre_name]
#         else:
#             genre_id = genre_str2id['NA']
# #         genre_id = literal_eval(content[8])
# #         if genre_id:
# #             genre_id =  genre_id[-1] 
# #         else:
# #             genre_id = 0
#         sen_label[s] = genre_id
#         tup = (album_id,artist_id,genre_id)
#         if instance_triple == [] or instance_triple[len(instance_triple) - 1] != tup:
#             instance_triple.append(tup)
#             instance_scope.append([s,s])
#         instance_scope[len(instance_triple) - 1][1] = s
# #         if (s+1) % 100 == 0:
# #             sys.stdout.write(str(s)+'\r')
# #             sys.stdout.flush()
#     return np.array(instance_triple), np.array(instance_scope), sen_label

In [30]:
%time
small_instance_triple_val, small_instance_scope_val, small_label_val, small_label_bottom_val = init_files("small_data_val_sort")
medium_instance_triple_val, medium_instance_scope_val, medium_label_val, medium_label_bottom_val = init_files("medium_data_val_sort")
large_instance_triple_val, large_instance_scope_val, large_label_val, large_label_bottom_val = init_files("large_data_val_sort")

CPU times: total: 0 ns
Wall time: 0 ns
reading small_data_val_sort data...
800
reading medium_data_val_sort data...
2505
reading large_data_val_sort data...
10958


In [31]:
np.save(DATA_PATH+'/' + 'small_val_instance_triple', small_instance_triple_val)
np.save(DATA_PATH+'/' + 'small_val_instance_scope', small_instance_scope_val)
np.save(DATA_PATH+'/' + 'small_val_label', small_label_val)
np.save(DATA_PATH+'/' + 'small_label_bottom_val', small_label_bottom_val)
np.save(DATA_PATH+'/' + 'medium_val_instance_triple', medium_instance_triple_val)
np.save(DATA_PATH+'/' + 'medium_val_instance_scope', medium_instance_scope_val)
np.save(DATA_PATH+'/' + 'medium_val_label', medium_label_val)
np.save(DATA_PATH+'/' + 'medium_label_bottom_val', medium_label_bottom_val)
np.save(DATA_PATH+'/' + 'large_val_instance_triple', large_instance_triple_val)
np.save(DATA_PATH+'/' + 'large_val_instance_scope', large_instance_scope_val)
np.save(DATA_PATH+'/' + 'large_val_label', large_label_val)
np.save(DATA_PATH+'/' + 'large_label_bottom_val', large_label_bottom_val)

### 3.3 Init Testing Data

In [32]:
def init_test_files(name):
    print('reading ' + name +' data...')
    f = open(RAW_DATA_PATH + '/'+ name + '.txt','r')
    total = (int)(f.readline().strip())
    print(total)
    sen_label = np.zeros((total), dtype = np.int32)
    sen_label_bottom = np.zeros((total), dtype = np.int32)
    entity_pair = []
    entity_scope = []
    for s in range(total):
        content = f.readline().strip().split('-----')
        album_id = content[1]
        artist_id = content[3]
#         print(content)
        genre_name = content[6]
        if  genre_name in genre_str2id:
            genre_id = genre_str2id[genre_name]
        else:
#             print(genre_name)
            genre_id = genre_str2id['NA']
        genre_id_bottom = literal_eval(content[8])
        if genre_id_bottom:
            genre_id_bottom =  genre_id_bottom[-1] 
        else:
            genre_id_bottom = 0
        sen_label[s] = genre_id
        sen_label_bottom[s] = genre_id_bottom
        pair = (album_id,artist_id)
        if entity_pair == [] or entity_pair[-1] != pair:
            entity_pair.append(pair)
            entity_scope.append([s,s])
        entity_scope[-1][1] = s
#         if (s+1) % 100 == 0:
#             sys.stdout.write(str(s)+'\r')
#             sys.stdout.flush()
    return np.array(entity_pair), np.array(entity_scope),  sen_label, sen_label_bottom

In [33]:
%time
small_instance_triple_test, small_instance_scope_test, small_label_test, small_label_bottom_test = init_test_files("small_data_test_sort")
medium_instance_triple_test, medium_instance_scope_test, medium_label_test, medium_label_bottom_test = init_test_files("medium_data_test_sort")
large_instance_triple_test, large_instance_scope_test, large_label_test, large_label_bottom_test = init_test_files("large_data_test_sort")

CPU times: total: 0 ns
Wall time: 0 ns
reading small_data_test_sort data...
800
reading medium_data_test_sort data...
2573
reading large_data_test_sort data...
11263


In [34]:
np.save(DATA_PATH+'/' + 'small_test_entity_pair', small_instance_triple_test)
np.save(DATA_PATH+'/' + 'small_test_entity_scope', small_instance_scope_test)
np.save(DATA_PATH+'/' + 'small_test_label', small_label_test)
np.save(DATA_PATH+'/' + 'small_label_bottom_test', small_label_bottom_test)
np.save(DATA_PATH+'/' + 'medium_test_entity_pair', medium_instance_triple_test)
np.save(DATA_PATH+'/' + 'medium_test_entity_scope', medium_instance_scope_test)
np.save(DATA_PATH+'/' + 'medium_test_label', medium_label_test)
np.save(DATA_PATH+'/' + 'medium_label_bottom_test', medium_label_bottom_test)
np.save(DATA_PATH+'/' + 'large_test_entity_pair', large_instance_triple_test)
np.save(DATA_PATH+'/' + 'large_test_entity_scope', large_instance_scope_test)
np.save(DATA_PATH+'/' + 'large_test_label', large_label_test)
np.save(DATA_PATH+'/' + 'large_label_bottom_test', large_label_bottom_test)

## 4. Transform Data

### 4.1 initialize bag label for test


In [35]:
from collections import Counter

#### Small

In [36]:
small_label = np.load(DATA_PATH + '/' + 'small_test_label.npy')
small_scope = np.load(DATA_PATH + '/' + 'small_test_entity_scope.npy')
Counter(small_label)

Counter({12: 100,
         2: 100,
         17: 100,
         38: 100,
         1235: 100,
         10: 100,
         21: 100,
         15: 100})

In [37]:
small_label = np.load(DATA_PATH + '/' + 'small_test_label.npy')
small_scope = np.load(DATA_PATH + '/' + 'small_test_entity_scope.npy')
small_label[small_label == 12] = 0
small_label[small_label == 2] = 1
small_label[small_label == 17] = 2
small_label[small_label == 38] = 3
small_label[small_label == 1235] = 4
small_label[small_label == 10] = 5
small_label[small_label == 21] = 6
small_label[small_label == 15] = 7
Counter(small_label)

Counter({0: 100, 1: 100, 2: 100, 3: 100, 4: 100, 5: 100, 6: 100, 7: 100})

In [38]:
small_all_true_label = np.zeros((small_scope.shape[0], np.max(small_label)+1))
for pid in range(small_scope.shape[0]):
    small_all_true_label[pid][small_label[small_scope[pid][0]:small_scope[pid][1]+1]] = 1
small_all_true_label = np.reshape(small_all_true_label[:, 1:], -1)
np.save(DATA_PATH + '/'  + 'small_all_true_label.npy', small_all_true_label)

In [39]:
small_all_true_label

array([0., 0., 0., ..., 0., 1., 0.])

#### Medium

##### Transform Label

In [40]:
transform_medium_dict = {12:0, 2:1, 17:2, 38:3, 1235:4, 10:5, 21:6, 15:7, 14:8, 3:9, 8:10, 4:11, 9:12, 5:13, 20:14, 13:15}

In [41]:
def transform_medium_label(label_pd):
    for i in range(len(label_pd)):
        label_pd[i] = transform_medium_dict.get(label_pd[i])
    return label_pd

In [42]:
medium_label = np.load(DATA_PATH + '/' + 'medium_test_label.npy')
medium_scope = np.load(DATA_PATH + '/' + 'medium_test_entity_scope.npy')
sorted(Counter(medium_label).keys())

[2, 3, 4, 5, 8, 9, 10, 12, 13, 14, 15, 17, 20, 21, 38, 1235]

In [43]:
medium_label_transform = transform_medium_label(np.load(DATA_PATH+'/' + 'medium_label.npy'))
medium_train_label_transform = transform_medium_label( np.load(DATA_PATH+'/' + 'medium_train_label.npy'))
medium_val_label_transform = transform_medium_label(np.load(DATA_PATH+'/' + 'medium_val_label.npy'))
medium_test_label_transform = transform_medium_label(np.load(DATA_PATH+'/' + 'medium_test_label.npy'))

In [44]:
np.save(DATA_PATH+'/' + "medium_label_transform", medium_label_transform)
np.save(DATA_PATH+'/' + "medium_train_label_transform", medium_train_label_transform)
np.save(DATA_PATH+'/' + "medium_val_label_transform", medium_val_label_transform)
np.save(DATA_PATH+'/' + "medium_test_label_transform", medium_test_label_transform)
len(medium_label_transform), len(medium_train_label_transform), len(medium_val_label_transform), len(medium_test_label_transform)

(25000, 19922, 2505, 2573)

In [45]:
len(medium_label), len(medium_label_train), len(medium_label_val), len(medium_label_test)

(2573, 19922, 2505, 2573)

In [46]:
medium_label_test_tramsform = np.load(DATA_PATH + '/' + 'medium_test_label_transform.npy')
medium_scope = np.load(DATA_PATH + '/' + 'medium_test_entity_scope.npy')
sorted(Counter(medium_label_test_tramsform).keys())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [47]:
medium_scope

array([[   0,    0],
       [   1,    6],
       [   7,    9],
       ...,
       [2568, 2569],
       [2570, 2571],
       [2572, 2572]])

In [48]:
medium_all_true_label_transform = np.zeros((medium_scope.shape[0], np.max(medium_label_test_tramsform)+1))
for pid in range(small_scope.shape[0]):
    medium_all_true_label_transform[pid][medium_label_test_tramsform[medium_scope[pid][0]:medium_scope[pid][1]+1]] = 1
medium_all_true_label_transform

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [49]:
medium_all_true_label_transform.shape

(750, 16)

In [50]:
medium_all_true_label_transform = np.reshape(medium_all_true_label_transform[:, :], -1)
np.save(DATA_PATH + '/'  + 'medium_all_true_label_transform.npy', medium_all_true_label_transform)

In [51]:
medium_all_true_label_transform

array([1., 0., 0., ..., 0., 0., 0.])

#### Large

In [52]:
transform_large_dict = {0: 0, 12:1, 2:2, 17:3, 38:4, 1235:5, 10:6, 21:7, 15:8, 14:9, 3:10, 8:11, 4:12, 9:13, 5:14, 20:15, 13:16}

In [53]:
def transform_large_label(label_pd):
    for i in range(len(label_pd)):
        label_pd[i] = transform_large_dict.get(label_pd[i])
    return label_pd

In [54]:
large_label = np.load(DATA_PATH + '/' + 'large_test_label.npy')
large_scope = np.load(DATA_PATH + '/' + 'large_test_entity_scope.npy')
sorted(Counter(large_label).keys())

[0, 2, 3, 4, 5, 8, 9, 10, 12, 13, 14, 15, 17, 20, 21, 38, 1235]

In [55]:
Counter(large_label)

Counter({12: 1464,
         0: 6312,
         38: 1085,
         15: 839,
         3: 13,
         8: 55,
         1235: 309,
         17: 299,
         9: 18,
         10: 204,
         21: 323,
         2: 128,
         5: 87,
         20: 31,
         4: 47,
         14: 43,
         13: 6})

In [56]:
large_label_transform = transform_large_label(np.load(DATA_PATH + '/' + 'large_label.npy'))
large_train_label_transform = transform_large_label(np.load(DATA_PATH + '/' + 'large_train_label.npy'))
large_val_label_transform = transform_large_label(np.load(DATA_PATH + '/' + 'large_val_label.npy'))
large_test_label_transform = transform_large_label(np.load(DATA_PATH + '/' + 'large_test_label.npy'))
np.save(DATA_PATH + '/' + "large_label_transform", large_label_transform)
np.save(DATA_PATH + '/' + "large_train_label_transform", large_train_label_transform)
np.save(DATA_PATH + '/' + "large_val_label_transform", large_val_label_transform)
np.save(DATA_PATH + '/' + "large_test_label_transform", large_test_label_transform)
len(large_label_transform), len(large_train_label_transform), len(large_val_label_transform), len(large_test_label_transform)

(106574, 84353, 10958, 11263)

In [57]:
len(large_label), len(large_label_train), len(large_label_val), len(large_label_test)
large_label_test_tramsform = np.load(DATA_PATH + '/' + 'large_test_label_transform.npy')
large_scope = np.load(DATA_PATH + '/' + 'large_test_entity_scope.npy')
sorted(Counter(large_label_test_tramsform).keys())
large_scope
large_all_true_label_transform = np.zeros((large_scope.shape[0], np.max(large_label_test_tramsform) + 1))
for pid in range(small_scope.shape[0]):
    large_all_true_label_transform[pid][large_label_test_tramsform[large_scope[pid][0]:large_scope[pid][1] + 1]] = 1
large_all_true_label_transform
large_all_true_label_transform = np.reshape(large_all_true_label_transform[:, :], -1)
np.save(DATA_PATH + '/' + 'large_all_true_label_transform.npy', large_all_true_label_transform)
large_all_true_label_transform

array([0., 1., 0., ..., 0., 0., 0.])

### 4.2 Get Bottom ID

#### Medium

In [58]:
medium_label_bottom = np.load(DATA_PATH + '/' + 'medium_label_bottom.npy')
medium_scope = np.load(DATA_PATH + '/' + 'medium_test_entity_scope.npy')
# sorted(Counter(medium_label_botto|m).keys())

In [59]:
transform_csv = pd.read_csv(RAW_DATA_PATH + '/' + 'genre2id.csv')
orig_id = transform_csv.orig_id.to_list()
transform_id = transform_csv.transform_id.to_list()

In [60]:
transform_medium_dict = {}
for i in zip(orig_id, transform_id):
    transform_medium_dict[i[0]] = i[1]
# transform_dict

In [61]:
def transform_medium_label_via_dict(label_pd, transform_medium_dict):
    for i in range(len(label_pd)):
        label_pd[i] = transform_medium_dict[label_pd[i]]
    return label_pd

In [62]:
medium_label_bottom_transform = transform_medium_label_via_dict(np.load(DATA_PATH+'/' + 'medium_label_bottom.npy'), transform_medium_dict)
medium_train_label_bottom_transform = transform_medium_label_via_dict( np.load(DATA_PATH+'/' + 'medium_label_bottom_train.npy'), transform_medium_dict)
medium_val_label_bottom_transform = transform_medium_label_via_dict(np.load(DATA_PATH+'/' + 'medium_label_bottom_val.npy'), transform_medium_dict)
medium_test_label_bottom_transform = transform_medium_label_via_dict(np.load(DATA_PATH+'/' + 'medium_label_bottom_test.npy'), transform_medium_dict)

KeyError: 46

In [None]:
np.save(DATA_PATH+'/' + "medium_label_bottom_transform", medium_label_bottom_transform)
np.save(DATA_PATH+'/' + "medium_train_label_bottom_transform", medium_train_label_bottom_transform)
np.save(DATA_PATH+'/' + "medium_val_label_bottom_transform", medium_val_label_bottom_transform)
np.save(DATA_PATH+'/' + "medium_test_label_bottom_transform", medium_test_label_bottom_transform)
len(medium_label_bottom_transform), len(medium_train_label_bottom_transform), len(medium_val_label_bottom_transform), len(medium_test_label_bottom_transform)

In [None]:
medium_label_bottom

In [None]:
sorted(list(Counter(medium_label_bottom_transform).keys()))

#### Large

In [None]:
large_label_bottom = np.load(DATA_PATH + '/' + 'large_label_bottom.npy')
large_scope = np.load(DATA_PATH + '/' + 'large_test_entity_scope.npy')
sorted(Counter(large_label_bottom).keys())

In [None]:
transform_csv = pd.read_csv(RAW_DATA_PATH + '/' + 'genre2id-large.csv')

In [None]:
orig_id = transform_csv.orig_id.to_list()
transform_id = transform_csv.transform_id.to_list()

In [None]:
transform_large_dict = {}
for i in zip(orig_id, transform_id):
    transform_large_dict[i[0]] = i[1]
# transform_dict
transform_large_dict

In [None]:
def transform_large_label_via_dict(label_pd, transform_large_dict):
    for i in range(len(label_pd)):
        print(label_pd[i])
        label_pd[i] = transform_large_dict[label_pd[i]]
    return label_pd

In [None]:
np.load(DATA_PATH+'/' + 'large_label_bottom.npy')

In [None]:
# large_label_bottom_transform = transform_large_label_via_dict(np.load(DATA_PATH+'/' + 'large_label_bottom.npy'), transform_large_dict)
# large_train_label_bottom_transform = transform_large_label_via_dict( np.load(DATA_PATH+'/' + 'large_label_bottom_train.npy'), transform_large_dict)
# large_val_label_bottom_transform = transform_large_label_via_dict(np.load(DATA_PATH+'/' + 'large_label_bottom_val.npy'), transform_large_dict)
# large_test_label_bottom_transform = transform_large_label_via_dict(np.load(DATA_PATH+'/' + 'large_label_bottom_test.npy'), transform_large_dict)

In [None]:
# np.save(DATA_PATH+'/' + "large_label_bottom_transform", large_label_bottom_transform)
# np.save(DATA_PATH+'/' + "large_train_label_bottom_transform", large_train_label_bottom_transform)
# np.save(DATA_PATH+'/' + "large_val_label_bottom_transform", large_val_label_bottom_transform)
# np.save(DATA_PATH+'/' + "large_test_label_bottom_transform", large_test_label_bottom_transform)
# len(large_label_bottom_transform), len(large_train_label_bottom_transform), len(large_val_label_bottom_transform), len(large_test_label_bottom_transform)

In [None]:
large_label_bottom

In [None]:
sorted(list(Counter(large_label_bottom_transform).keys()))