In [1]:
# Set the environment variable CUDA_VISIBLE_DEVICES to an empty value for this
# kernel. With no device ids listed, CUDA-based libraries should not see any
# GPU, so the rest of the notebook is expected to run on the CPU.
# run_line_magic() is used because get_ipython().magic() is deprecated.
get_ipython().run_line_magic('env', 'CUDA_VISIBLE_DEVICES = ')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import os.path
import random
import time
from collections import OrderedDict
import io
from datetime import datetime
import gc # garbage collector
import math
import sys
from collections import defaultdict
import re
import logging
from tensorboard_logger import configure, log_value

# Make matplotlib figures appear inline in the notebook rather than in a new
# window. run_line_magic() replaces the deprecated get_ipython().magic() call.
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Have the notebook reload external python modules before executing code;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

env: CUDA_VISIBLE_DEVICES=


## Write a pandas DataFrame to disk as a gzip-compressed CSV
- df.to_csv('dfsavename.csv.gz', compression='gzip')

## Read from disk
- df = pd.read_csv('dfsavename.csv.gz', compression='gzip')

## Useful magics
- `%%timeit` times the whole cell
- `%timeit` times a single line
- `%%latex` renders the cell as a block of LaTeX
- `%prun` profiles a single line; `%%prun` profiles the whole cell

In [2]:
# Root directory of the WSDM data files. Export WSDM_DATASET_PATH to point the
# notebook at another location instead of editing this machine-specific default.
# os.path.join(..., '') guarantees the trailing separator that the plain string
# concatenations below (and in later cells) rely on.
DATASET_PATH = os.path.join(
    os.environ.get('WSDM_DATASET_PATH', '/media/rs/0E06CD1706CD0127/Kapok/WSDM/'), '')
HDF_FILENAME = DATASET_PATH + 'datas.h5'
HDF_FILENAME_TEMPSAVE = DATASET_PATH + 'datas_temp.h5'
# Template: the '{}' placeholder is meant to be filled in with str.format().
SUBMISSION_FILENAME = DATASET_PATH + 'submission_{}.csv'
VALIDATION_INDICE = DATASET_PATH + 'validation_indice.csv'

# Directory that holds the 'checkpoint' index file read by restore_from_checkpoint().
LOG_DIR = DATASET_PATH + 'music_logs'

In [3]:
def set_logging(logger_name, logger_file_name):
    """Configure and return a logger that writes to both a file and the console.

    The function is safe to call repeatedly for the same ``logger_name`` (for
    example when a notebook cell is re-run): handlers installed by an earlier
    call are removed and closed first, so each message is emitted once per
    destination instead of once per call.

    Args:
        logger_name: name passed to ``logging.getLogger``.
        logger_file_name: path of the log file; it is opened with mode ``'w'``,
            so any existing content is truncated.

    Returns:
        The configured ``logging.Logger`` (level ``DEBUG``).
    """
    log = logging.getLogger(logger_name)
    log.setLevel(logging.DEBUG)

    # Drop only the handlers this function installed before; handlers attached
    # by other code are left alone.
    for old_handler in list(log.handlers):
        if getattr(old_handler, '_installed_by_set_logging', False):
            log.removeHandler(old_handler)
            old_handler.close()

    # create formatter and add it to the handlers
    print_formatter = logging.Formatter('%(message)s')
    file_formatter = logging.Formatter('%(asctime)s - %(name)s_%(levelname)s: %(message)s')

    # create file handler which logs even debug messages
    fh = logging.FileHandler(logger_file_name, mode='w')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(file_formatter)
    fh._installed_by_set_logging = True
    log.addHandler(fh)
    # both output to console and file
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(print_formatter)
    consoleHandler._installed_by_set_logging = True
    log.addHandler(consoleHandler)

    return log

In [4]:
# Notebook-wide logger: messages go to the console and to music_pytorch.log
# inside the dataset directory.
log = set_logging(logger_name='MUSIC',
                  logger_file_name=DATASET_PATH + 'music_pytorch.log')
log.info('here is an info message.')

here is an info message.


In [5]:
# Load the train / validation / test tables. The files carry a .csv extension
# but are read as gzip-compressed CSV. gc.collect() is called around the reads
# to release unreferenced objects.
gc.collect()
train_use = pd.read_csv(DATASET_PATH + 'msno_artist_train.csv', compression='gzip')
gc.collect()
validation_use = pd.read_csv(DATASET_PATH + 'msno_artist_val.csv', compression='gzip')
test = pd.read_csv(DATASET_PATH + 'msno_artist_test.csv', compression='gzip')
#test_id =  test['id']
# Last expression of the cell: its return value is what the notebook displays.
gc.collect()

0

In [6]:
# Row counts of the train, validation and test tables, in that order.
print(*(len(table) for table in (train_use, validation_use, test)))

7277418 100000 2556790


- msno: 0-34402
- artist_name: 0-45045
- count: 0, 1, 2-8, 9-496

In [7]:
from tensorboard_logger import configure, log_value
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from torch.autograd import Variable
import torch

# Size of each embedding vector and number of examples per batch.
embedding_dim = 64
batch_size = 64

# Embedding table sizes: one row per id, i.e. the largest id noted for the
# data (msno 0-34402, artist_name 0-45045) plus one.
msno_count = 34403
artist_count = 45046

# Emit a progress log entry every this many batches.
log_step_interval = 1000

# Number of rows in the test table (matches the len(test) printed earlier).
test_examples_num = 2556790
# Batches needed to cover every test row: integer ceil-division, so an exact
# multiple of batch_size does not get counted as one extra step.
num_steps_per_val_epoch = (test_examples_num + batch_size - 1) // batch_size

class MusicDataset(Dataset):
    """Map-style dataset yielding ``(msno, artist_name)`` pairs from a table.

    Args:
        data_input: table exposing ``'msno'`` and ``'artist_name'`` columns
            (a pandas DataFrame in this notebook).
    """

    def __init__(self, data_input):
        super().__init__()
        # Extract both id columns as arrays once, so item access is plain indexing.
        self._msno, self._artist = (
            data_input[column].values for column in ('msno', 'artist_name')
        )
        self._num_examples = len(data_input)

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return self._num_examples

    def __getitem__(self, idx):
        """Return the ``(msno, artist_name)`` pair stored at position ``idx``."""
        return self._msno[idx], self._artist[idx]
    

class NCF(nn.Module):
    """Embedding + MLP network scoring an (msno, artist) id pair.

    Both ids are embedded into ``embedding_dim``-sized vectors, concatenated,
    and passed through three Linear+ReLU layers (64, 32, 16 units), dropout
    (p=0.5) and a final linear layer producing 4 raw scores (no softmax is
    applied here).

    Sizes come from the module-level ``msno_count``, ``artist_count`` and
    ``embedding_dim``. Attribute names are part of the ``state_dict`` keys, so
    they must stay as they are for saved checkpoints to load.
    """

    def __init__(self):
        super().__init__()
        self._msno_embedding = nn.Embedding(msno_count, embedding_dim)
        self._artist_embedding = nn.Embedding(artist_count, embedding_dim)

        # MLP tower; each hidden layer is Linear followed by an in-place ReLU.
        self.layer1 = nn.Sequential(nn.Linear(2 * embedding_dim, 64), nn.ReLU(True))
        self.layer2 = nn.Sequential(nn.Linear(64, 32), nn.ReLU(True))
        self.layer3 = nn.Sequential(nn.Linear(32, 16), nn.ReLU(True))
        self.dropout = nn.Dropout(0.5)
        # Output layer: plain linear projection to 4 logits.
        self.layer4 = nn.Linear(16, 4)

    def forward(self, msno_id, artist_id):
        """Return the 4 logits for each (msno_id, artist_id) pair in the batch."""
        pair_features = torch.cat(
            (self._msno_embedding(msno_id), self._artist_embedding(artist_id)), 1
        )
        hidden = pair_features
        for stage in (self.layer1, self.layer2, self.layer3, self.dropout, self.layer4):
            hidden = stage(hidden)
        return hidden

def restore_from_checkpoint(log_dir, model):
    """Load the weights named in ``<log_dir>/checkpoint`` into ``model``.

    The ``checkpoint`` index file is read line by line; its first non-blank
    line (stripped) is taken as the path of the saved state dict.

    Args:
        log_dir: directory expected to contain the ``checkpoint`` index file.
        model: module whose ``load_state_dict`` receives the saved weights.

    Returns:
        ``None`` when the index file is missing or holds no non-blank line.
        Otherwise whatever ``torch.load`` returns for the companion file whose
        path is the checkpoint path with ``'state'`` replaced by ``'others'``.
    """
    index_path = os.path.join(log_dir, 'checkpoint')
    # A missing index file also covers the case of a missing log directory.
    if not os.path.exists(index_path):
        return None

    checkpoint_filename = None
    with open(index_path, 'r') as checkpoint_file:
        for line in checkpoint_file:
            if line.strip() != '':
                checkpoint_filename = line.strip()
                break
    if checkpoint_filename is None:
        return None

    # Keep every storage on the CPU while deserialising.
    model.load_state_dict(torch.load(checkpoint_filename, map_location=lambda storage, loc: storage))

    log.info('model resotred from: {}.'.format(checkpoint_filename))

    # NOTE(review): replace() rewrites every 'state' substring in the path,
    # directory names included; confirm this matches how the files are saved.
    others_filename = checkpoint_filename.replace('state', 'others')
    if torch.cuda.is_available():
        return torch.load(others_filename)
    # Without CUDA, anything saved from a GPU must be remapped to the CPU,
    # otherwise torch.load cannot restore it.
    return torch.load(others_filename, map_location=lambda storage, loc: storage)

# Build the model, move it to the GPU when one is visible, and load the
# weights referenced by the checkpoint index in LOG_DIR.
NCF_encoder = NCF()

if torch.cuda.is_available():
    NCF_encoder = NCF_encoder.cuda()

restored_extras = restore_from_checkpoint(LOG_DIR, NCF_encoder)
if restored_extras is None:
    # restore_from_checkpoint returns None when it finds no checkpoint to load.
    log.warning('restore_from_checkpoint returned None for {}: '
                'the model may be using randomly initialised weights.'.format(LOG_DIR))

test_set = MusicDataset(test)

# shuffle must stay False: the logits are saved without any row identifier, so
# their order has to match the row order of the test table.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers = 4, drop_last=False)
    
#target_onehot = torch.FloatTensor(batch_size, 4)

# Run the model over every test batch and save the raw logits to disk.
NCF_encoder.eval()  # evaluation mode, so dropout is inactive
# Step index of the previous progress log; 0 means nothing logged yet, which
# makes (index - last_val_step) the exact number of steps in each interval.
last_val_step = 0
step_start_time = time.time()

logits_columns = ['logits_0', 'logits_1', 'logits_2', 'logits_3']
# Collect one array per batch and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each step.
logits_batches = []
for index, data in enumerate(test_loader, 1):
    msno, artist = data

    msno_tensor = torch.LongTensor(msno)
    artist_tensor = torch.LongTensor(artist)

    if torch.cuda.is_available():
        msno_in = Variable(msno_tensor).cuda()
        artist_in = Variable(artist_tensor).cuda()
    else:
        msno_in = Variable(msno_tensor)
        artist_in = Variable(artist_tensor)

    logits = NCF_encoder(msno_in, artist_in)
    # .cpu() is required before .numpy() when the model runs on a GPU.
    logits_batches.append(logits.data.cpu().numpy())

    if index % log_step_interval == 0:
        log.info('####### test logging #######')
        log.info('\ttest/current_step: {}/{}'.format(index, num_steps_per_val_epoch))
        log.info('\ttest/sec_per_step: {:.3f}'.format((time.time()-step_start_time)/(index-last_val_step)))
        last_val_step = index
        step_start_time = time.time()

if logits_batches:
    logits_df = pd.DataFrame(np.concatenate(logits_batches, axis=0), columns=logits_columns)
else:
    # Empty loader: still write a header-only file instead of failing.
    logits_df = pd.DataFrame(columns=logits_columns)
logits_df.to_csv(DATASET_PATH + 'msno_artist_logits_test.csv', index = False, compression='gzip')
log.info('Done: {}'.format(datetime.now().strftime('%Y-%m-%d_%H_%M_%S')))

model resotred from: /media/rs/0E06CD1706CD0127/Kapok/WSDM/music_logs/checkpoint_pytorch_state_2017-11-18_03_39_18.pth.
####### test logging #######
	test/current_step: 1000/39950
	test/sec_per_step: 0.005
####### test logging #######
	test/current_step: 2000/39950
	test/sec_per_step: 0.005
####### test logging #######
	test/current_step: 3000/39950
	test/sec_per_step: 0.005
####### test logging #######
	test/current_step: 4000/39950
	test/sec_per_step: 0.006
####### test logging #######
	test/current_step: 5000/39950
	test/sec_per_step: 0.006
####### test logging #######
	test/current_step: 6000/39950
	test/sec_per_step: 0.006
####### test logging #######
	test/current_step: 7000/39950
	test/sec_per_step: 0.007
####### test logging #######
	test/current_step: 8000/39950
	test/sec_per_step: 0.007
####### test logging #######
	test/current_step: 9000/39950
	test/sec_per_step: 0.008
####### test logging #######
	test/current_step: 10000/39950
	test/sec_per_step: 0.007
####### test loggin