In [1]:
# Set the environment variable CUDA_VISIBLE_DEVICES to 1 through the IPython
# %env magic. run_line_magic() is used because InteractiveShell.magic() is a
# deprecated helper.
get_ipython().run_line_magic('env', 'CUDA_VISIBLE_DEVICES = 1')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp      # will come in handy due to the size of the data
import os.path
import random
import io
from datetime import datetime
import gc # garbage collector
import sklearn
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import math
from collections import defaultdict
import re
import logging

# Make matplotlib figures appear inline in the notebook rather than in a new
# window. run_line_magic() replaces the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Have the notebook reload external python modules before running code;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

env: CUDA_VISIBLE_DEVICES=1




## Write a pandas DataFrame to disk as a gzip-compressed CSV
- df.to_csv('dfsavename.csv.gz', compression='gzip')

## Read from disk
- df = pd.read_csv('dfsavename.csv.gz', compression='gzip')

## Useful magics
- `%%timeit` times the whole cell
- `%timeit` times a single line
- `%%latex` renders the cell as a block of LaTeX
- `%prun` profiles a single line, `%%prun` profiles the whole cell

In [2]:
# Directory that holds every input CSV. The original absolute location stays
# the default; set the WSDM_DATASET_PATH environment variable to run the
# notebook against another directory without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# all other paths are built by plain string concatenation.
DATASET_PATH = os.path.join(
    os.environ.get('WSDM_DATASET_PATH', '/media/rs/0E06CD1706CD0127/Kapok/WSDM/'),
    '',
)
TRAIN_FILE = DATASET_PATH + 'all_train_withextra.csv'
TEST_FILE = DATASET_PATH + 'all_test_withextra.csv'
MEMBER_FILE = DATASET_PATH + 'members.csv'
SONG_FILE = DATASET_PATH + 'fix_songs.csv'
# Vocabulary sources for the bag-of-words encoders.
ALL_ARTIST = DATASET_PATH + 'all_artist_name.csv'
ALL_COMPOSER = DATASET_PATH + 'all_composer.csv'
ALL_LYRICIST = DATASET_PATH + 'all_lyricist.csv'

In [3]:
def set_logging(logger_name, logger_file_name):
    """Return a DEBUG-level logger that writes to a file and to the console.

    Parameters
    ----------
    logger_name : str
        Name passed to ``logging.getLogger``.
    logger_file_name : str
        Path of the log file; it is opened with mode ``'w'``, so an existing
        file is truncated.

    Returns
    -------
    logging.Logger
        The configured logger. File records use a timestamped format, console
        records contain the bare message only.

    Notes
    -----
    The function is safe to call repeatedly with the same ``logger_name``
    (for example when the notebook cell is re-run): handlers installed by a
    previous call are removed and closed first, so messages are not duplicated.
    """
    log = logging.getLogger(logger_name)
    log.setLevel(logging.DEBUG)

    # logging.getLogger() returns the same object for the same name. Without
    # this clean-up every additional call would stack another pair of handlers
    # and each message would be emitted once per call. Only handlers tagged by
    # this function are removed; handlers attached by other code are kept.
    for stale_handler in [h for h in log.handlers if getattr(h, '_from_set_logging', False)]:
        log.removeHandler(stale_handler)
        stale_handler.close()

    # create formatters for the two destinations
    print_formatter = logging.Formatter('%(message)s')
    file_formatter = logging.Formatter('%(asctime)s - %(name)s_%(levelname)s: %(message)s')

    # file handler which logs even debug messages
    fh = logging.FileHandler(logger_file_name, mode='w')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(file_formatter)

    # console handler, so that output goes both to console and file
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(print_formatter)

    for handler in (fh, consoleHandler):
        # Tag the handler so a later call can recognise and replace it.
        handler._from_set_logging = True
        log.addHandler(handler)

    return log

In [4]:
# Create the 'MUSIC' logger; its log file is written into DATASET_PATH.
log = set_logging(logger_name='MUSIC',
                  logger_file_name=DATASET_PATH + 'music_test_xgboost.log')
log.info('here is an info message.')

here is an info message.


In [5]:
# Read TRAIN_FILE and TEST_FILE (in that order) into DataFrames.
train_data, test_data = [pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE)]

In [6]:
# Read MEMBER_FILE and SONG_FILE (in that order) into DataFrames.
member_data, song_data = [pd.read_csv(csv_path) for csv_path in (MEMBER_FILE, SONG_FILE)]

In [7]:
# Read the three vocabulary source files used by create_bag_of_words below.
composer_df, artist_name_df, lyricist_df = [
    pd.read_csv(csv_path) for csv_path in (ALL_COMPOSER, ALL_ARTIST, ALL_LYRICIST)
]

In [8]:
# Log the first five rows of the training frame as a quick sanity check.
log.info(train_data.head(n=5))

                                           msno  \
0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   
1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   
2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   
3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   
4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   

                                        song_id source_system_tab  \
0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   
1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   
2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   
3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   
4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   

    source_screen_name      source_type  target  city  bd  gender  \
0              Explore  online-playlist       1     1   0     NaN   
1  Local playlist more   local-playlist       1    13  24  female   
2  Local playlist more   local-playlist       1    13  24  female   
3 

In [9]:
def clip_by_percent(hist, num_percent):
    """Keep the entries of ``hist`` that are at least as large as a cut-off entry.

    The cut-off is the value found at position ``int(len(hist) * num_percent)``
    of ``hist``; every entry greater than or equal to that value is kept, so
    ties with the cut-off are kept as well. Callers in this notebook pass a
    ``value_counts`` result sorted in descending order.

    Parameters
    ----------
    hist : pandas.Series
        Counts to filter.
    num_percent : float
        Fraction of ``len(hist)`` that locates the cut-off entry. Values that
        would point past the last entry (e.g. ``1.0``) are clamped to it.

    Returns
    -------
    pandas.Series
        The filtered series; an empty ``hist`` is returned unchanged.
    """
    size = len(hist.index)
    if size == 0:
        return hist
    # Clamp so that num_percent == 1.0 selects the last entry instead of
    # raising IndexError.
    cutoff_position = min(int(size * num_percent), size - 1)
    # .iloc makes the lookup positional regardless of the index dtype; plain
    # hist[<int>] is label-based on an integer index and its positional
    # fallback on other indexes is deprecated in pandas.
    threshold = hist.iloc[cutoff_position]
    return hist[hist >= threshold]
def clip_by_value(hist, value):
    """Return the entries of ``hist`` that are greater than or equal to ``value``."""
    keep_mask = hist >= value
    return hist[keep_mask]

In [10]:
def create_bag_of_words(input_df, percent, column_name):
    """Build the vocabulary and the log-popularity table for one text column.

    The value counts of ``input_df[column_name]`` are clipped with
    ``clip_by_percent(counts, percent)``; the surviving items form the
    vocabulary.

    Returns
    -------
    input_map : defaultdict
        Maps a vocabulary item to ``column_name + ' ' + item``; any other key
        maps to ``column_name + ' others'``.
    clip_hist_with_log : defaultdict
        Maps a vocabulary item to the natural log of its count; any other key
        maps to the log of the summed counts of all non-vocabulary items.

    Both results are defaultdicts, so indexing with ``[]`` inserts missing
    keys; use ``.get`` to look a key up without inserting it.
    """
    counts = input_df[column_name].value_counts(sort=True, ascending=False)
    vocabulary = clip_by_percent(counts, percent).index
    print('{} item are selected.'.format(len(vocabulary)))

    vocabulary_counts = counts[vocabulary]
    # summed count of everything that did not make it into the vocabulary
    total_others = np.sum(counts) - np.sum(vocabulary_counts)

    # popularity is stored on a log scale
    clip_hist_with_log = defaultdict(lambda: np.log(total_others))
    clip_hist_with_log.update(np.log(vocabulary_counts).items())

    # keys of input_map are the raw items, values are the prefixed column labels
    input_map = defaultdict(lambda: column_name + ' ' + 'others')
    input_map.update((item, column_name + ' ' + item) for item in vocabulary)

    return input_map, clip_hist_with_log

In [11]:
# 181 ms ± 420 µs
def word_bag_encode(input_data, column, word_map, word_hist):
    """Encode a delimited text column as position-weighted bag-of-words features.

    Each string cell of ``input_data[column]`` is split on runs of ``|`` and
    ``/``. The token at position ``i`` gets the weight ``w0 * delay_rate**i``
    with ``w0`` chosen so that the weights of one cell sum to 1 (geometric
    series). Every weight is divided by ``word_hist`` for that token and added
    to the token's column. Tokens missing from ``word_map`` are accumulated in
    the ``column + ' others'`` column and divided by ``word_hist['others']``.
    Non-string cells (e.g. NaN) yield an all-zero row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the text column to encode.
    word_map : mapping
        token -> output column label, as returned by ``create_bag_of_words``.
    word_hist : mapping
        token -> divisor; ``word_hist['others']`` must be resolvable.

    Returns
    -------
    pandas.DataFrame
        One float row per input row with a fresh 0..n-1 index and one column
        per key of ``word_map`` plus ``'others'``.

    Raises
    ------
    KeyError
        If ``word_map`` yields a label that is not one of the output columns.
    """
    delay_rate = 0.8 # must be less than 1
    # Characters trimmed from both ends of a token: whitespace and double
    # quotes. The previous literal contained '\s', which Python reads as a
    # backslash plus the letter 's', so tokens lost leading/trailing 's'.
    # NOTE(review): if the vocabulary files were produced with the old strip
    # set, regenerate them so that tokens keep matching - TODO confirm.
    strip_chars = ' "\t\n\r\x0b\x0c'

    # Same column layout as before: vocabulary keys followed by 'others'
    # (a dict keeps a pre-existing 'others' key in its original position).
    count_dict = dict.fromkeys(word_map.keys(), 0)
    count_dict['others'] = 0
    new_columns = [column + ' ' + s for s in count_dict.keys()]
    column_position = {name: pos for pos, name in enumerate(new_columns)}
    others_column = column + ' others'

    cells = np.array(input_data[column].values, dtype=object)
    # Fill one pre-allocated array instead of appending a DataFrame per row:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic.
    encoded = np.zeros((len(cells), len(new_columns)), dtype=float)
    for row, cell in zip(encoded, cells):
        if not isinstance(cell, str):
            continue  # missing value: keep the all-zero row
        splited_list = re.split(r'[|/]+', cell)
        # first weight of a geometric series whose len(splited_list) terms sum to 1
        weight = (1 - delay_rate) / (1 - np.power(delay_rate, len(splited_list)))
        for s in splited_list:
            word_stripped = s.strip(strip_chars)
            target = column_position[word_map.get(word_stripped, others_column)]
            row[target] += weight / word_hist.get(word_stripped, word_hist['others'])
            weight *= delay_rate
    return pd.DataFrame(encoded, columns=new_columns)

In [12]:
# 7.09 ms ± 43.2 µs
def word_bag_encode_apply(input_data, column, word_map, word_hist):
    """Bag-of-words encoding of ``input_data[column]`` built on ``Series.apply``.

    Each string cell is split on runs of ``|`` and ``/``. The token at
    position ``i`` gets the weight ``w0 * delay_rate**i`` with ``w0`` chosen so
    that the weights of one cell sum to 1; each weight is divided by
    ``word_hist`` for that token and added to the token's column. Tokens
    missing from ``word_map`` go to ``column + ' others'`` and are divided by
    ``word_hist['others']``. Non-string cells yield an all-zero row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the text column to encode.
    word_map : mapping
        token -> output column label (``column + ' ' + token``).
    word_hist : mapping
        token -> divisor; ``word_hist['others']`` must be resolvable.

    Returns
    -------
    The result of ``input_data[column].apply(...)`` with a function returning
    one ``pandas.Series`` per cell, labelled ``column + ' ' + key`` for every
    key of ``word_map`` plus ``column + ' others'``.
    """
    new_columns = [column + ' ' + s for s in word_map.keys()]
    new_columns.append(column + ' ' + 'others')
    delay_rate = 0.8 # must be less than 1
    # Whitespace and double quotes. The previous literal contained '\s', which
    # Python reads as a backslash plus the letter 's', so tokens lost their
    # leading/trailing 's'.
    # NOTE(review): if the vocabulary files were produced with the old strip
    # set, regenerate them so that tokens keep matching - TODO confirm.
    strip_chars = ' "\t\n\r\x0b\x0c'

    def encode_routine(str_value):
        series_dict = dict.fromkeys(new_columns, 0.)
        if isinstance(str_value, str):
            splited_list = re.split(r'[|/]+', str_value)
            # first weight of a geometric series whose len(splited_list) terms sum to 1
            weight = (1 - delay_rate) / (1 - np.power(delay_rate, len(splited_list)))
            for s in splited_list:
                word_stripped = s.strip(strip_chars)
                target = word_map.get(word_stripped, column + ' others')
                series_dict[target] += weight / word_hist.get(word_stripped, word_hist['others'])
                weight *= delay_rate
        return pd.Series(series_dict)

    return input_data[column].apply(encode_routine)

In [13]:
# 171 µs ± 693 ns
def word_bag_encode_numpy(input_data, column, word_map, word_hist):
    """Bag-of-words encoding of ``input_data[column]`` returned as a NumPy array.

    Each string cell is split on runs of ``|`` and ``/``. The token at
    position ``i`` gets the weight ``w0 * delay_rate**i`` with ``w0`` chosen so
    that the weights of one cell sum to 1; each weight is divided by
    ``word_hist`` for that token and added to the token's column. Tokens that
    are not keys of ``word_map`` are accumulated in the last ('others') column
    and divided by ``word_hist['others']``. Non-string cells yield an all-zero
    row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the text column to encode.
    word_map : mapping
        Only its keys are used; they define the vocabulary and column order.
    word_hist : mapping
        token -> divisor; ``word_hist['others']`` must be resolvable.

    Returns
    -------
    (numpy.ndarray, list of str)
        A float array of shape ``(len(input_data), len(word_map) + 1)`` (also
        for empty input) and the matching column labels
        ``column + ' ' + key``, ending with ``column + ' others'``.
    """
    new_columns = list(word_map.keys())
    new_columns.append('others')
    delay_rate = 0.8 # must be less than 1
    # Whitespace and double quotes. The previous literal contained '\s', which
    # Python reads as a backslash plus the letter 's', so tokens lost their
    # leading/trailing 's'.
    # NOTE(review): if the vocabulary files were produced with the old strip
    # set, regenerate them so that tokens keep matching - TODO confirm.
    strip_chars = ' "\t\n\r\x0b\x0c'
    num_columns = len(new_columns)
    str_indice_dict = dict(zip(new_columns, range(num_columns)))

    numpy_str = np.array(input_data[column].values, dtype=object)
    # Pre-allocating the result keeps a well-defined 2-D shape even when there
    # are no rows. The original author noted that np.vectorize could not be
    # used here, so rows are filled in a plain Python loop.
    encoded = np.zeros((len(numpy_str), num_columns), dtype=float)
    for row, str_value in zip(encoded, numpy_str):
        if not isinstance(str_value, str):
            continue  # missing value: keep the all-zero row
        splited_list = re.split(r'[|/]+', str_value)
        # first weight of a geometric series whose len(splited_list) terms sum to 1
        weight = (1 - delay_rate) / (1 - np.power(delay_rate, len(splited_list)))
        for s in splited_list:
            word_stripped = s.strip(strip_chars)
            target = str_indice_dict.get(word_stripped, num_columns - 1)
            row[target] += weight / word_hist.get(word_stripped, word_hist['others'])
            weight *= delay_rate
    return encoded, [column + ' ' + s for s in new_columns]

In [14]:
# example test
#composer_map, composer_hist = create_bag_of_words(composer_df, 0.001, 'composer')
#composer_array, head_name = word_bag_encode_numpy(train_data, 'composer', composer_map, composer_hist)
#composer_encoder = pd.DataFrame(data = composer_array, columns = head_name)

In [None]:
#composer_map, composer_hist = create_bag_of_words(composer_df, 0.001, 'composer')
#%timeit composer_encoder = word_bag_encode(train_data, 'composer', composer_map, composer_hist)

In [None]:
def _encode_train_column(vocab_df, percent, column):
    """Build the vocabulary of ``column`` from ``vocab_df`` and encode ``train_data[column]``.

    Returns ``(word_map, word_hist, head_name, encoder)`` where ``encoder`` is
    the DataFrame made from the array returned by ``word_bag_encode_numpy``.
    """
    word_map, word_hist = create_bag_of_words(vocab_df, percent, column)
    # word_bag_encode_apply(train_data, column, word_map, word_hist) is an
    # alternative encoder; the numpy variant is used here.
    encoded, head_name = word_bag_encode_numpy(train_data, column, word_map, word_hist)
    encoder = pd.DataFrame(data = encoded, columns = head_name)
    return word_map, word_hist, head_name, encoder


(composer_map, composer_hist,
 composer_head_name, composer_encoder) = _encode_train_column(composer_df, 0.001, 'composer')
print('composer_encoder finished')

(artist_name_map, artist_name_hist,
 artist_name_head_name, artist_name_encoder) = _encode_train_column(artist_name_df, 0.001, 'artist_name')
print('artist_name_encoder finished')

(lyricist_map, lyricist_hist,
 lyricist_head_name, lyricist_encoder) = _encode_train_column(lyricist_df, 0.002, 'lyricist')
print('lyricist_encoder finished')

309 item are selected.


In [None]:
# Replace the three raw text columns by their bag-of-words encodings.
train_data.drop(['composer', 'artist_name', 'lyricist'], axis=1, inplace=True)
final_train_data = pd.concat(
    [train_data, composer_encoder, artist_name_encoder, lyricist_encoder],
    axis=1,
    join='inner',
    copy=True,
)
# Drop the names of the intermediates that are no longer needed.
del train_data, composer_encoder, artist_name_encoder, lyricist_encoder

In [None]:
def _encode_test_column(column, word_map, word_hist):
    """Encode ``test_data[column]`` with a vocabulary built earlier in the notebook.

    Returns ``(head_name, encoder)`` where ``encoder`` is the DataFrame made
    from the array returned by ``word_bag_encode_numpy``.
    """
    # word_bag_encode_apply(test_data, column, word_map, word_hist) is an
    # alternative encoder; the numpy variant is used here.
    encoded, head_name = word_bag_encode_numpy(test_data, column, word_map, word_hist)
    return head_name, pd.DataFrame(data = encoded, columns = head_name)


composer_head_name_test, composer_encoder_test = _encode_test_column(
    'composer', composer_map, composer_hist)
print('composer_encoder_test finished')

artist_name_head_name_test, artist_name_encoder_test = _encode_test_column(
    'artist_name', artist_name_map, artist_name_hist)
print('artist_name_encoder_test finished')

lyricist_head_name_test, lyricist_encoder_test = _encode_test_column(
    'lyricist', lyricist_map, lyricist_hist)
print('lyricist_encoder_test finished')

# Replace the three raw text columns by their bag-of-words encodings.
test_data.drop(['composer', 'artist_name', 'lyricist'], axis=1, inplace=True)
final_test_data = pd.concat(
    [test_data, composer_encoder_test, artist_name_encoder_test, lyricist_encoder_test],
    axis=1,
    join='inner',
    copy=True,
)
# Drop the names of the intermediates that are no longer needed.
del test_data, composer_encoder_test, artist_name_encoder_test, lyricist_encoder_test

In [None]:
# Show the first rows of both encoded frames, train first, one below the other.
print(final_train_data.head(), final_test_data.head(), sep='\n')

In [None]:
# Persist both encoded frames as gzip-compressed CSV files inside DATASET_PATH.
# NOTE(review): the payload is gzip-compressed although the file names end in
# '.csv', so readers have to pass compression='gzip' explicitly (extension
# based inference will not detect it). The names are left untouched because
# downstream readers may rely on them - consider '.csv.gz'.
final_train_data.to_csv(
    path_or_buf=DATASET_PATH + 'all_train_featured.csv',
    compression='gzip',
)
final_test_data.to_csv(
    path_or_buf=DATASET_PATH + 'all_test_featured.csv',
    compression='gzip',
)