In [1]:
# Set the environment variable CUDA_VISIBLE_DEVICES to 1 for this kernel
# (equivalent to the `%env` line magic). run_line_magic() is the supported
# replacement for the deprecated InteractiveShell.magic().
# NOTE(review): presumably this has to run before any GPU-using library is
# initialised, which would be why it precedes the imports - confirm.
get_ipython().run_line_magic('env', 'CUDA_VISIBLE_DEVICES = 1')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp      # will come in handy due to the size of the data
import os.path
import random
import io
from datetime import datetime
import gc # garbage collector
import sklearn
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import math
from collections import defaultdict
import re
import logging

# Make matplotlib figures appear inline in the notebook rather than in a new
# window. run_line_magic() replaces the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('matplotlib', 'inline')

# Plot defaults used by every figure in this notebook.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),       # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Reload external python modules automatically before executing code;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

env: CUDA_VISIBLE_DEVICES=1




In [2]:
# Root directory holding the dataset files. The original machine-specific
# location stays the default; set the WSDM_DATASET_PATH environment variable
# to run the notebook against a copy stored elsewhere. Joining with '' makes
# sure the value ends with a path separator, so code that concatenates
# DATASET_PATH + filename keeps working for overridden values too.
DATASET_PATH = os.path.join(
    os.environ.get('WSDM_DATASET_PATH', '/media/rs/0E06CD1706CD0127/Kapok/WSDM/'),
    '',
)

# CSV inputs read by the loading cell.
TRAIN_FILE = os.path.join(DATASET_PATH, 'all_train_withextra.csv')
TEST_FILE = os.path.join(DATASET_PATH, 'all_test_withextra.csv')
MEMBER_FILE = os.path.join(DATASET_PATH, 'members.csv')
SONG_FILE = os.path.join(DATASET_PATH, 'fix_songs.csv')
ALL_ARTIST = os.path.join(DATASET_PATH, 'all_artist_name.csv')
ALL_COMPOSER = os.path.join(DATASET_PATH, 'all_composer.csv')
ALL_LYRICIST = os.path.join(DATASET_PATH, 'all_lyricist.csv')

# HDF5 file that the frames are written to.
HDF_FILENAME = os.path.join(DATASET_PATH, 'music_info.h5')

In [3]:
# Read every input CSV into its own DataFrame. The paths are listed in the
# same order as the names they are unpacked into.
(train_data, test_data, member_data, song_data,
 composer_df, artist_name_df, lyricist_df) = [
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_FILE, TEST_FILE, MEMBER_FILE, SONG_FILE,
                     ALL_COMPOSER, ALL_ARTIST, ALL_LYRICIST)
]

In [4]:
def convert_unicode_to_str(df):
    """Cast mixed-type columns of ``df`` to ``str``.

    The column labels are converted to ``str`` first. Then the type of each
    column is inferred on its own, and every column reported as ``'mixed'``
    or ``'mixed-integer'`` is replaced by its ``astype(str)`` version. All
    other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient use in an expression.
    """
    df.columns = df.columns.astype(str)
    # Infer from each column's own values. Inferring from ``df.values`` (the
    # whole frame) gives every column the same answer, so one mixed column
    # would make all columns - numeric ones included - be cast to str.
    inferred = df.apply(lambda column: pd.api.types.infer_dtype(column.values))
    mixed_columns = inferred[inferred.isin(['mixed-integer', 'mixed'])].index
    for col in mixed_columns:
        df[col] = df[col].astype(str)
    return df

In [5]:
# Write every loaded frame into the HDF5 file under a fixed key, after passing
# it through convert_unicode_to_str (which also modifies the frame in place).
frames_by_key = (
    ('all_train_withextra', train_data),
    ('all_test_withextra', test_data),
    ('members', member_data),
    ('fix_songs', song_data),
    ('all_composer', composer_df),
    ('all_artist_name', artist_name_df),
    ('all_lyricist', lyricist_df),
)
# The context manager closes the store even when one of the writes raises, so
# a failed run does not leave the HDF5 file open.
with pd.HDFStore(HDF_FILENAME) as store:
    for key, frame in frames_by_key:
        store[key] = convert_unicode_to_str(frame)

In [6]:
# Sanity check: reopen the HDF5 file and print the stored composer table.
# The store is opened read-only because nothing is written here, and the
# context manager closes it even if the lookup fails.
with pd.HDFStore(HDF_FILENAME, mode='r') as store_test:
    print(store_test['all_composer'])

                          composer
0                               董貞
1                            TEDDY
2                    FUTURE BOUNCE
3                       Bekuh BOOM
4                              湯小康
5                      Traditional
6                     Joe Hisaishi
7                     Jonathan Lee
8                               光良
9                           JJ Lin
10                Stephen Garrigan
11                Mark Prendergast
12                     Vincent May
13                    Jacknife Lee
14                    Jason Boland
15                             陳偉強
16                              白安
17           Michael William Balfe
18                             蔡旻佑
19                    Chris Martin
20                    Guy Berryman
21                  Jonny Buckland
22                Mikkel S Eriksen
23                   Will Champion
24              Tor Erik Hermansen
25                    Maggie Roger
26                     Nicholas Da
27                  