In [1]:
import glob
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Print the current local time as a timestamp for this run.
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2018-04-28 01:52:06


In [2]:
# Directory that holds the raw CSV files.
input_dir = 'data/'

# Show every entry found directly inside the input directory.
for path in glob.iglob(input_dir + "*"):
    print(path)

data/songs.csv
data/test.csv
data/members.csv
data/train.csv
data/song_extra_info.csv


In [112]:
def _load_csv(filename, **read_kwargs):
    """Read one CSV file located in `input_dir` into a DataFrame."""
    return pd.read_csv(input_dir + filename, **read_kwargs)

# Raw tables; the two date columns of members.csv are parsed while reading.
df_train = _load_csv("train.csv")
df_test = _load_csv('test.csv')
df_songs = _load_csv('songs.csv')
df_song_extra = _load_csv("song_extra_info.csv")
df_members = _load_csv(
    "members.csv",
    parse_dates=["registration_init_time", "expiration_date"],
)

In [113]:
# convert isrc information to year
def isrc_to_year(isrc):
    """Extract the four-digit year encoded in an ISRC code.

    Characters 5 and 6 (0-based) of the code are read as a two-digit year.
    Values above 17 are mapped to 19YY, all other values to 20YY.

    Parameters
    ----------
    isrc : object
        The ISRC string. Any non-string value (e.g. NaN for a missing
        code) is treated as missing.

    Returns
    -------
    int or float
        The year, or ``np.nan`` when `isrc` is not a string or when its
        year field cannot be parsed as an integer (truncated or
        malformed code).
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        two_digit_year = int(isrc[5:7])
    except ValueError:
        # Truncated / malformed code: report "unknown" instead of aborting
        # the whole .apply() over the column.
        return np.nan
    century = 1900 if two_digit_year > 17 else 2000
    return century + two_digit_year
    
# Derive the year from the ISRC code, then discard the raw 'isrc' and
# 'name' columns.
df_song_extra['song_year'] = df_song_extra['isrc'].map(isrc_to_year)
df_song_extra.drop(columns=['isrc', 'name'], inplace=True)
# 1000 <=> 1s
df_songs['song_length'] = df_songs['song_length'] / 1000.0

In [114]:
# Enrich every training row with song metadata:
#   - left join train and song based on song_id
#   - left join the result and song_extra based on song_id
# Left joins keep every training row, even when a song has no metadata.
df_train = df_train.merge(df_songs, how="left", on="song_id")
df_train = df_train.merge(df_song_extra, how='left', on='song_id')

# Keep an independent snapshot of the merged frame. A plain assignment
# would only create a second name for the same object, so every later
# in-place change to df_train would also change "df_original".
df_original = df_train.copy()

In [105]:
# # change object columns to str type
# catagorical_cols = []
# numerical_cols = []

# for col in df_train.columns:
#     if df_train[col].dtype == 'object':
#         catagorical_cols.append(col)
#         df_train[col] = df_train[col].astype('str')
#     else:
#         numerical_cols.append(col)
# print('catagorical columns: {}'.format(catagorical_cols))
# print('numerical columns: {}'.format(numerical_cols))

catagorical columns: ['msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type', 'genre_ids', 'artist_name', 'composer', 'lyricist']
numerical columns: ['target', 'song_length', 'language', 'song_year']


In [52]:
# 7377418 song user pairs in the training set
# 30755 unique users
user_ids = df_train['msno']
print(user_ids.shape)
print(user_ids.unique().shape)

(7377418,)
(30755,)


Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'genre_ids', 'artist_name',
       'composer', 'lyricist', 'language', 'song_year'],
      dtype='object')

## Deal with NaN
- categorical -> fill in "Unknown"
- numerical -> fill in the mean (`song_length`) or the median (`song_year`)

In [116]:
# Print the dtype and name of every column that contains missing values.
has_null = df_train.isnull().any()
for col in has_null[has_null].index:
    print('{:10} {}'.format(str(df_train[col].dtype), col))

object     source_system_tab
object     source_screen_name
object     source_type
float64    song_length
object     genre_ids
object     artist_name
object     composer
object     lyricist
float64    language
float64    song_year


In [118]:
# fill categorical columns with tag: 'Unknown'
UNKNOWN = 'Unknown'
col_catagorical = ['source_system_tab', 'source_screen_name', 'source_type', 'genre_ids', 'artist_name', 'composer', 'lyricist']
# The filled column is assigned back instead of calling
# df_train[col].fillna(..., inplace=True): that in-place call works on an
# intermediate object selected from the frame, which pandas warns about and
# which no longer updates df_train when Copy-on-Write is enabled.
for col in col_catagorical:
    df_train[col] = df_train[col].fillna(UNKNOWN)

# missing song lengths -> mean song length
fill_in_value = df_train['song_length'].mean()
df_train['song_length'] = df_train['song_length'].fillna(fill_in_value)

# missing song years -> median song year
fill_in_value = df_train['song_year'].median()
df_train['song_year'] = df_train['song_year'].fillna(fill_in_value)

# NOTE(review): 'language' is also reported as containing missing values by
# the null check, but it is not filled here -- confirm whether that is intended.

## Feature Engineering
- binary features
- count features
- historical features

In [123]:
# cap song lengths: every value greater than 1800 becomes 1800
df_train['song_length'] = df_train['song_length'].clip(upper=1800)

# Characters treated as separators between the entries of a multi-valued field.
_SEPARATORS = ('|', '/', '\\', ';', '、', ',')

# count the number of entries in a multi-valued field
def _count(x):
    """Return the number of separator-delimited entries in `x`.

    The `UNKNOWN` placeholder counts as 0 entries. Any other string counts
    as one entry plus one more for every separator character it contains.
    """
    if x == UNKNOWN:
        return 0
    total = 1
    for separator in _SEPARATORS:
        total += x.count(separator)
    return total

# new features: the number of genres, lyricists and composers for each song
for source_col, count_col in (('genre_ids', 'genre_count'),
                              ('lyricist', 'lyricist_count'),
                              ('composer', 'composer_count')):
    df_train[count_col] = df_train[source_col].apply(_count).astype(int)

# Tokens that join several artists inside one artist_name value: the words
# "and", "feat" or "featuring" (only when not embedded in a longer run of
# ASCII letters), a comma, or an ampersand. The letter guards keep names that
# merely contain those letters (e.g. "Brandy", "Band") from being counted as
# two artists, which plain substring counting did.
_ARTIST_SEPARATOR = re.compile(
    r'(?<![A-Za-z])(?:and|feat(?:uring)?)(?![A-Za-z])|,|&'
)

# the number of artists
def artist_count(x):
    """Estimate how many artists are credited in the artist name `x`.

    Parameters
    ----------
    x : str
        An artist_name value; the `UNKNOWN` placeholder means "missing".

    Returns
    -------
    int
        0 for the `UNKNOWN` placeholder, otherwise 1 plus the number of
        separator tokens found in `x` (matching is case-sensitive).
    """
    if x == UNKNOWN:
        return 0
    return len(_ARTIST_SEPARATOR.findall(x)) + 1

# new feature: the number of artists for each song
df_train['artist_count'] = df_train['artist_name'].apply(artist_count).astype(np.int8)

In [124]:
# whether the artist name contains 'feat'
def is_featured(x):
    """Return 1 if the text form of `x` contains the substring 'feat', else 0."""
    return int('feat' in str(x))
df_train['is_featured'] = df_train['artist_name'].map(is_featured).astype(np.int8)

# 1 when the artist is the same as the composer
_artist_is_composer = df_train['artist_name'].eq(df_train['composer'])
df_train['artist_composer'] = _artist_is_composer.astype(np.int8)

# 1 when artist, lyricist and composer are all three the same
_artist_is_lyricist = df_train['artist_name'].eq(df_train['lyricist'])
_composer_is_lyricist = df_train['composer'].eq(df_train['lyricist'])
df_train['artist_composer_lyricist'] = (
    _artist_is_composer & _artist_is_lyricist & _composer_is_lyricist
).astype(np.int8)

# if song language is 17 or 45.
def song_lang_boolean(x):
    """Return 1 if the language code `x` equals 17 or 45, else 0.

    The code is compared numerically, so 17, 17.0 and '17.0' are all
    recognised, while codes that merely contain those digits (e.g. 117.0)
    are not. Missing or non-numeric values (NaN, None, arbitrary text)
    yield 0.
    """
    try:
        code = float(x)
    except (TypeError, ValueError):
        return 0
    # NaN compares unequal to everything, so it falls through to 0.
    return 1 if code in (17.0, 45.0) else 0
# new feature: flag produced by song_lang_boolean for each row's language
df_train['song_lang_boolean'] = df_train['language'].map(song_lang_boolean).astype(np.int8)

# 1 when the song's length is shorter than the mean song length
_mean_song_length = df_train['song_length'].mean()

def smaller_song(x):
    """Return 1 if `x` is strictly below `_mean_song_length`, else 0."""
    return int(x < _mean_song_length)

df_train['smaller_song'] = df_train['song_length'].map(smaller_song).astype(np.int8)

In [135]:
# Number of training rows in which each song appears.
# NOTE(review): this counts every occurrence of the song in df_train, not
# only the ones that precede a given row.
_song_play_counts = df_train['song_id'].value_counts()

def count_song_played(x):
    """Return how many rows of df_train reference song id `x` (0 if none)."""
    return int(_song_play_counts.get(x, 0))

# Vectorised lookup of every row's song in the counts table. This replaces
# a per-row dict lookup built with Series.iteritems(), which no longer
# exists in pandas 2.x. Ids without a count (e.g. missing ids) become 0.
df_train['count_song_played'] = (
    df_train['song_id'].map(_song_play_counts).fillna(0).astype(np.int64)
)

# Number of training rows in which each artist appears.
_artist_play_counts = df_train['artist_name'].value_counts()

def count_artist_played(x):
    """Return how many rows of df_train reference artist `x` (0 if none)."""
    return int(_artist_play_counts.get(x, 0))

# Vectorised lookup of every row's artist in the counts table. This replaces
# a per-row dict lookup built with Series.iteritems(), which no longer
# exists in pandas 2.x. Names without a count (e.g. missing names) become 0.
df_train['count_artist_played'] = (
    df_train['artist_name'].map(_artist_play_counts).fillna(0).astype(np.int64)
)

In [147]:
# check how many tasks we have: per-user non-null counts of every column,
# then the shape of the subset of users with more than 100 song_id entries
temp = df_train.groupby('msno').count()
temp[temp['song_id'].gt(100)].shape

## Construct numpy data matrices
Categorical columns:
- source_system_tab
- source_screen_name
- source_type
- genre_ids
- artist_name
- composer
- lyricist

In [200]:
# collect all categorical (object dtype) columns, leaving out the two id columns
_id_cols = ('msno', 'song_id')
catagorical_cols = [
    name for name in df_train.columns
    if name not in _id_cols and df_train[name].dtype == 'object'
]
print(catagorical_cols)

# make sure every categorical column holds str values
for col in catagorical_cols:
    df_train[col] = df_train[col].astype('str')

['source_system_tab', 'source_screen_name', 'source_type', 'genre_ids', 'artist_name', 'composer', 'lyricist']


In [253]:
# numerical columns: everything that is neither an id, the target, nor categorical
_non_numerical = set(catagorical_cols) | {'msno', 'song_id', 'target'}
numerical_cols = [name for name in df_train.columns if name not in _non_numerical]
print(numerical_cols)

# cast every numerical column to float
for col in numerical_cols:
    df_train[col] = df_train[col].astype('float')

# the target is cast to float as well
df_train['target'] = df_train['target'].astype('float')

['song_length', 'language', 'song_year', 'genre_count', 'lyricist_count', 'composer_count', 'artist_count', 'is_featured', 'artist_composer', 'artist_composer_lyricist', 'song_lang_boolean', 'smaller_song', 'count_song_played', 'count_artist_played']


In [204]:
# drop song_id column which is not useful
df_train.drop(columns=['song_id'], inplace=True)

In [322]:
# Running total of distinct values over all categorical columns. It is kept
# in `n_categories` rather than in a variable called `sum`, which would
# shadow the builtin sum() for every cell executed afterwards.
n_categories = 0

# print number of unique values per categorical column
for col in catagorical_cols:
    n_unique = df_train[col].unique().shape[0]
    n_categories += n_unique
    print('{:20}  {}'.format(col, n_unique))

# overall dimension of a single data point:
# categorical dimension + numerical dimension (one per numerical column)
dimension = n_categories + len(numerical_cols)
print('{:20} {}'.format('overall dimension:', dimension))

source_system_tab     9
source_screen_name    20
source_type           13
genre_ids             573
artist_name           40583
composer              76064
lyricist              33888
overall dimension:   151164


### Fit encoders for all categorical data

In [286]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def _make_dense_onehot_encoder():
    """Create a OneHotEncoder that returns dense arrays.

    scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and
    later releases no longer accept the old name, so the new keyword is
    tried first and the old one is used as a fallback.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # older scikit-learn: only the original keyword is known
        return OneHotEncoder(sparse=False)

# One LabelEncoder and one OneHotEncoder per categorical column, keyed by
# column name. The label encoder is fitted on the column's distinct values;
# the one-hot encoder is fitted on the resulting integer codes.
label_encoder = {}
onehot_encoder = {}
for col in catagorical_cols:
    label_encoder[col] = LabelEncoder()
    label_encoded = label_encoder[col].fit_transform(df_train[col].unique())
    onehot_encoder[col] = _make_dense_onehot_encoder()
    onehot_encoder[col].fit(label_encoded.reshape(-1, 1))

def get_onehot_encoded(group, col):
    """One-hot encode column `col` of `group` with the encoders fitted for it.

    The values of `group[col]` are first turned into integer codes by
    `label_encoder[col]`, then expanded by `onehot_encoder[col]`; the result
    of that second transform is returned.
    """
    codes = label_encoder[col].transform(group[col])
    return onehot_encoder[col].transform(codes.reshape(-1, 1))

In [331]:
# Group the training rows by user and count the non-null values per column.
grouped = df_train.groupby('msno')
grouped_count = grouped.count()

num_users = df_train['msno'].unique().shape[0]
# The message starts with '\n' + 'number'. The former '\number' was parsed as
# a newline followed by "umber", so the first letter was lost in the output.
print('\nnumber of total users {}'.format(num_users))
# Users with at least 1000 rows, measured by their count of
# source_system_tab values (the comparison is >=, hence "at least").
num_users_rich = grouped_count.loc[grouped_count['source_system_tab']>=1000].shape[0]
print('number of users with at least 1000 data: ', num_users_rich)


umber of total users 30755
number of users with more than 1000 data:  932


### Find user groups with enough data samples

In [342]:
import pickle
from scipy import sparse

# Limits for the loop below.
MIN_ROWS_PER_USER = 1000  # users with fewer rows are skipped
MAX_USERS = 1             # stop after this many users; set to None to process every eligible user

data = []
gt = []
# each data point is under the following format
# [0:9] source_system_tab
# [9:29] source_screen_name
# [29:42] source_type
# [42:615] genre_ids
# [615:41198] artist_name
# [41198:117262] composer
# [117262:151150] lyricist
# [151150: 151164] 14 numerical columns
count = 0
for name, group in grouped:
    if group.shape[0] < MIN_ROWS_PER_USER:
        continue

    # user data matrix: one block of columns per feature, categorical
    # one-hot blocks first, then one column per numerical feature.
    # Every block is converted to sparse before stacking, so the full-width
    # dense matrix is never allocated (and never re-copied once per column,
    # as repeated np.concatenate calls did).
    blocks = [sparse.csr_matrix(get_onehot_encoded(group, col))
              for col in catagorical_cols]
    blocks += [sparse.csr_matrix(group[col].values.reshape(-1, 1))
               for col in numerical_cols]
    user_data = sparse.hstack(blocks, format='csr')

    # user ground truth matrix (sparse, built from the 'target' column)
    user_gt = sparse.csr_matrix(group['target'])

    data.append(user_data)
    gt.append(user_gt)
    count += 1
    print("user ID: {}, data shape: {}".format(name, user_data.shape))
    if MAX_USERS is not None and count >= MAX_USERS:
        break

user ID: +22Q6EpFwjgJhiiGWz7GQUiq5yu0adEEZWH8j/fj19w=, data shape: (1006, 151164)


### save to pickle data files

In [339]:
# Write the list of feature matrices and the list of ground-truth matrices
# to one pickle file each.
for out_path, payload in (('./data.data', data), ('./gt.data', gt)):
    with open(out_path, 'bw') as f:
        pickle.dump(payload, f)

In [341]:
# shapes of the most recently built feature matrix and ground-truth matrix
for matrix in (user_data, user_gt):
    print(matrix.shape)
user_data

(1006, 151164)
(1, 1006)


<1006x151164 sparse matrix of type '<class 'numpy.float64'>'
	with 15925 stored elements in Compressed Sparse Row format>

## Deal with artists

In [190]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from numpy import argmax

In [188]:
# distinct artist names found in the training frame
artists = pd.unique(df_train['artist_name'])
print(artists)

['Bastille' 'Various Artists' 'Nas' ... 'JD Samson & MEN'
 '2002 Latin Love Songs' 'Salvina y Miren al Lobo']


In [193]:
# The encoders of this experiment get their own names. Binding them to
# `label_encoder` / `onehot_encoder` would replace the per-column encoder
# dicts that get_onehot_encoded() indexes by column name.

# integer encoder
artist_label_encoder = LabelEncoder()
integer_encoded = artist_label_encoder.fit_transform(artists)
print(integer_encoded)

# one-hot encoder with dense output; scikit-learn 1.2 renamed `sparse` to
# `sparse_output`, so the new keyword is tried first
try:
    artist_onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    artist_onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = artist_onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

# # invert first example
# inverted = artist_label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
# print(inverted)

[ 3277 31961 21372 ... 13653   187 25747]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['Bastille']


In [264]:
from scipy.sparse import lil_matrix
# Scratch check: fill a sparse matrix, then embed it in a larger dense array.
side = 1000
A = lil_matrix((side, side))
first_row_values = np.random.rand(100)
A[0, :100] = first_row_values
A[1, 100:200] = A[0, :100]
A.setdiag(np.random.rand(side))


# dense array with ten times as many rows; A goes into the top-left corner
B = np.zeros((10 * side, side))
B[:side, :side] = A.toarray()
B

array([[0.55962745, 0.68493498, 0.5639901 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.73142811, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.98572975, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [313]:
# Scratch check of np.concatenate on two arrays with the same number of rows.
a = np.random.rand(2, 3)
n = np.random.rand(2, 10)
np.concatenate((a, n), axis=1).shape  # not displayed: only the last expression is shown
np.concatenate((a,))


array([[0.25990086, 0.59799831, 0.2493278 ],
       [0.95983736, 0.47735211, 0.21574678]])

In [314]:
# Scratch check of np.delete: remove the column at index 1 of `a`.
np.delete(a, [1], axis=1)

array([[0.25990086, 0.2493278 ],
       [0.95983736, 0.21574678]])