In [2]:
# https://www.kaggle.com/asmitavikas/feature-engineered-0-68310
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math

In [3]:
import os
cwd = os.getcwd()
print("Current directory: %s" % cwd)

print('Loading data...')
data_path = '../../kaggle_data/'
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
songs = pd.read_csv(data_path + 'songs.csv')
members = pd.read_csv(data_path + 'members.csv',
                     parse_dates=['registration_init_time','expiration_date'])
songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')
print('Done loading...')

Current directory: /home/jiacheliu3/Kaggle/kkbox-music-recommendation/data
Loading data...
Done loading...


In [4]:
print('Member table:')
print(members.info())

print('Song table:')
print(songs.info())

print('Song extra info table:')
print(songs_extra.info())

Member table:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
msno                      34403 non-null object
city                      34403 non-null int64
bd                        34403 non-null int64
gender                    14501 non-null object
registered_via            34403 non-null int64
registration_init_time    34403 non-null datetime64[ns]
expiration_date           34403 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(3), object(2)
memory usage: 1.8+ MB
None
Song table:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
song_id        object
song_length    int64
genre_ids      object
artist_name    object
composer       object
lyricist       object
language       float64
dtypes: float64(1), int64(1), object(5)
memory usage: 122.6+ MB
None
Song extra info table:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2295971 entries, 0 to 2295970
Data 

# Engineering members table

In [5]:
members['membership_days'] = members['expiration_date'].subtract(members['registration_init_time']).dt.days.astype(np.int16)

members['registration_year'] = members['registration_init_time'].dt.year
members['registration_month'] = members['registration_init_time'].dt.month
members['registration_date'] = members['registration_init_time'].dt.day

members['expiration_year'] = members['expiration_date'].dt.year
members['expiration_month'] = members['expiration_date'].dt.month
members['expiration_date'] = members['expiration_date'].dt.day

# Engineering songs table

In [6]:
# Convert to lower case
# NaN values are kept for replacement
def to_lower_case(X, list_of_columns):
    for col in list_of_columns:
        print("Converting column %s" % col)
        X[col]=X[col].str.lower()

In [7]:
# Song length
MEAN_SONG_LENGTH = int(round(np.mean(songs['song_length'])))
STD_SONG_LENGTH = int(round(np.std(songs['song_length'])))
print("Song length mean: %i  std: %i" % (MEAN_SONG_LENGTH, STD_SONG_LENGTH))

Song length mean: 246993  std: 160920


In [8]:
# Fill with mean length
songs.song_length.fillna(MEAN_SONG_LENGTH,inplace=True)

In [9]:
def shorter_song(x):
    if x < MEAN_SONG_LENGTH:
        return 1
    return 0

songs['short_song'] = songs['song_length'].apply(shorter_song).astype(np.int8)
songs['short_song'] = songs['song_length'].apply(shorter_song).astype(np.int8)

In [10]:
# How many std is the song length from mean
def mean_length_dist(x):
    return x-MEAN_SONG_LENGTH

songs['mean_length_distance'] = songs['song_length'].apply(shorter_song).astype(np.uint16)
songs['mean_length_distance'] = songs['song_length'].apply(shorter_song).astype(np.uint16)

In [11]:
# Cast to lower letters
cols_to_lower=['artist_name', 'composer', 'lyricist']
to_lower_case(songs, cols_to_lower)

Converting column artist_name
Converting column composer
Converting column lyricist


# Engineering songs_extra table

In [12]:
'''
ISRC is in the form "CC-XXX-YY-NNNNN"
CC: Country code
XXX: Issuer
YY: Year
NNNNN: Unique identifier
'''
def split_isrc(x):
    if type(x) == str:
        country=x[0:2]
        issuer=x[2:5]
        if int(x[5:7]) > 17:
            year=1900 + int(x[5:7])
        else:
            year=2000 + int(x[5:7])
        unique_code=x[7:12]
        
        return country, issuer, year, unique_code
    else:
        return "empty", "empty", np.NaN, "empty"  
    


In [13]:
# Split isrc into 4 columns
S=songs_extra['isrc'].apply(split_isrc).to_frame()
songs_extra[['country_code', 'issuer', 'issue_year' ,'unique_id']]=pd.DataFrame(S['isrc'].tolist())

In [14]:
# Safely drop isrc
songs_extra.drop(['isrc'], axis=1, inplace=True)

In [15]:
# Rename column 'name' to 'song_name' to avoid confusion
songs_extra.rename(columns={"name": "song_name"}, inplace=True)

# Merge tables

In [16]:
# Merge songs
train = train.merge(songs, on='song_id', how='left')
test = test.merge(songs, on='song_id', how='left')

# Merge members
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

# Merge songs_extra
train = train.merge(songs_extra, on = 'song_id', how = 'left')
test = test.merge(songs_extra, on = 'song_id', how = 'left')

In [17]:
# List current columns
print(train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 31 columns):
msno                      object
song_id                   object
source_system_tab         object
source_screen_name        object
source_type               object
target                    int64
song_length               float64
genre_ids                 object
artist_name               object
composer                  object
lyricist                  object
language                  float64
short_song                float64
mean_length_distance      float64
city                      int64
bd                        int64
gender                    object
registered_via            int64
registration_init_time    datetime64[ns]
expiration_date           int64
membership_days           int16
registration_year         int64
registration_month        int64
registration_date         int64
expiration_year           int64
expiration_month          int64
song_name                 ob

In [17]:
# Convert to categorical
def to_categorical(X):
    print("Before convert to categorical")
    print(X.info())
    X_cat = pd.concat([
        X.select_dtypes([], ['object']),
        X.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex_axis(X.columns, axis=1)
    print("After convertion")
    print(X_cat.info())
    return X_cat

In [18]:
'''
    Strings will be hashed before fed to Tensorflow
    Thus no need to convert to categorical
'''

# # Convert to categorical
# train_cat=to_categorical(train)
# test_cat=to_categorical(test)

'\n    Strings will be hashed before fed to Tensorflow\n    Thus no need to convert to categorical\n'

In [23]:
# Remove newline characters from string fields
def replace_if_string(x):
    if type(x) is str:
        return x.replace('\n', '')
    else:
        return x

def remove_newline(X):
    X_nobreak = pd.concat([
        X.select_dtypes([], ['object']),
        X.select_dtypes(['object']).apply(lambda M: M.apply(lambda s: replace_if_string(s)))
        ], axis=1).reindex_axis(X.columns, axis=1)
    print("After removing newline")
    print(X_nobreak.info())
    return X_nobreak

In [24]:
# Remove newline from all columns
train=remove_newline(train)
test=remove_newline(test)

After removing newline
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 31 columns):
msno                      object
song_id                   object
source_system_tab         object
source_screen_name        object
source_type               object
target                    int64
song_length               float64
genre_ids                 object
artist_name               object
composer                  object
lyricist                  object
language                  float64
short_song                float64
mean_length_distance      float64
city                      int64
bd                        int64
gender                    object
registered_via            int64
registration_init_time    datetime64[ns]
expiration_date           int64
membership_days           int16
registration_year         int64
registration_month        int64
registration_date         int64
expiration_year           int64
expiration_month          int64
song_

In [25]:
# Output to csv
train.to_csv(data_path + "train_engineered.csv")
test.to_csv(data_path + "test_engineered.csv")

Phase 1 feature engineering:
0.66770 without cv

# # Phase 2 feature engineering

In [20]:
# Copy to new start
train_2=train.copy()
test_2=test.copy()

In [21]:
train_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 31 columns):
msno                      object
song_id                   object
source_system_tab         object
source_screen_name        object
source_type               object
target                    int64
song_length               float64
genre_ids                 object
artist_name               object
composer                  object
lyricist                  object
language                  float64
short_song                float64
mean_length_distance      float64
city                      int64
bd                        int64
gender                    object
registered_via            int64
registration_init_time    datetime64[ns]
expiration_date           int64
membership_days           int16
registration_year         int64
registration_month        int64
registration_date         int64
expiration_year           int64
expiration_month          int64
song_name                 ob

In [22]:
# GC old df
import gc
del train, test
del songs, members, songs_extra
gc.collect()

87

# Fille empty in genre_id/artist/composer/lyricist 

In [23]:
NO_GENRE_FILL="empty"
def genre_id_count(x):
    if x == NO_GENRE_FILL:
        return 0
    else:
        return x.count('|') + 1

In [24]:
# this add_categories can only be run once
train_2['genre_ids'].fillna(NO_GENRE_FILL,inplace=True)
test_2['genre_ids'].fillna(NO_GENRE_FILL,inplace=True)
train_2['genre_ids_count'] = train_2['genre_ids'].apply(genre_id_count).astype(np.int8)
test_2['genre_ids_count'] = test_2['genre_ids'].apply(genre_id_count).astype(np.int8)

In [25]:
NO_LYRICIST_FILL="empty"
def lyricist_count(x):
    if x == NO_LYRICIST_FILL:
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
    return sum(map(x.count, ['|', '/', '\\', ';']))

In [26]:
# this add_categories can only be run once
train_2['lyricist'].fillna(NO_LYRICIST_FILL,inplace=True)
test_2['lyricist'].fillna(NO_LYRICIST_FILL,inplace=True)
train_2['lyricists_count'] = train_2['lyricist'].apply(lyricist_count).astype(np.int8)
test_2['lyricists_count'] = test_2['lyricist'].apply(lyricist_count).astype(np.int8)

In [27]:
NO_COMPOSER_FILL="empty"
def composer_count(x):
    if x == NO_COMPOSER_FILL:
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

In [28]:
# this add_categories can only be run once
train_2['composer'].fillna(NO_COMPOSER_FILL,inplace=True)
test_2['composer'].fillna(NO_COMPOSER_FILL,inplace=True)
train_2['composer_count'] = train_2['composer'].apply(composer_count).astype(np.int8)
test_2['composer_count'] = test_2['composer'].apply(composer_count).astype(np.int8)

In [29]:
NO_ARTIST_FILL='empty'
    
def artist_count(x):
    if x == NO_ARTIST_FILL:
        return 0
    else:
        count = x.count(',') + x.count('&') + x.count('+') + x.count('|')
        # Add English separators
        count += x.count('and') + x.count('feat')
        # Add Chinese separators
        count += x.count('，') + x.count('、') + x.count('＆') + x.count('＋')
        
        return count

In [30]:
train_2['artist_name'].fillna(NO_ARTIST_FILL,inplace=True)
test_2['artist_name'].fillna(NO_ARTIST_FILL,inplace=True)
train_2['artist_count'] = train_2['artist_name'].apply(artist_count).astype(np.int8)
test_2['artist_count'] = test_2['artist_name'].apply(artist_count).astype(np.int8)

# Check if the song is featured/acoustic/instrumental/remix

In [33]:
def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0

In [34]:
# Find if the song is featured
train_2['is_featured_artist'] = train_2['artist_name'].apply(is_featured).astype(np.int8)
test_2['is_featured_artist'] = test_2['artist_name'].apply(is_featured).astype(np.int8)

In [35]:
# Find if the song is featured by song name
train_2['is_featured_song'] = train_2['song_name'].apply(is_featured).astype(np.int8)
test_2['is_featured_song'] = test_2['song_name'].apply(is_featured).astype(np.int8)

In [36]:
# Find if the song is featured
train_2['is_featured'] = train_2['is_featured_artist'] | train_2['is_featured_song']
test_2['is_featured'] = test_2['is_featured_artist'] | test_2['is_featured_song']

In [37]:
# Look at artist names that contains 'feat'
check_feat=train_2.loc[train_2['is_featured'] == 1]
check_feat[['artist_name', 'song_name']]

Unnamed: 0,artist_name,song_name
12,林俊傑 (jj lin),手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋
53,謝和弦 (r-chord),在沒有你以後 (feat. 張智成) (Without You)
55,林俊傑 (jj lin),手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋
137,劉佳,愛很美-feat.sara
180,david guetta,Hey Mama (feat. Nicki Minaj| Bebe Rexha & Afro...
184,major lazer,Cold Water (feat. Justin Bieber & MØ)
186,major lazer,Lean On (feat. MØ & DJ Snake)
187,david guetta,Titanium (feat. Sia)
207,alex goot feat. kurt hugo schneider & atc,Let Me Love You
414,謝和弦 (r-chord),在沒有你以後 (feat. 張智成) (Without You)


In [38]:
# Find the main and featuring artist names
def feat_name(x):
    if 'feat.' in str(x) :
        idx=x.find('feat.')
        feat_name=x[idx+5:].strip()
        return feat_name
    elif 'feat' in str(x) :
        idx=x.find('feat')
        feat_name=x[idx+5:].strip()
        return feat_name
    return "empty"
def main_name(x):
    if 'feat' in str(x) :
        idx=x.find('feat')
        main_name=x[:idx].strip()
        return main_name
    return x

In [40]:
# Split featuring artist names
train_2['feat_artist'] = train_2['artist_name'].apply(feat_name)
test_2['feat_artist'] = test_2['artist_name'].apply(feat_name)

train_2['main_artist'] = train_2['artist_name'].apply(main_name)
test_2['main_artist'] = test_2['artist_name'].apply(main_name)

# Split featuring song names
train_2['feat_song_name'] = train_2['song_name'].apply(feat_name)
test_2['feat_song_name'] = test_2['song_name'].apply(feat_name)

train_2['main_song_name'] = train_2['song_name'].apply(main_name)
test_2['main_song_name'] = test_2['song_name'].apply(main_name)


In [41]:
def is_remix(x):
    if 'remix' in str(x):
        return 1
    return 0

In [42]:
train_2['is_remix_artist']=train_2['artist_name'].apply(is_remix).astype(np.int8)
test_2['is_remix_artist']=test_2['artist_name'].apply(is_remix).astype(np.int8)

train_2['is_remix_song']=train_2['song_name'].apply(is_remix).astype(np.int8)
test_2['is_remix_song']=test_2['song_name'].apply(is_remix).astype(np.int8)

train_2['is_remix'] = train_2['is_remix_artist'] | train_2['is_remix_song']
test_2['is_remix'] = test_2['is_remix_artist'] | test_2['is_remix_song']

In [43]:
# Look at artist names that contains 'remix'
check_remix=train_2.loc[train_2['is_remix'] == 1]
check_remix[['artist_name','song_name']]

Unnamed: 0,artist_name,song_name
2901,dj remix factory,Where Are U Now (DJ Remixed)
12699,nowhere樂團 (nowhere band),1984 remix special
18877,蔡依林 (jolin tsai),MR.Q（All Night Party remixed by DJ Submarine） ...
26418,群星 (remix 101),男人KTV (Karaoke Men)
30596,various artists (bhangra remixes),The Fox (What Does The Fox Say?)
37821,g-dragon,THIS LOVE (FEAT.G.H remix)
38495,ayumi hamasaki (浜崎あゆみ),Feel the love 敢愛 (DJ Hello Kitty remix)
38607,dj remix factory,Payphone
39552,seoan,What is Right? (Downtempo remix)
49608,billboard masters,Dessert (remix) - Tribute to Dawin and Silento


In [44]:
def is_live(x):
    live_keywords=[' live', '现场', '現場', '演唱會', '演唱会', '演出']
    # some sings name start with 'live'
    if any(s in str(x) for s in live_keywords):
        return 1
    return 0

In [45]:

train_2['is_live_artist']=train_2['artist_name'].apply(is_live).astype(np.int8)
test_2['is_live_artist']=test_2['artist_name'].apply(is_live).astype(np.int8)

train_2['is_live_song']=train_2['song_name'].apply(is_live).astype(np.int8)
test_2['is_live_song']=test_2['song_name'].apply(is_live).astype(np.int8)

train_2['is_live'] = train_2['is_live_artist'] | train_2['is_live_song']
test_2['is_live'] = test_2['is_live_artist'] | test_2['is_live_song']

In [46]:
# Look at artist names that contains 'live'
check_live=train_2.loc[train_2['is_live'] == 1]
check_live[['artist_name', 'song_name']]

Unnamed: 0,artist_name,song_name
2380,五月天 (mayday),諾亞方舟【2012世界巡迴演唱會「諾亞方舟」主題曲】
8137,郭靜 (claire kuo),放肆一下 (Wild Things) - <Fun 4 一夏>福茂女朋友 演唱會主題歌
10058,五月天 (mayday),諾亞方舟【2012世界巡迴演唱會「諾亞方舟」主題曲】
10204,五月天 (mayday),如煙+如果還有明天 【五月天諾亞方舟世界巡迴演唱會LIVE版】
11520,五月天 (mayday),諾亞方舟【2012世界巡迴演唱會「諾亞方舟」主題曲】
13553,張國榮 (leslie cheung),我(熱情演唱會壓軸主題曲)
16676,五月天 (mayday),愛情萬歲【五月天諾亞方舟世界巡迴演唱會LIVE版】
16681,五月天 (mayday),2012【五月天諾亞方舟世界巡迴演唱會LIVE版】
23907,五月天 (mayday),諾亞方舟【2012世界巡迴演唱會「諾亞方舟」主題曲】
28772,五月天 (mayday),諾亞方舟【2012世界巡迴演唱會「諾亞方舟」主題曲】


In [47]:
def is_acoustic(x):
    acoustic_keywords=['acoustic', '原声', '原聲']
    # some sings name start with 'live'
    if any(s in str(x) for s in acoustic_keywords):
        return 1
    return 0

In [48]:
train_2['is_acoustic_artist']=train_2['artist_name'].apply(is_acoustic).astype(np.int8)
test_2['is_acoustic_artist']=test_2['artist_name'].apply(is_acoustic).astype(np.int8)

train_2['is_acoustic_song']=train_2['song_name'].apply(is_acoustic).astype(np.int8)
test_2['is_acoustic_song']=test_2['song_name'].apply(is_acoustic).astype(np.int8)

train_2['is_acoustic'] = train_2['is_acoustic_artist'] | train_2['is_acoustic_song']
test_2['is_acoustic'] = test_2['is_acoustic_artist'] | test_2['is_acoustic_song']

In [49]:
# Look at artist names that contains 'remix'
check_acoustic=train_2.loc[train_2['is_acoustic'] == 1]
check_acoustic[['artist_name', 'song_name']]

Unnamed: 0,artist_name,song_name
103,相愛穿梭千年 電視原聲帶,相愛不能見
112,16個夏天 電視原聲帶,浪漫來襲 (Romance Strikes)
144,生日快樂電影原聲帶,I MISS YOU
146,生日快樂電影原聲帶,生日快樂
149,生日快樂電影原聲帶,回憶
213,16個夏天 電視原聲帶,公轉自轉 (Gong Zhuan Zi Zhuan)
467,沒關係 是愛情啊 電視原聲帶 volume 1,最佳的幸運 (Sung by CHEN(EXO))
606,16個夏天 電視原聲帶,以後別做朋友 (Yi Hou Bie Zuo Peng You)
662,22k夢想高飛 電視原聲帶,藏不住
855,我的祕密飯店 電視原聲帶,Secret Killer (Inst.) - Studio Musicians


In [50]:
def is_instrumental(x):
    intru_keywords=['instrumental', 'インスト', '配樂', '配乐', '伴奏']
    # some sings name start with 'live'
    if any(s in str(x) for s in intru_keywords):
        return 1
    return 0

In [51]:
train_2['is_instrumental_artist']=train_2['artist_name'].apply(is_instrumental).astype(np.int8)
test_2['is_instrumental_artist']=test_2['artist_name'].apply(is_instrumental).astype(np.int8)

train_2['is_instrumental_song']=train_2['song_name'].apply(is_instrumental).astype(np.int8)
test_2['is_instrumental_song']=test_2['song_name'].apply(is_instrumental).astype(np.int8)

train_2['is_instrumental'] = train_2['is_instrumental_artist'] | train_2['is_instrumental_song']
test_2['is_instrumental'] = test_2['is_instrumental_artist'] | test_2['is_instrumental_song']

In [52]:
# Look at artist names that contains 'instrumental'
check_instrumental=train_2.loc[train_2['is_instrumental'] == 1]
check_instrumental[['artist_name', 'song_name']]

Unnamed: 0,artist_name,song_name
22075,piano music for christmas & instrumental chris...,On a December Night - Violin and Piano Xmas Carol
22076,piano music for christmas & instrumental chris...,Merry Christmas - Solo Piano Music
22077,piano music for christmas & instrumental chris...,The Gift of Christmas
24964,黃慧音,藥師灌頂真言(梵音唱誦-鋼琴伴奏版)
40183,various artists,Love Is… (輕快漫畫配樂版) (金嘆與恩尚的心動之歌)
41854,the art of instrumental,Concerto A Cinque: I. Allegro moderato
46749,the art of instrumental,Sinfonie D-Dur: III. Presto
46750,the art of instrumental,Sarabande: 2. Satz
50677,[逆轉勝] 五月天∕怪獸 原聲原創紀 ([second chance] soundtrack...,光榮 【電影[逆轉勝]配樂】
53031,ジブリサウンドトラック,風のとおり道 (インストゥルメンタル)


# Check if artist/composer/lyricist is the same person

In [54]:
# if artist is same as composer
train_2['artist_is_composer'] = (train_2['main_artist'] == train_2['composer']).astype(np.int8)
test_2['artist_is_composer'] = (test_2['main_artist'] == test_2['composer']).astype(np.int8)

# if artist, lyricist and composer are all three same
train_2['artist_is_composer_is_lyricist'] = ((train_2['main_artist'] == train_2['composer']) & (train_2['main_artist'] == train_2['lyricist'])).astype(np.int8)
test_2['artist_is_composer_is_lyricist'] = ((test_2['main_artist'] == test_2['composer']) & (test_2['main_artist'] == test_2['lyricist'])).astype(np.int8)

# Language related

In [55]:
train_2.language.unique()

array([ 52.,  -1.,  31.,   3.,  17.,  10.,  24.,  59.,  45.,  nan,  38.])

In [56]:
train_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 56 columns):
msno                              object
song_id                           object
source_system_tab                 object
source_screen_name                object
source_type                       object
target                            int64
song_length                       float64
genre_ids                         object
artist_name                       object
composer                          object
lyricist                          object
language                          float64
short_song                        float64
mean_length_distance              float64
city                              int64
bd                                int64
gender                            object
registered_via                    int64
registration_init_time            datetime64[ns]
expiration_date                   int64
membership_days                   int16
registration_year   

In [57]:
'''
    Conclusion: 
        3.0: Chinese
        52.0: English
        17.0: Japanese
        45.0: Thai
        10.0: Taiwanese
        24.0: Cantonese
        38.0: Hindi
        59.0: Seems to be Chinese
        31.0: Korean
'''
train_2.loc[train_2['language'] == 31.0][['artist_name','song_name','language']]

Unnamed: 0,artist_name,song_name,language
6,bigbang taeyang,眼| 鼻| 口 (Eyes| Nose| Lips),31.0
15,various artists,You Are My Everything - Gummy,31.0
56,exo,LUCKY,31.0
57,exo,Run,31.0
59,exo,LIGHTSABER,31.0
77,ftisland,Gobaeghabnida,31.0
81,ftisland,Stand By Me,31.0
86,taeyeon,Secret,31.0
88,taeyeon,Rain,31.0
202,劉在錫 x exo,Dancing King,31.0


In [58]:
# Magic feature
# is song language 17 or 45. 
def song_lang_boolean(x):
    if '17.0' in str(x) or '45.0' in str(x):
        return 1
    return 0

train_2['song_lang_magic'] = train_2['language'].apply(song_lang_boolean).astype(np.int8)
test_2['song_lang_magic'] = test_2['language'].apply(song_lang_boolean).astype(np.int8)

In [59]:
def merge_dicts(dict0, dict1):
    for k in dict1:
        if k in dict0:
            dict0[k]+=dict1[k]
        else:
            dict0[k]=dict1[k]
    return dict0

def find_item(dict, x):
    if x in dict:
        return dict[x]
    return 0

In [60]:
# number of times a song has been played before
_dict_count_song_played_train = {k: v for k, v in train_2['unique_id'].value_counts().iteritems()}
_dict_count_song_played_test = {k: v for k, v in test_2['unique_id'].value_counts().iteritems()}
_dict_count_song_played=merge_dicts(_dict_count_song_played_train, _dict_count_song_played_test)

# Remove empty
_dict_count_song_played.pop('empty', None)

train_2['count_song_played'] = train_2['unique_id'].apply(lambda x: find_item(_dict_count_song_played, x)).astype(np.int16)
test_2['count_song_played'] = test_2['unique_id'].apply(lambda x: find_item(_dict_count_song_played, x)).astype(np.int16)

In [62]:
# number of times the artist has been played
_dict_count_artist_played_train = {k: v for k, v in train_2['artist_name'].value_counts().iteritems()}
_dict_count_artist_played_test = {k: v for k, v in test_2['artist_name'].value_counts().iteritems()}
_dict_count_artist_played=merge_dicts(_dict_count_artist_played_train, _dict_count_artist_played_test)

# Remove empty
_dict_count_artist_played.pop(NO_ARTIST_FILL, None)

train_2['count_artist_played'] = train_2['artist_name'].apply(lambda x: find_item(_dict_count_artist_played, x)).astype(np.int16)
test_2['count_artist_played'] = test_2['artist_name'].apply(lambda x: find_item(_dict_count_artist_played, x)).astype(np.int16)


In [63]:
# number of times the composer has been played
_dict_count_composer_played_train = {k: v for k, v in train_2['composer'].value_counts().iteritems()}
_dict_count_composer_played_test = {k: v for k, v in test_2['composer'].value_counts().iteritems()}
_dict_count_composer_played=merge_dicts(_dict_count_composer_played_train, _dict_count_composer_played_test)

# Remove empty
_dict_count_composer_played.pop(NO_COMPOSER_FILL, None)

train_2['count_composer_played'] = train_2['composer'].apply(lambda x: find_item(_dict_count_composer_played, x)).astype(np.int16)
test_2['count_composer_played'] = test_2['composer'].apply(lambda x: find_item(_dict_count_composer_played, x)).astype(np.int16)

In [64]:
# number of times the lyricist has been played
_dict_count_lyricist_played_train = {k: v for k, v in train_2['lyricist'].value_counts().iteritems()}
_dict_count_lyricist_played_test = {k: v for k, v in test_2['lyricist'].value_counts().iteritems()}
_dict_count_lyricist_played=merge_dicts(_dict_count_lyricist_played_train, _dict_count_lyricist_played_test)

# Remove empty
_dict_count_lyricist_played.pop(NO_LYRICIST_FILL, None)

train_2['count_lyricist_played'] = train_2['lyricist'].apply(lambda x: find_item(_dict_count_lyricist_played, x)).astype(np.int16)
test_2['count_lyricist_played'] = test_2['lyricist'].apply(lambda x: find_item(_dict_count_lyricist_played, x)).astype(np.int16)

In [65]:
# number of times the member performs an action
_dict_count_member_train = {k: v for k, v in train_2['msno'].value_counts().iteritems()}
_dict_count_member_test = {k: v for k, v in test_2['msno'].value_counts().iteritems()}
_dict_count_member=merge_dicts(_dict_count_member_train, _dict_count_member_test)

# Seems no empty slots

train_2['count_member_action'] = train_2['msno'].apply(lambda x: find_item(_dict_count_member, x)).astype(np.int16)
test_2['count_member_action'] = test_2['msno'].apply(lambda x: find_item(_dict_count_member, x)).astype(np.int16)

In [66]:
# Average times a member plays a song per registered day
train_2['member_action_per_day'] = train_2['count_member_action']/train_2['membership_days']
test_2['member_action_per_day'] = test_2['count_member_action']/test_2['membership_days']

End of phase 2


In [67]:
train_2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 63 columns):
msno                              object
song_id                           object
source_system_tab                 object
source_screen_name                object
source_type                       object
target                            int64
song_length                       float64
genre_ids                         object
artist_name                       object
composer                          object
lyricist                          object
language                          float64
short_song                        float64
mean_length_distance              float64
city                              int64
bd                                int64
gender                            object
registered_via                    int64
registration_init_time            datetime64[ns]
expiration_date                   int64
membership_days                   int16
registration_year   

In [68]:
# Output to csv
train_2.to_csv(data_path + "train_engineered.csv")
test_2.to_csv(data_path + "test_engineered.csv")

# Phase 3

In [71]:
train_3=train_2.copy()
test_3=test_2.copy()

In [101]:
del train_2, test_2
gc.collect()

720

In [72]:
# One hot-convert genre ids
GENRE_SET=set([])

def collect_genre(x):
    parts=x.split('|')
    global GENRE_SET
    siz=len(GENRE_SET)
    for s in parts:
        if s=='empty':
            return
        else:
            GENRE_SET.add(int(s))
    if len(GENRE_SET)>siz:
        print("Now %i genres" % len(GENRE_SET))

In [73]:
train_3['genre_ids'].apply(collect_genre)
test_3['genre_ids'].apply(collect_genre)

Now 1 genres
Now 2 genres
Now 3 genres
Now 4 genres
Now 5 genres
Now 6 genres
Now 7 genres
Now 8 genres
Now 9 genres
Now 10 genres
Now 11 genres
Now 12 genres
Now 13 genres
Now 14 genres
Now 15 genres
Now 16 genres
Now 18 genres
Now 19 genres
Now 20 genres
Now 23 genres
Now 24 genres
Now 25 genres
Now 26 genres
Now 27 genres
Now 28 genres
Now 29 genres
Now 30 genres
Now 31 genres
Now 35 genres
Now 36 genres
Now 38 genres
Now 39 genres
Now 40 genres
Now 42 genres
Now 43 genres
Now 44 genres
Now 45 genres
Now 46 genres
Now 47 genres
Now 48 genres
Now 50 genres
Now 51 genres
Now 53 genres
Now 54 genres
Now 55 genres
Now 56 genres
Now 57 genres
Now 58 genres
Now 59 genres
Now 60 genres
Now 61 genres
Now 62 genres
Now 63 genres
Now 64 genres
Now 65 genres
Now 66 genres
Now 67 genres
Now 68 genres
Now 69 genres
Now 70 genres
Now 71 genres
Now 72 genres
Now 73 genres
Now 74 genres
Now 75 genres
Now 76 genres
Now 77 genres
Now 78 genres
Now 79 genres
Now 80 genres
Now 81 genres
Now 82 genres
N

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
2556760    None
2556761    None
2556762    None
2556763    None
2556764    None
2556765    None
2556766    None
2556767    None
2556768    None
2556769    None
2556770    None
2556771    None
2556772    None
2556773    None
2556774    None
2556775    None
2556776    None
2556777    None
2556778    None
2556779    None
2556780    None
2556781    None
2556782    None
2556783    None
2556784    None
2556785    None
2556786    None
2556787    None
2556788    None
2556789    None
Name: genre_ids, Length:

In [84]:
def find_genre(x, g):
    return g in x

# For each genre generate one column whether the song has this genre
# Estimated 172 features to generate, it takes very long
i=1
for k in GENRE_SET:
    new_feature_name='genre_'+str(k)
    print("New column genre %i, %i/%i" % (k, i, len(GENRE_SET)))
    
    train_3[new_feature_name]=train_3['genre_ids'].apply(lambda x: find_genre(x, str(k))).astype(np.int8)
    test_3[new_feature_name]=train_3['genre_ids'].apply(lambda x: find_genre(x, str(k))).astype(np.int8)
    
    i+=1

New column genre 1026
New column genre 516
New column genre 2052
New column genre 1033
New column genre 2058
New column genre 1040
New column genre 2065
New column genre 531
New column genre 1047
New column genre 2072
New column genre 1054
New column genre 2079
New column genre 1568
New column genre 545
New column genre 1572
New column genre 1061
New column genre 2086
New column genre 1579
New column genre 1068
New column genre 2093
New column genre 2045
New column genre 2100
New column genre 1082
New column genre 2107
New column genre 2109
New column genre 1598
New column genre 1089
New column genre 2116
New column genre 1605
New column genre 1096
New column genre 1609
New column genre 2122
New column genre 1103
New column genre 1616
New column genre 2127
New column genre 2130
New column genre 1110
New column genre 87
New column genre 1117
New column genre 94
New column genre 95
New column genre 1630
New column genre 1633
New column genre 2144
New column genre 1124
New column genre 10

In [85]:
train_3['genre_212']

0          False
1          False
2          False
3          False
4          False
5          False
6          False
7          False
8          False
9          False
10         False
11         False
12         False
13         False
14         False
15         False
16         False
17         False
18         False
19         False
20         False
21         False
22         False
23         False
24         False
25         False
26         False
27         False
28         False
29         False
           ...  
7377388    False
7377389    False
7377390    False
7377391    False
7377392    False
7377393    False
7377394    False
7377395    False
7377396    False
7377397    False
7377398    False
7377399    False
7377400    False
7377401    False
7377402    False
7377403    False
7377404    False
7377405    False
7377406    False
7377407    False
7377408    False
7377409    False
7377410    False
7377411    False
7377412    False
7377413    False
7377414    False
7377415    Fal

In [90]:
genre_names=["genre_"+str(x) for x in list(GENRE_SET)]

In [96]:
# Drop genre_ids
train_3.drop(['genre_ids'], inplace=True)

ValueError: labels ['genre_ids'] not contained in axis

In [None]:
# Output to file for later uses
train_3.to_csv(data_path+'train_genres.csv', columns=genre_names)
test_3.to_csv(data_path+'test_genres.csv', columns=genre_names)

In [97]:
train_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Columns: 235 entries, msno to genre_509
dtypes: bool(172), datetime64[ns](1), float64(6), int16(6), int64(10), int8(22), object(18)
memory usage: 3.7+ GB


In [98]:
train_3['genre_212']

0          False
1          False
2          False
3          False
4          False
5          False
6          False
7          False
8          False
9          False
10         False
11         False
12         False
13         False
14         False
15         False
16         False
17         False
18         False
19         False
20         False
21         False
22         False
23         False
24         False
25         False
26         False
27         False
28         False
29         False
           ...  
7377388    False
7377389    False
7377390    False
7377391    False
7377392    False
7377393    False
7377394    False
7377395    False
7377396    False
7377397    False
7377398    False
7377399    False
7377400    False
7377401    False
7377402    False
7377403    False
7377404    False
7377405    False
7377406    False
7377407    False
7377408    False
7377409    False
7377410    False
7377411    False
7377412    False
7377413    False
7377414    False
7377415    Fal

# Special character handling

In [112]:
# Check the number of unique artist and song names
artist_names=set(train_3['artist_name'].unique()) | set(test_3['artist_name'].unique())
print("%i unique artist names" % len(artist_names))
del artist_names

song_names=set(train_3['song_name'].unique()) | set(test_3['song_name'].unique())
print("%i unique song names" % len(song_names))
del song_names

46286 unique artist names
269356 unique song names


In [99]:
def split_parenthesis(x):
    if '(' in str(x):
        pre=s=x[:x.find("(")].strip()
        mid=x[x.find("(")+1:x.find(")")].strip()
        post=x[x.rfind(")")+1:].strip()
    elif '（' in str(x):
        pre=x[:x.find("（")].strip()
        mid=x[x.find("（")+1:x.find("）")].strip()
        post=x[x.rfind("）")+1:].strip()
    elif '「' in str(x):
        pre=x[:x.find("「")].strip()
        mid=x[x.find("「")+1:x.find("」")].strip()
        post=x[x.rfind("」")+1:].strip()
    elif '『' in str(x):
        pre=x[:x.find("『")].strip()
        mid=x[x.find("（『")+1:x.find("』")].strip()
        post=x[x.rfind("』")+1:].strip()
    else:
        pre=x
        mid='empty'
        post='empty'
    return pre, mid, post

In [113]:
'''
    Reduced to 46188 after removing parenthesis, not so useful
'''

par_train=train_3['artist_name'].apply(split_parenthesis).to_frame()
train_3[['artist_pre_parenthesis', 'artist_in_parenthesis', 'artist_post_parenthesis']]=pd.DataFrame(par_train['artist_name'].tolist())

par_test=test_3['artist_name'].apply(split_parenthesis).to_frame()
test_3[['artist_pre_parenthesis', 'artist_in_parenthesis', 'artist_post_parenthesis']]=pd.DataFrame(par_test['artist_name'].tolist())

# How many artist names are there now
artist_names=set(train_3['artist_pre_parenthesis'].unique()) | set(test_3['artist_pre_parenthesis'].unique())
print("%i unique artist names without parenthesis" % len(artist_names))
del artist_names

46188 unique artist names without parenthesis


In [114]:
# Apply to song name as well
par_train=train_3['song_name'].apply(split_parenthesis).to_frame()
train_3[['song_pre_parenthesis', 'song_in_parenthesis', 'song_post_parenthesis']]=pd.DataFrame(par_train['song_name'].tolist())

par_test=test_3['song_name'].apply(split_parenthesis).to_frame()
test_3[['song_pre_parenthesis', 'song_in_parenthesis', 'song_post_parenthesis']]=pd.DataFrame(par_test['song_name'].tolist())


'''
    Reduced to 237745 after removing parenthesis, not too bad
'''
song_names=set(train_3['song_pre_parenthesis'].unique()) | set(test_3['song_pre_parenthesis'].unique())
print("%i unique song names without parenthesis" % len(song_names))
del song_names

237745 unique song names without parenthesis


In [117]:
# Bar character
def pre_bar(x):
    if '|' in str(x):
        s=x[:x.find("|")].strip()
    elif '|' in str(x):
        s=x[:x.find("|")].strip()
    else:
        s=x.strip()
    if s=="":
        return 'empty'
    else:
        return s
def post_bar(x):
    if '|' in str(x):
        s=x[x.rfind("|")+1:].strip()
    elif '|' in str(x):
        s=x[x.rfind("|")+1:].strip()
    else:
        s=""
    if s=="":
        return 'empty'
    else:
        return s

In [118]:
train_3['artist_pre_bar']=train_3['artist_name'].apply(pre_bar)
test_3['artist_pre_bar']=test_3['artist_name'].apply(pre_bar)

train_3['artist_post_bar']=train_3['artist_name'].apply(post_bar)
test_3['artist_post_bar']=test_3['artist_name'].apply(post_bar)

In [120]:
# Slash
def pre_slash(x):
    if '/' in str(x):
        s=x[:x.find("/")].strip()
    elif '、' in str(x):
        s=x[:x.find("、")].strip()
    else:
        s=x.strip()
    if s=="":
        return 'empty'
    else:
        return s
def post_slash(x):
    if '/' in str(x):
        s=x[x.rfind("/")+1:].strip()
    elif '、' in str(x):
        s=x[x.rfind("、")+1:].strip()
    else:
        s=""
    if s=="":
        return 'empty'
    else:
        return s

In [121]:
train_3['artist_pre_slash']=train_3['artist_name'].apply(pre_slash)
test_3['artist_pre_slash']=test_3['artist_name'].apply(pre_slash)

train_3['artist_post_slash']=train_3['artist_name'].apply(post_slash)
test_3['artist_post_slash']=test_3['artist_name'].apply(post_slash)

In [125]:
# And character
def pre_and(x):
    if '&' in str(x):
        s=x[:x.find("&")].strip()
    elif '＆' in str(x):
        s=x[:x.find("＆")].strip()
    else:
        s=x.strip()
    if s=="":
        return 'empty'
    else:
        return s
def post_and(x):
    if '&' in str(x):
        s=x[x.rfind("&")+1:].strip()
    elif '＆' in str(x):
        s=x[x.rfind("＆")+1:].strip()
    else:
        s=""
    if s=="":
        return 'empty'
    else:
        return s

In [126]:

train_3['artist_pre_and']=train_3['artist_name'].apply(pre_and)
test_3['artist_pre_and']=test_3['artist_name'].apply(pre_and)

train_3['artist_post_and']=train_3['artist_name'].apply(post_and)
test_3['artist_post_and']=test_3['artist_name'].apply(post_and)

In [127]:
# Plus
def pre_plus(x):
    if '+' in str(x):
        s=x[:x.find("+")].strip()
    else:
        s=x.strip()
    if s=="":
        return 'empty'
    else:
        return s
def post_plus(x):
    if '+' in str(x):
        s=x[x.rfind("+")+1:].strip()
    else:
        s=""
    if s=="":
        return 'empty'
    else:
        return s

In [128]:
train_3['artist_pre_plus']=train_3['artist_name'].apply(pre_plus)
test_3['artist_pre_plus']=test_3['artist_name'].apply(pre_plus)

train_3['artist_post_plus']=train_3['artist_name'].apply(post_plus)
test_3['artist_post_plus']=test_3['artist_name'].apply(post_plus)

In [130]:
# Comma
def pre_comma(x):
    if ',' in str(x):
        s=x[:x.find(",")].strip()
    elif '，' in str(x):
        s=x[:x.find("，")].strip()
    else:
        s=x.strip()
    if s=="":
        return 'empty'
    else:
        return s
def post_comma(x):
    if ',' in str(x):
        s=x[x.rfind(",")+1:].strip()
    elif '，' in str(x):
        s=x[:x.find("，")].strip()
    else:
        s=""
    if s=="":
        return 'empty'
    else:
        return s

In [131]:
train_3['artist_pre_comma']=train_3['artist_name'].apply(pre_comma)
test_3['artist_pre_comma']=test_3['artist_name'].apply(pre_comma)

train_3['artist_post_comma']=train_3['artist_name'].apply(post_comma)
test_3['artist_post_comma']=test_3['artist_name'].apply(post_comma)

In [134]:
# Space
def pre_space(x):
    if ' ' in str(x):
        s=x[:x.find(" ")].strip()
    else:
        s=x.strip()
    if s=="":
        return 'empty'
    else:
        return s
def post_space(x):
    if ' ' in str(x):
        s=x[x.rfind(" ")+1:].strip()
    else:
        s=""
    if s=="":
        return 'empty'
    else:
        return s

In [135]:
train_3['artist_pre_space']=train_3['artist_name'].apply(pre_space)
test_3['artist_pre_space']=test_3['artist_name'].apply(pre_space)

train_3['artist_post_space']=train_3['artist_name'].apply(post_space)
test_3['artist_post_space']=test_3['artist_name'].apply(post_space)

In [114]:
def find_special_char(x, c):
    i=x.find(c)
    if i==-1:
        return len(x)
    return i
def before_special_char(x):
    print("String %s" % x)
    #"|"
    bar_idx=min(find_special_char(x,"|"),find_special_char(x,"|"))
    print("Bar at %i" % bar_idx)
    # slash
    slash_idx=min(find_special_char(x,"/"),find_special_char(x,"、"))
    print("Slash at %i" % slash_idx)
    # and
    and_idx=min(find_special_char(x,"&"),find_special_char(x,"＆"))
    print("And at %i" % and_idx)
    # plus
    plus_idx=find_special_char(x,"+")
    print("Plus at %i" % plus_idx)
    # comma
    comma_idx=min(find_special_char(x,","),find_special_char(x,"，"))
    print("Comma at %i" % comma_idx)
    # parenthesis
    par_idx=min(find_special_char(x, "("), find_special_char(x, ")"), find_special_char(x, "（"), find_special_char(x, "）"))
    print("Parenthesis at %i" % par_idx)
    
    before_special=min(bar_idx, slash_idx, and_idx, plus_idx, comma_idx, par_idx)
    extract=x[:before_special].strip()
    print("Before everything %i gets %s" % (before_special, extract))
    return extract
    
    

In [110]:
train_3['artist_name_main']=train_3['artist_name'].apply(before_special_char)
test_3['artist_name_main']=test_3['artist_name'].apply(before_special_char)

In [111]:
test_3['artist_name_main']

0                                  梁文音
1                                  林俊傑
2                         yu takahashi
3                                   u2
4                        yoga mr sound
5                               lee hi
6                                  boa
7                                  周傳雄
8                                  嚴藝丹
9                              bigbang
10                                 潘瑋柏
11                         hoshino gen
12                                 林宥嘉
13                                 賴銘偉
14                                 蕭敬騰
15                     伍佰 & china blue
16                                  楊靜
17                       martin garrix
18                                原子邦妮
19                                  阿牛
20                                 周杰倫
21                                 戴佩妮
22                                 方大同
23                                 陳奕迅
24                             fun4 樂團
25                       

In [112]:
ARTIST_MAIN=set()
def collect_artist_main(x):
    global ARTIST_MAIN
    ARTIST_MAIN.add(x)
train_3['artist_name_main'].apply(collect_artist_main)
test_3['artist_name_main'].apply(collect_artist_main)
print("%i artist names found after extracting special char" % len(ARTIST_MAIN))

44600 artist names found after extracting special char


In [132]:
# Find the non-English parts from artist names
NON_ENGLISH=set()
def is_english(x):
    if all(ord(char) < 128 for char in str(x)):
        return
    else:
        global NON_ENGLISH
        NON_ENGLISH.add(x.strip())
train_3['artist_name'].apply(is_english)
test_3['artist_name'].apply(is_english)
    

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
2556760    None
2556761    None
2556762    None
2556763    None
2556764    None
2556765    None
2556766    None
2556767    None
2556768    None
2556769    None
2556770    None
2556771    None
2556772    None
2556773    None
2556774    None
2556775    None
2556776    None
2556777    None
2556778    None
2556779    None
2556780    None
2556781    None
2556782    None
2556783    None
2556784    None
2556785    None
2556786    None
2556787    None
2556788    None
2556789    None
Name: artist_name, Lengt

In [133]:
not_eng=open('non_english.txt', 'w+')
for s in NON_ENGLISH:
    not_eng.write("%s\n" % s)
not_eng.close()

In [136]:
def in_title_mark(x):
    if '《' in str(x):
        s=x[x.find("《")+1:x.find("》")].strip()
    else:
        s='empty'
    if s=="":
        return 'empty'
    else:
        return s
def pre_title_mark(x):
    if '《' in str(x):
        s=x[:x.find("《")].strip()
    else:
        s='empty'
    if s=="":
        return 'empty'
    else:
        return s
def post_title_mark(x):
    if '》' in str(x):
        s=x[x.rfind("》")+1:].strip()
    else:
        s="empty"
    if s=="":
        return 'empty'
    else:
        return s

In [137]:
train_3['artist_in_title']=train_3['artist_name'].apply(in_title_mark)
test_3['artist_in_title']=test_3['artist_name'].apply(in_title_mark)
train_3['artist_pre_title']=train_3['artist_name'].apply(pre_title_mark)
test_3['artist_pre_title']=test_3['artist_name'].apply(pre_title_mark)
train_3['artist_post_title']=train_3['artist_name'].apply(post_title_mark)
test_3['artist_post_title']=test_3['artist_name'].apply(post_title_mark)