## id_process

In [2]:
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


## load the data
members = pd.read_csv('../source_data/members.csv')
songs = pd.read_csv('../source_data/songs.csv')
songs_extra = pd.read_csv('../source_data/song_extra_info.csv')
train = pd.read_csv('../source_data/train.csv')
test = pd.read_csv('../source_data/test.csv')


# Keep only the songs / members that occur in train or test.
# pd.concat is the public replacement for the private Series._append, and
# isin() + copy() replaces the temporary 'appeared' column that used to be
# dropped in place from a filtered slice (SettingWithCopyWarning).
song_id_set = set(pd.concat([train['song_id'], test['song_id']]))

songs = songs[songs['song_id'].isin(song_id_set)].copy()
songs_extra = songs_extra[songs_extra['song_id'].isin(song_id_set)].copy()

msno_set = set(pd.concat([train['msno'], test['msno']]))

members = members[members['msno'].isin(msno_set)].copy()

print('Data loaded.')


## preprocess msno and song_id
# Member ids: the encoder is fitted on the members table and then applied to
# members, train and test, so all three share the same integer ids.
msno_encoder = LabelEncoder()
msno_encoder.fit(members['msno'].values)
members['msno'] = msno_encoder.transform(members['msno'])
train['msno'] = msno_encoder.transform(train['msno'])
test['msno'] = msno_encoder.transform(test['msno'])

print('MSNO done.')

# Song ids: the encoder is fitted on every song id seen in train or test
# (pd.concat is the public replacement for the private Series._append).
song_id_encoder = LabelEncoder()
song_id_encoder.fit(pd.concat([train['song_id'], test['song_id']]))
songs['song_id'] = song_id_encoder.transform(songs['song_id'])
songs_extra['song_id'] = song_id_encoder.transform(songs_extra['song_id'])
train['song_id'] = song_id_encoder.transform(train['song_id'])
test['song_id'] = song_id_encoder.transform(test['song_id'])

print('Song_id done.')

## preprocess the features in train.csv & test.csv
# Each source column gets its own encoder, fitted on train and test together
# so both files share one id space. pd.concat replaces the private
# Series._append.
columns = ['source_system_tab', 'source_screen_name', 'source_type']
for column in columns:
    column_encoder = LabelEncoder()
    column_encoder.fit(pd.concat([train[column], test[column]]))
    train[column] = column_encoder.transform(train[column])
    test[column] = column_encoder.transform(test[column])

print('Source information done.')


## preprocess the features in members.csv
# Integer-encode the categorical member columns, one encoder per column.
columns = ['city', 'gender', 'registered_via']
for column in columns:
    column_encoder = LabelEncoder().fit(members[column])
    members[column] = column_encoder.transform(members[column])

# Turn the YYYYMMDD values into epoch seconds.
# NOTE(review): time.mktime interprets the parsed date in the machine's local
# time zone, so the numbers depend on where this runs.
for date_column in ('registration_init_time', 'expiration_date'):
    members[date_column] = members[date_column].apply(
        lambda raw: time.mktime(time.strptime(str(raw), '%Y%m%d')))

print('Members information done.')

## preprocess the features in songs.csv
# genre_ids is split on '|'. Columns 0-2 of genre_id receive the first three
# ids (0 when absent) and column 3 the total number of ids listed. Rows whose
# genre_ids is not a string are skipped and stay all-zero.
genre_id = np.zeros((len(songs), 4))
# Materialise the column once instead of on every loop iteration.
raw_genres = songs['genre_ids'].values
for i, raw in enumerate(raw_genres):
    if not isinstance(raw, str):
        continue
    ids = raw.split('|')
    for j, genre in enumerate(ids[:3]):
        genre_id[i, j] = int(genre)
    genre_id[i, 3] = len(ids)
songs['first_genre_id'] = genre_id[:, 0]
songs['second_genre_id'] = genre_id[:, 1]
songs['third_genre_id'] = genre_id[:, 2]
songs['genre_id_cnt'] = genre_id[:, 3]

# One encoder is fitted on all three genre columns so that they share a
# single id space (pd.concat replaces the private Series._append).
genre_encoder = LabelEncoder()
genre_encoder.fit(pd.concat([songs.first_genre_id, songs.second_genre_id,
                             songs.third_genre_id]))
songs['first_genre_id'] = genre_encoder.transform(songs['first_genre_id'])
songs['second_genre_id'] = genre_encoder.transform(songs['second_genre_id'])
songs['third_genre_id'] = genre_encoder.transform(songs['third_genre_id'])
songs.drop('genre_ids', axis=1, inplace=True)

def artist_count(x):
    """Estimate how many artists are credited in the artist string ``x``.

    Adds up the occurrences of the substrings 'and', ',', ' feat' and '&'
    and returns that total plus one. The match is a plain substring match,
    so an 'and' inside a name is counted as well.
    """
    separators = ('and', ',', ' feat', '&')
    return sum(x.count(separator) for separator in separators) + 1

# Number of credited artists per song as estimated by artist_count, stored as int8.
songs['artist_cnt'] = songs['artist_name'].apply(artist_count).astype(np.int8)

def get_count(x):
    """Count the names listed in a '|', '/', '\\' or ';' separated string.

    Returns the number of separator characters in ``x`` plus one. Values
    that are not strings (for example a NaN from a missing cell) have no
    usable ``count`` method and yield 0.
    """
    try:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
    except (AttributeError, TypeError):
        # Only the "not a string" case is swallowed; a bare except would also
        # hide KeyboardInterrupt and genuine programming errors.
        return 0

# Number of credited lyricists / composers per song, via get_count.
for source_column, count_column in (('lyricist', 'lyricist_cnt'),
                                    ('composer', 'composer_cnt')):
    songs[count_column] = songs[source_column].apply(get_count).astype(np.int8)

# 1 when the artist string contains ' feat', otherwise 0.
songs['is_featured'] = songs['artist_name'].apply(
    lambda name: int(' feat' in str(name))).astype(np.int8)

def get_first_artist(x):
    """Return the part of the artist string ``x`` before the first separator.

    The string is cut successively at 'and', ',', ' feat' and '&' (plain
    substring matches, in that order), then surrounding whitespace is removed.
    """
    for separator in ('and', ',', ' feat', '&'):
        # partition()[0] is everything before the first match, or the whole
        # string when the separator does not occur.
        x = x.partition(separator)[0]
    return x.strip()

# Replace the full artist string by the first artist returned by get_first_artist.
songs['artist_name'] = songs['artist_name'].apply(get_first_artist)
    
def get_first_term(x):
    """Return the first name of a '|', '/', '\\' or ';' separated string.

    The string is cut successively at each separator and stripped of
    surrounding whitespace. Values that are not strings (for example a NaN
    from a missing cell) are returned unchanged.
    """
    try:
        for separator in ('|', '/', '\\', ';'):
            x = x.split(separator)[0]
        return x.strip()
    except (AttributeError, TypeError):
        # Only the "not a string" case is swallowed; a bare except would also
        # hide KeyboardInterrupt and genuine programming errors.
        return x

# Keep only the first listed lyricist / composer.
for credit_column in ('lyricist', 'composer'):
    songs[credit_column] = songs[credit_column].apply(get_first_term)

# Missing languages become -1 before encoding.
songs['language'] = songs['language'].fillna(-1)

# Integer-encode the categorical song columns, one encoder per column.
columns = ['artist_name', 'lyricist', 'composer', 'language']
for column in columns:
    column_encoder = LabelEncoder().fit(songs[column])
    songs[column] = column_encoder.transform(songs[column])


## save files
# Write every id-encoded table to ../temporal_data/, without the index.
id_outputs = (
    (members, '../temporal_data/members_id.csv'),
    (songs, '../temporal_data/songs_id.csv'),
    (songs_extra, '../temporal_data/songs_extra_id.csv'),
    (train, '../temporal_data/train_id.csv'),
    (test, '../temporal_data/test_id.csv'),
)
for frame, path in id_outputs:
    frame.to_csv(path, index=False)
print(0)

Data loaded.
MSNO done.
Song_id done.
Source information done.
Members information done.
0


## cnt_log_process

In [5]:
import numpy as np
import pandas as pd

train = pd.read_csv('../temporal_data/train_id.csv')
test = pd.read_csv('../temporal_data/test_id.csv')
member = pd.read_csv('../temporal_data/members_id.csv')
song_origin = pd.read_csv('../temporal_data/songs_id.csv')
song_extra = pd.read_csv('../temporal_data/songs_extra_id.csv')

# One row per song id in 0..max(song_id); ids that have no row in the song
# tables keep NaN attributes after the left merges.
song = pd.DataFrame({'song_id': range(max(train.song_id.max(), test.song_id.max())+1)})
song = song.merge(song_origin, on='song_id', how='left')
song = song.merge(song_extra, on='song_id', how='left')

# All (member, song) records of train followed by test.
# pd.concat is the public replacement for the private DataFrame._append.
data = pd.concat([train[['msno', 'song_id']], test[['msno', 'song_id']]])

## member_cnt
# Number of train+test records per member, looked up for every member row
# (a member without any record raises KeyError).
mem_rec_cnt = data.groupby(by='msno')['song_id'].count().to_dict()
member['msno_rec_cnt'] = [mem_rec_cnt[msno] for msno in member['msno']]

# Ages outside the open interval (0, 75) are treated as missing.
member['bd'] = member['bd'].apply(lambda age: age if 0 < age < 75 else np.nan)

## song_cnt
# For each grouping column, count the rows of `song` per value and attach
# that count to every song. Rows whose grouping value is NaN get NaN.
artist_song_cnt = song.groupby(by='artist_name').count()['song_id'].to_dict()
song['artist_song_cnt'] = song['artist_name'].apply(lambda x: artist_song_cnt[x] if not np.isnan(x) else np.nan)

# For composer, lyricist and genre the count stored under key 0 is replaced by NaN.
# NOTE(review): presumably 0 is the encoded id that stands for "missing" -- confirm.
composer_song_cnt = song.groupby(by='composer').count()['song_id'].to_dict()
composer_song_cnt[0] = np.nan
song['composer_song_cnt'] = song['composer'].apply(lambda x: composer_song_cnt[x] if not np.isnan(x) else np.nan)

lyricist_song_cnt = song.groupby(by='lyricist').count()['song_id'].to_dict()
lyricist_song_cnt[0] = np.nan
song['lyricist_song_cnt'] = song['lyricist'].apply(lambda x: lyricist_song_cnt[x] if not np.isnan(x) else np.nan)

genre_song_cnt = song.groupby(by='first_genre_id').count()['song_id'].to_dict()
genre_song_cnt[0] = np.nan
song['genre_song_cnt'] = song['first_genre_id'].apply(lambda x: genre_song_cnt[x] if not np.isnan(x) else np.nan)

# Attach the song attributes to every train+test record so that records can
# be counted per song / artist / composer / lyricist / first genre.
data = data.merge(song, on='song_id', how='left')

# Number of records per song.
song_rec_cnt = data.groupby(by='song_id').count()['msno'].to_dict()
song['song_rec_cnt'] = song['song_id'].apply(lambda x: song_rec_cnt[x] if not np.isnan(x) else np.nan)

# Number of records per artist.
artist_rec_cnt = data.groupby(by='artist_name').count()['msno'].to_dict()
song['artist_rec_cnt'] = song['artist_name'].apply(lambda x: artist_rec_cnt[x] if not np.isnan(x) else np.nan)

# For composer, lyricist and genre the count stored under key 0 is replaced by NaN.
# NOTE(review): presumably 0 is the encoded id that stands for "missing" -- confirm.
composer_rec_cnt = data.groupby(by='composer').count()['msno'].to_dict()
composer_rec_cnt[0] = np.nan
song['composer_rec_cnt'] = song['composer'].apply(lambda x: composer_rec_cnt[x] if not np.isnan(x) else np.nan)

lyricist_rec_cnt = data.groupby(by='lyricist').count()['msno'].to_dict()
lyricist_rec_cnt[0] = np.nan
song['lyricist_rec_cnt'] = song['lyricist'].apply(lambda x: lyricist_rec_cnt[x] if not np.isnan(x) else np.nan)

genre_rec_cnt = data.groupby(by='first_genre_id').count()['msno'].to_dict()
genre_rec_cnt[0] = np.nan
song['genre_rec_cnt'] = song['first_genre_id'].apply(lambda x: genre_rec_cnt[x] if not np.isnan(x) else np.nan)

## msno context features
dummy_feat = ['source_system_tab', 'source_screen_name', 'source_type']
# pd.concat is the public replacement for the private DataFrame._append.
concat = pd.concat([train.drop('target', axis=1), test.drop('id', axis=1)])

# For every context column, one-hot encode it and average the indicators per
# member: column 'msno_<feat>_<value>' is the share of that member's records
# having <value>. The group key is carried in a separate 'msno_new' column
# for the merge; that helper column stays in `member` afterwards (pandas
# adds _x/_y suffixes when the name repeats).
for feat in dummy_feat:
    feat_dummies = pd.get_dummies(concat[feat])
    feat_dummies.columns = ['msno_%s_'%feat + '%s'%col for col in feat_dummies.columns]
    feat_dummies['msno'] = concat['msno'].values
    feat_dummies = feat_dummies.groupby('msno').mean()
    feat_dummies['msno_new'] = feat_dummies.index
    feat_dummies.reset_index(drop=True, inplace=True)
    member = member.merge(feat_dummies, left_on='msno', right_on='msno_new', how='left')

# Join the per-member shares onto every record.
train_temp = train.merge(member, on='msno', how='left')
test_temp = test.merge(member, on='msno', how='left')

# For each context column, pick per record the share column that matches the
# record's own value: 'msno_<context>_<value>' -> 'msno_<context>_prob'.
for context in ('source_system_tab', 'source_screen_name', 'source_type'):
    prob_column = 'msno_' + context + '_prob'
    share_pattern = 'msno_' + context + '_%d'
    for target_frame, merged_frame in ((train, train_temp), (test, test_temp)):
        related = [name for name in merged_frame.columns if context in name]
        target_frame[prob_column] = merged_frame[related].apply(
            lambda row: row[share_pattern % row[context]], axis=1)

## to_csv
# Count features are stored as log(1 + count).
features = ['msno_rec_cnt']
for feat in features:
    member[feat] = np.log1p(member[feat])
member.to_csv('../temporal_data/members_id_cnt.csv', index=False)

# song_length is log-transformed together with the song count features.
features = ['song_length', 'song_rec_cnt', 'artist_song_cnt', 'composer_song_cnt', \
        'lyricist_song_cnt', 'genre_song_cnt', 'artist_rec_cnt', \
        'composer_rec_cnt', 'lyricist_rec_cnt', 'genre_rec_cnt']
for feat in features:
    song[feat] = np.log1p(song[feat])
#song['song_length'] = np.log1p(song['song_length'])
song.to_csv('../temporal_data/songs_id_cnt.csv', index=False)

# train / test are written with the msno_*_prob columns added above.
train.to_csv('../temporal_data/train_id_cnt.csv', index=False)
test.to_csv('../temporal_data/test_id_cnt.csv', index=False)
print(0)

0


## isrc_process

In [8]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

## load the data
train = pd.read_csv('../temporal_data/train_id.csv')
test = pd.read_csv('../temporal_data/test_id.csv')
song = pd.read_csv('../temporal_data/songs_id_cnt.csv')

# All (member, song) records of train followed by test.
# pd.concat is the public replacement for the private DataFrame._append.
data = pd.concat([train[['msno', 'song_id']], test[['msno', 'song_id']]])

print('Data loaded.')

## isrc process
# Split the isrc string by position: characters 0-1 -> 'cc', 2-4 -> 'xxx',
# 5-6 -> 'yy' (parsed as a number).
isrc = song['isrc']
song['cc'] = isrc.str.slice(0, 2)
song['xxx'] = isrc.str.slice(2, 5)
song['yy'] = isrc.str.slice(5, 7).astype(float)
# Expand the two-digit value to a four-digit year: below 18 -> 20xx, otherwise 19xx.
song['yy'] = song['yy'].apply(lambda x: 2000+x if x < 18 else 1900+x)

song['cc'] = LabelEncoder().fit_transform(song['cc'])
song['xxx'] = LabelEncoder().fit_transform(song['xxx'])
# NOTE(review): this flags encoded cc == 0 as "isrc missing", i.e. it assumes
# the encoder assigns code 0 to missing values -- confirm for the installed
# scikit-learn version.
song['isrc_missing'] = (song['cc'] == 0) * 1.0

## song_cnt
# Number of rows of `song` per cc / xxx / yy value. The entry for key 0 is
# replaced by None in every dictionary.
# NOTE(review): presumably key 0 stands for "missing" in cc/xxx; for yy (a
# year) the key 0 is not expected to occur -- confirm.
song_cc_cnt = song.groupby(by='cc').count()['song_id'].to_dict()
song_cc_cnt[0] = None
song['cc_song_cnt'] = song['cc'].apply(lambda x: song_cc_cnt[x] if not np.isnan(x) else None)

song_xxx_cnt = song.groupby(by='xxx').count()['song_id'].to_dict()
song_xxx_cnt[0] = None
song['xxx_song_cnt'] = song['xxx'].apply(lambda x: song_xxx_cnt[x] if not np.isnan(x) else None)

song_yy_cnt = song.groupby(by='yy').count()['song_id'].to_dict()
song_yy_cnt[0] = None
song['yy_song_cnt'] = song['yy'].apply(lambda x: song_yy_cnt[x] if not np.isnan(x) else None)

# Attach the song attributes to every train+test record and repeat the
# counts per record. The dictionary names from above are reused.
data = data.merge(song, on='song_id', how='left')

song_cc_cnt = data.groupby(by='cc').count()['msno'].to_dict()
song_cc_cnt[0] = None
song['cc_rec_cnt'] = song['cc'].apply(lambda x: song_cc_cnt[x] if not np.isnan(x) else None)

song_xxx_cnt = data.groupby(by='xxx').count()['msno'].to_dict()
song_xxx_cnt[0] = None
song['xxx_rec_cnt'] = song['xxx'].apply(lambda x: song_xxx_cnt[x] if not np.isnan(x) else None)

song_yy_cnt = data.groupby(by='yy').count()['msno'].to_dict()
song_yy_cnt[0] = None
song['yy_rec_cnt'] = song['yy'].apply(lambda x: song_yy_cnt[x] if not np.isnan(x) else None)

## to_csv
# The isrc count features are stored as log(1 + count).
features = ['cc_song_cnt', 'xxx_song_cnt', 'yy_song_cnt', 'cc_rec_cnt', \
        'xxx_rec_cnt', 'yy_rec_cnt']
for feat in features:
    song[feat] = np.log1p(song[feat])

# The raw 'name' and 'isrc' columns are not written out.
song.drop(['name', 'isrc'], axis=1, inplace=True)
song.to_csv('../temporal_data/songs_id_cnt_isrc.csv', index=False)
print(0)

Data loaded.
0


## svd_process

In [11]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse.linalg import svds

## load the data
tr = pd.read_csv('../temporal_data/train_id_cnt.csv')
te = pd.read_csv('../temporal_data/test_id_cnt.csv')
member = pd.read_csv('../temporal_data/members_id_cnt.csv')
song = pd.read_csv('../temporal_data/songs_id_cnt_isrc.csv')

# All (member, song) records of train followed by test.
# pd.concat is the public replacement for the private DataFrame._append.
concat = pd.concat([tr[['msno', 'song_id']], te[['msno', 'song_id']]])
# Ids are used as 0-based row/column indices below, so the sizes are max + 1.
member_cnt = concat['msno'].max() + 1
song_cnt = concat['song_id'].max() + 1
artist_cnt = int(song['artist_name'].max() + 1)

## svd for user-song pairs
n_component = 48

print(len(concat))

# Sparse member x song matrix with one entry per record, then binarised so
# that repeated (member, song) pairs count once.
data = np.ones(len(concat))
msno = concat['msno'].values
song_id = concat['song_id'].values

rating = sparse.coo_matrix((data, (msno, song_id)))
rating = (rating > 0) * 1.0

# The factors are reversed so that the largest singular value comes first
# (see the printed values).
[u, s, vt] = svds(rating, k=n_component)
print(s[::-1])
s_song = np.diag(s[::-1])

# Row i of u is taken as the embedding of member id i; the right merge keeps
# one row per id in range(member_cnt).
members_topics = pd.DataFrame(u[:, ::-1])
members_topics.columns = ['member_component_%d'%i for i in range(n_component)]
members_topics['msno'] = range(member_cnt)
member = member.merge(members_topics, on='msno', how='right')

# Row j of vt.T is taken as the embedding of song id j.
song_topics = pd.DataFrame(vt.transpose()[:, ::-1])
song_topics.columns = ['song_component_%d'%i for i in range(n_component)]
song_topics['song_id'] = range(song_cnt)
song = song.merge(song_topics, on='song_id', how='right')

## svd for user-artist pairs
n_component = 16

# Attach the artist id to every record; the filter drops records whose
# artist is NaN (NaN >= 0 is False).
concat = concat.merge(song[['song_id', 'artist_name']], on='song_id', how='left')
concat = concat[concat['artist_name'] >= 0]
msno = concat['msno'].values
artist = concat['artist_name'].values.astype(int)

print(len(concat))
data = np.ones(len(concat))
rating_tmp = sparse.coo_matrix((data, (msno, artist)))

# Weight of a (member, artist) pair: 0.3 * log(1 + record count) plus 1 for
# any pair that occurs at all.
rating = np.log1p(rating_tmp) * 0.3 + (rating_tmp > 0) * 1.0

# The factors are reversed so that the largest singular value comes first
# (see the printed values).
[u, s, vt] = svds(rating, k=n_component)
print(s[::-1])
s_artist = np.diag(s[::-1])

# NOTE(review): assigning range(member_cnt) / range(artist_cnt) assumes the
# largest member and artist id both occur in the filtered records, so that the
# factor matrices have exactly that many rows -- confirm.
members_topics = pd.DataFrame(u[:, ::-1])
members_topics.columns = ['member_artist_component_%d'%i for i in range(n_component)]
members_topics['msno'] = range(member_cnt)
member = member.merge(members_topics, on='msno', how='left')


artist_topics = pd.DataFrame(vt.transpose()[:, ::-1])
artist_topics.columns = ['artist_component_%d'%i for i in range(n_component)]
artist_topics['artist_name'] = range(artist_cnt)
song = song.merge(artist_topics, on='artist_name', how='left')

## dot features
# After sorting, row k of each embedding matrix is indexed with id k below.
# NOTE(review): this relies on `member` / `song` holding exactly one row per
# id 0..max -- confirm against the merges that built them.
member = member.sort_values(by='msno')
song = song.sort_values(by='song_id')

mem_cols = ['member_component_%d'%i for i in range(48)]
song_cols = ['song_component_%d'%i for i in range(48)]

member_embeddings = member[mem_cols].values
song_embeddings = song[song_cols].values

mem_cols = ['member_artist_component_%d'%i for i in range(16)]
song_cols = ['artist_component_%d'%i for i in range(16)]

member_artist_embeddings = member[mem_cols].values
song_artist_embeddings = song[song_cols].values

# Column 0: member . S_song . song, column 1: member . S_artist . artist.
train_dot = np.zeros((len(tr), 2))
test_dot = np.zeros((len(te), 2))

for frame, dots in ((tr, train_dot), (te, test_dot)):
    # Materialise the id columns once; the original rebuilt them through
    # frame[...].values on every iteration of these multi-million-row loops.
    msno_values = frame['msno'].values
    song_values = frame['song_id'].values
    for i in range(len(frame)):
        msno_idx = msno_values[i]
        song_idx = song_values[i]

        dots[i, 0] = np.dot(member_embeddings[msno_idx], np.dot(s_song, song_embeddings[song_idx]))
        dots[i, 1] = np.dot(member_artist_embeddings[msno_idx], np.dot(s_artist, song_artist_embeddings[song_idx]))

tr['song_embeddings_dot'] = train_dot[:, 0]
tr['artist_embeddings_dot'] = train_dot[:, 1]

te['song_embeddings_dot'] = test_dot[:, 0]
te['artist_embeddings_dot'] = test_dot[:, 1]

## write to files
# Write the four tables with the SVD features to ../temporal_data/, without the index.
svd_outputs = (
    (tr, '../temporal_data/train_id_cnt_svd.csv'),
    (te, '../temporal_data/test_id_cnt_svd.csv'),
    (member, '../temporal_data/members_id_cnt_svd.csv'),
    (song, '../temporal_data/songs_id_cnt_isrc_svd.csv'),
)
for frame, path in svd_outputs:
    frame.to_csv(path, index=False)
print(0)

9934208
[1079.09701569  402.36101388  342.15015604  290.25766789  267.48502621
  209.99166484  201.41980994  186.60844183  176.48011044  171.87019751
  166.79121794  164.30979427  159.6872633   152.93042253  148.44190296
  146.07344883  138.47712124  136.88805245  132.59873434  132.02263033
  130.96092559  129.04862187  125.54538462  124.33609337  121.7514836
  120.57790438  119.51483604  117.37102969  116.98083352  115.27806765
  113.75112337  112.84448524  109.47369206  108.56299432  106.69113104
  106.64203013  105.20102045  102.78250425  101.55658569  100.48101846
  100.1085423    98.3309683    98.05062556   97.82077495   97.20829481
   96.1850887    95.76546976   95.22900134]
9934069
[1408.36398716  455.73519615  338.21892276  322.67571439  277.07120791
  243.03345467  197.06733078  193.43869496  177.60460117  168.79532564
  165.20992676  162.26970943  154.00695384  151.47375268  150.00775276
  140.89518947]
0


## timestamp_process

In [13]:
import numpy as np
import pandas as pd
from collections import defaultdict

## load the data
tr = pd.read_csv('../temporal_data/train_id_cnt_svd.csv')
te = pd.read_csv('../temporal_data/test_id_cnt_svd.csv')
mem = pd.read_csv('../temporal_data/members_id_cnt_svd.csv')
song = pd.read_csv('../temporal_data/songs_id_cnt_isrc_svd.csv')

## continuous index
# The position of a record in train followed by test is used as its
# timestamp. pd.concat replaces the private DataFrame._append.
concat = pd.concat([tr[['msno', 'song_id']], te[['msno', 'song_id']]])
concat['timestamp'] = range(len(concat))

## windows_based count
window_sizes = [10, 25, 500, 5000, 10000, 50000]

msno_list = concat['msno'].values
song_list = concat['song_id'].values

def get_window_cnt(values, idx, window_size):
    """Count how often ``values[idx]`` occurs just before and just after ``idx``.

    Returns a pair ``(before, after)``:

    * ``before`` counts matches in ``values[idx - window_size:idx]``, which
      excludes position ``idx`` itself;
    * ``after`` counts matches in ``values[idx:idx + window_size]``, which
      includes position ``idx`` itself.

    Both windows are clipped to the bounds of ``values``, which must support
    element-wise ``==`` (the callers pass NumPy arrays).
    """
    current = values[idx]
    start = idx - window_size
    if start < 0:
        start = 0
    stop = idx + window_size
    if stop > len(values):
        stop = len(values)
    before = (values[start:idx] == current).sum()
    after = (values[idx:stop] == current).sum()
    return before, after

# For every window size, count per record how often the same member / the
# same song occurs in the window before and in the window after the record
# (see get_window_cnt for the exact window bounds).
for window_size in window_sizes:
    msno_before_cnt = np.zeros(len(concat))
    song_before_cnt = np.zeros(len(concat))
    msno_after_cnt = np.zeros(len(concat))
    song_after_cnt = np.zeros(len(concat))
    for i in range(len(concat)):
        msno_before_cnt[i], msno_after_cnt[i] = get_window_cnt(msno_list, i, window_size)
        song_before_cnt[i], song_after_cnt[i] = get_window_cnt(song_list, i, window_size)
    concat['msno_%d_before_cnt'%window_size] = msno_before_cnt
    concat['song_%d_before_cnt'%window_size] = song_before_cnt
    concat['msno_%d_after_cnt'%window_size] = msno_after_cnt
    concat['song_%d_after_cnt'%window_size] = song_after_cnt
    
    print('Window size for %d done.'%window_size)

## till_now count
# Running counters: for every record, how many earlier records share its
# member / its song (the record itself is not included).
msno_dict = defaultdict(int)
song_dict = defaultdict(int)

msno_till_now_cnt = np.zeros(len(concat))
song_till_now_cnt = np.zeros(len(concat))
for position, (member_id, track_id) in enumerate(zip(msno_list, song_list)):
    msno_till_now_cnt[position] = msno_dict[member_id]
    msno_dict[member_id] += 1

    song_till_now_cnt[position] = song_dict[track_id]
    song_dict[track_id] += 1

concat['msno_till_now_cnt'] = msno_till_now_cnt
concat['song_till_now_cnt'] = song_till_now_cnt

print('Till-now count done.')

## variance
def timestamp_map(x):
    """Map a record position ``x`` to a timestamp by piecewise-linear interpolation.

    Positions below 7377418 are mapped linearly from [0, 7377417] onto
    [1471190400, 1484236800]; all other positions are mapped linearly from
    [7377417, 9934207] onto [1484236800, 1488211200].

    NOTE(review): 7377418 and 9934208 are the train and train+test record
    counts printed by this notebook; the three anchor values are presumably
    Unix timestamps in seconds -- confirm.
    """
    split_position = 7377417.0
    last_position = 9934207.0
    first_time = 1471190400.0
    split_time = 1484236800.0
    last_time = 1488211200.0

    if x < 7377418:
        return (x - 0.0) / (split_position - 0.0) * (split_time - first_time) + first_time
    return (x - split_position) / (last_position - split_position) * (last_time - split_time) + split_time
    
# Replace the record position by its interpolated timestamp.
concat['timestamp'] = concat['timestamp'].apply(timestamp_map)

# Mean and standard deviation of the timestamps per member and per song.
# 'timestamp' is selected before aggregating, so only that column is reduced
# instead of every window-count column of `concat`; the values are unchanged.
msno_mean = concat.groupby(by='msno')['timestamp'].mean().to_dict()
mem['msno_timestamp_mean'] = mem['msno'].apply(lambda x: msno_mean[x])

msno_std = concat.groupby(by='msno')['timestamp'].std().to_dict()
mem['msno_timestamp_std'] = mem['msno'].apply(lambda x: msno_std[x])

song_mean = concat.groupby(by='song_id')['timestamp'].mean().to_dict()
song['song_timestamp_mean'] = song['song_id'].apply(lambda x: song_mean[x])

song_std = concat.groupby(by='song_id')['timestamp'].std().to_dict()
song['song_timestamp_std'] = song['song_id'].apply(lambda x: song_std[x])

print('Varience done.')

## save to files
# All count features are stored as log(1 + count); the timestamp is kept as is.
features = ['msno_till_now_cnt', 'song_till_now_cnt']
for window_size in window_sizes:
    features += ['msno_%d_before_cnt'%window_size, 'song_%d_before_cnt'%window_size, \
            'msno_%d_after_cnt'%window_size, 'song_%d_after_cnt'%window_size]
for feat in features:
    concat[feat] = np.log1p(concat[feat])

features = ['timestamp'] + features

# concat holds the train rows first and the test rows after them, so the
# feature matrix is split back by position.
data = concat[features].values
for i in range(len(features)):
    tr[features[i]] = data[:len(tr), i]
    te[features[i]] = data[len(tr):, i]

tr.to_csv('../temporal_data/train_id_cnt_svd_stamp.csv', index=False)
te.to_csv('../temporal_data/test_id_cnt_svd_stamp.csv', index=False)
mem.to_csv('../temporal_data/members_id_cnt_svd_stamp.csv', index=False)
song.to_csv('../temporal_data/songs_id_cnt_isrc_svd_stamp.csv', index=False)
print(0)

Window size for 10 done.
Window size for 25 done.
Window size for 500 done.
Window size for 5000 done.
Window size for 10000 done.
Window size for 50000 done.
Till-now count done.
Varience done.
0


## before_after_process

In [15]:
import numpy as np
import pandas as pd
from collections import defaultdict

## load data
tr = pd.read_csv('../temporal_data/train_id_cnt_svd_stamp.csv')
te = pd.read_csv('../temporal_data/test_id_cnt_svd_stamp.csv')

print('data loaded.')
print(len(tr))
print(len(te))

## continuous index
# Train records followed by test records.
# pd.concat is the public replacement for the private DataFrame._append.
concat = pd.concat([
    tr[['msno', 'song_id', 'source_type', 'source_screen_name', 'timestamp']],
    te[['msno', 'song_id', 'source_type', 'source_screen_name', 'timestamp']],
])

## before / after data
def _neighbour_records(msno_values, rows, order):
    """Return, for every record, the record its member produced just earlier in ``order``.

    ``msno_values[i]`` is the member of record ``i`` and ``rows[i]`` its
    (song_id, source_type, source_screen_name, timestamp) values. Records
    are visited in ``order``; row ``i`` of the result holds the values of the
    record of the same member visited last before ``i``. For a member's
    first visited record there is no neighbour: the record's own values are
    used and the timestamp (column 3) is set to NaN.
    """
    last_index = {}
    out = np.zeros((len(rows), 4))
    for i in order:
        msno = msno_values[i]
        previous = last_index.get(msno)
        # `is None`, not `== None`: index 0 is a valid previous position.
        if previous is None:
            out[i] = rows[i]
            out[i, 3] = np.nan
        else:
            out[i] = rows[previous]
        last_index[msno] = i
    return out


# Materialise the arrays once. The original rebuilt the full four-column
# array inside the loop for every member's first record.
_msno_values = concat['msno'].values
_record_rows = concat[['song_id', 'source_type', 'source_screen_name', 'timestamp']].values

## before data
before_data = _neighbour_records(_msno_values, _record_rows, range(len(concat)))

print('data before done.')

## after data
# Walking the records backwards makes the "previous" neighbour the next record
# of the same member.
after_data = _neighbour_records(_msno_values, _record_rows,
                                range(len(concat) - 1, -1, -1))

print('data after done.')

## to_csv
# Split the neighbour arrays back into train (first len(tr) rows) and test.
for idx, name in enumerate(['song_id', 'source_type', 'source_screen_name', 'timestamp']):
    tr['before_'+name] = before_data[:len(tr), idx]
    tr['after_'+name] = after_data[:len(tr), idx]

    te['before_'+name] = before_data[len(tr):, idx]
    te['after_'+name] = after_data[len(tr):, idx]

# The id columns were stored in a float array; cast them back to integers.
for name in ['song_id', 'source_type', 'source_screen_name']:
    tr['before_'+name] = tr['before_'+name].astype(int)
    te['before_'+name] = te['before_'+name].astype(int)
    tr['after_'+name] = tr['after_'+name].astype(int)
    te['after_'+name] = te['after_'+name].astype(int)

# Time gaps to the previous / next record of the same member, as log(1 + gap).
tr['before_timestamp'] = np.log1p(tr['timestamp'] - tr['before_timestamp'])
te['before_timestamp'] = np.log1p(te['timestamp'] - te['before_timestamp'])

tr['after_timestamp'] = np.log1p(tr['after_timestamp'] - tr['timestamp'])
te['after_timestamp'] = np.log1p(te['after_timestamp'] - te['timestamp'])

# Records without a neighbour (NaN gap) get the mean gap of their own file.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and has no
# effect once pandas copy-on-write is active.
for frame in (tr, te):
    for gap_column in ('before_timestamp', 'after_timestamp'):
        frame[gap_column] = frame[gap_column].fillna(np.nanmean(frame[gap_column]))

tr.to_csv('../temporal_data/train_id_cnt_svd_stamp_before_after.csv', index=False)
te.to_csv('../temporal_data/test_id_cnt_svd_stamp_before_after.csv', index=False)
print(0)

data loaded.
7377418
2556790
data before done.
data after done.
0


## data_for_training

In [16]:
import os
import numpy as np
import pandas as pd

## load the data
train = pd.read_csv('../temporal_data/train_id_cnt_svd_stamp_before_after.csv')
test = pd.read_csv('../temporal_data/test_id_cnt_svd_stamp_before_after.csv')
member = pd.read_csv('../temporal_data/members_id_cnt_svd_stamp.csv')
song = pd.read_csv('../temporal_data/songs_id_cnt_isrc_svd_stamp.csv')

## prepare data for train / test
train.to_csv('../train.csv', index=False, float_format='%.6f')
test.to_csv('../test.csv', index=False, float_format='%.6f')

# train_part = the train records whose (song, member) pair does not occur in test.
# The pair is packed into one integer key.
# NOTE(review): the key is only collision-free while msno < 100000 -- confirm.
# The keys are kept in standalone Series instead of temporary 'iid' /
# 'appeared' columns, so nothing has to be dropped in place from a filtered
# slice afterwards (SettingWithCopyWarning).
train_iid = train['song_id'] * 100000 + train['msno']
test_iid = test['song_id'] * 100000 + test['msno']

train = train[~train_iid.isin(test_iid)]
train.to_csv('../train_part.csv', index=False, float_format='%.6f')

## prepare data for member / song for GBDT
member.to_csv('../members_gbdt.csv', index=False)

# Missing categorical ids become 0 and the columns are cast to int.
# fillna results are assigned back instead of using inplace=True on a column
# selection: with pandas copy-on-write the chained in-place call has no
# effect, and the following astype(int) would then fail on the NaNs.
columns = ['composer', 'lyricist', 'language', 'first_genre_id', 'second_genre_id', 'third_genre_id']
for col in columns:
    song[col] = song[col].fillna(0).astype(int)
# A missing artist gets a new id one above the largest existing artist id.
song['artist_name'] = song['artist_name'].fillna(np.max(song['artist_name'])+1).astype(int)
song['isrc_missing'] = song['isrc_missing'].astype(int)
song.to_csv('../songs_gbdt.csv', index=False)

## prepare data for member / song for NN
# Record which ages are missing before imputing them.
member['bd_missing'] = np.isnan(member['bd'].values) * 1

# fillna results are assigned back instead of using inplace=True on a column
# selection (deprecated chained assignment; a no-op under pandas
# copy-on-write).
columns = ['bd']
for col in columns:
    member[col] = member[col].fillna(np.nanmean(member[col]))

# A missing timestamp std is replaced by the smallest observed std.
member['msno_timestamp_std'] = member['msno_timestamp_std'].fillna(
    np.nanmin(member['msno_timestamp_std']))
member.to_csv('../members_nn.csv', index=False)

# A song is flagged as missing when its song_length is NaN.
song['song_id_missing'] = np.isnan(song['song_length'].values) * 1

# Remaining NaNs in the numeric song features are replaced by the column mean.
columns = ['song_length', 'genre_id_cnt', 'artist_song_cnt', 'composer_song_cnt', \
       'lyricist_song_cnt', 'genre_song_cnt', 'song_rec_cnt', \
       'artist_rec_cnt', 'composer_rec_cnt', 'lyricist_rec_cnt', \
       'genre_rec_cnt', 'yy', 'cc_song_cnt', \
       'xxx_song_cnt', 'yy_song_cnt', 'cc_rec_cnt', 'xxx_rec_cnt', \
       'yy_rec_cnt', 'song_timestamp_std', 'artist_cnt', 'lyricist_cnt', \
       'composer_cnt', 'is_featured'] + ['artist_component_%d'%i for i in range(16)]
for col in columns:
    song[col] = song[col].fillna(np.nanmean(song[col]))

song.to_csv('../songs_nn.csv', index=False)
print(0)

0


## experiments

In [1]:
import pandas as pd
import numpy as np

tr = pd.read_csv('../train_part.csv')
te = pd.read_csv('../test.csv')
song = pd.read_csv('../songs_nn.csv')

# train_part records followed by test records, with the song attributes
# attached. pd.concat replaces the private DataFrame._append.
concat = pd.concat([
    tr[['msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type']],
    te[['msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type']],
])
concat = concat.merge(song[['song_id', 'song_length', 'artist_name', 'first_genre_id', \
        'artist_rec_cnt', 'song_rec_cnt', 'artist_song_cnt', 'xxx', 'yy', \
        'language']], on='song_id', how='left')

# Pack the three source columns into one key and re-encode it as 'source'.
# NOTE(review): the packing is only collision-free while source_screen_name
# and source_type stay below 100 -- confirm.
concat['source'] = concat['source_system_tab'] * 10000 + concat['source_screen_name'] * 100 + \
        concat['source_type']
from sklearn.preprocessing import LabelEncoder
concat['source'] = LabelEncoder().fit_transform(concat['source'].values)

## member features

# One row per member id in 0..max(msno); the aggregates are left-merged onto it.
mem_add = pd.DataFrame({'msno': range(concat['msno'].max()+1)})
# Per-member mean of the song attributes of the member's records.
data_avg = concat[['msno', 'song_length', 'artist_song_cnt', \
        'artist_rec_cnt', 'song_rec_cnt', 'yy']].groupby('msno').mean()
data_avg.columns = ['msno_'+i+'_mean' for i in data_avg.columns]
data_avg['msno'] = data_avg.index.values
data_avg.reset_index(drop=True, inplace=True)
mem_add = mem_add.merge(data_avg, on='msno', how='left')

# Per-member standard deviation of the same attributes.
data_std = concat[['msno', 'song_length', 'artist_song_cnt', \
        'artist_rec_cnt', 'song_rec_cnt', 'yy']].groupby('msno').std()
data_std.columns = ['msno_'+i+'_std' for i in data_std.columns]
data_std['msno'] = data_std.index.values
data_std.reset_index(drop=True, inplace=True)
mem_add = mem_add.merge(data_std, on='msno', how='left')

# Number of distinct artists per member, stored as log(1 + count).
# NOTE(review): the assignment aligns on the index, which presumably works
# because mem_add's integer index equals msno after the merges -- confirm.
artist_msno = concat[['msno', 'artist_name']].groupby('msno').apply(lambda x: len(set(x['artist_name'].values)))
mem_add['artist_msno_cnt'] = artist_msno
mem_add['artist_msno_cnt'] = np.log1p(mem_add['artist_msno_cnt'])

# Share of each language among the member's records.
language_dummy = pd.get_dummies(concat['language'])
language_dummy['msno'] = concat['msno'].values
language_prob = language_dummy.groupby('msno').mean()
language_prob.columns = ['msno_language_%d'%i for i in language_prob.columns]
language_prob['msno'] = language_prob.index
language_prob.reset_index(drop=True, inplace=True)
mem_add = mem_add.merge(language_prob, on='msno', how='left')

mem_add.to_csv('../members_add.csv', index=False)

## train/test features

# Share of a member's records that have the same value as the current record,
# for each feature. The (member, value) pair is packed into a temporary 'id'.
# The two feature lists have distinct names; the original called the first
# one `col` and then reused `col` as a loop variable, clobbering the list.
msno_feats = ['artist_name', 'first_genre_id', 'xxx', 'language', 'yy', 'source']
for feat in msno_feats:
    concat['id'] = concat['msno'] * 100000 + concat[feat]
    id_cnt = concat[['msno', 'id']].groupby('id').count().to_dict()['msno']
    concat['msno_'+feat+'_cnt'] = concat['id'].apply(lambda x: id_cnt[x])

msno_cnt = concat[['msno', 'song_id']].groupby('msno').count().to_dict()['song_id']
concat['msno_cnt'] = concat['msno'].apply(lambda x: msno_cnt[x])
for feat in msno_feats:
    concat['msno_'+feat+'_prob'] = concat['msno_'+feat+'_cnt'] / concat['msno_cnt']

# Share of a song's records that have the same source value as the current record.
song_feats = ['source_system_tab', 'source_screen_name', 'source_type']
for feat in song_feats:
    concat['id'] = concat['song_id'] * 10000 + concat[feat]
    id_cnt = concat[['msno', 'id']].groupby('id').count().to_dict()['msno']
    concat['song_'+feat+'_cnt'] = concat['id'].apply(lambda x: id_cnt[x])

song_cnt = concat[['msno', 'song_id']].groupby('song_id').count().to_dict()['msno']
concat['song_cnt'] = concat['song_id'].apply(lambda x: song_cnt[x])

for feat in song_feats:
    concat['song_'+feat+'_prob'] = concat['song_'+feat+'_cnt'] / concat['song_cnt']

result = concat[['msno_artist_name_prob', 'msno_first_genre_id_prob', 'msno_xxx_prob', \
        'msno_language_prob', 'msno_yy_prob', 'song_source_system_tab_prob', \
        'song_source_screen_name_prob', 'song_source_type_prob', 'source', 'msno_source_prob']]

# concat holds the train_part rows first and the test rows after them.
result[:len(tr)].to_csv('../train_part_add.csv', index=False)
result[len(tr):].to_csv('../test_add.csv', index=False)
print(0)

0
