# WSDM Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math
import gc

In [2]:
print('Loading data...')
# NOTE: absolute, machine-specific directory holding the competition CSV files.
data_path = '/home/huangcr/Kaggle/wsdm/data/'

# Interaction logs: the labelled rows and the rows to be scored.
train, test = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

# Side tables that are merged onto the logs in a later cell.
members, songs, songs_extra = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('members.csv', 'songs.csv', 'song_extra_info.csv')
)

# This list is never handed to read_csv, so the two date columns are loaded
# unparsed; the Members cell slices their string form instead.
parse_dates=['registration_init_time','expiration_date']
print('Done ')

Loading data...
Done 


## Members

In [3]:
# Extract the year / month / day parts of a date value
def convert_to_year(x):
    """Return the first four characters of ``str(x)`` as an int (the YYYY of a YYYYMMDD value)."""
    as_text = str(x)
    year_part = as_text[:4]
    return int(year_part)
def convert_to_month(x):
    """Return characters 4-5 of ``str(x)`` as an int (the MM of a YYYYMMDD value)."""
    as_text = str(x)
    month_part = as_text[4:6]
    return int(month_part)
def convert_to_day(x):
    """Return everything after the sixth character of ``str(x)`` as an int (the DD of a YYYYMMDD value)."""
    as_text = str(x)
    day_part = as_text[6:]
    return int(day_part)

# Registration date -> separate integer year / month / day columns.
reg_raw = 'registration_init_time'
members['reg_year'] = members[reg_raw].map(convert_to_year)
members['reg_month'] = members[reg_raw].map(convert_to_month)
members['reg_day'] = members[reg_raw].map(convert_to_day)
del reg_raw

# Expiration date -> separate integer year / month / day columns.
ex_raw = 'expiration_date'
members['ex_year'] = members[ex_raw].map(convert_to_year)
members['ex_month'] = members[ex_raw].map(convert_to_month)
members['ex_day'] = members[ex_raw].map(convert_to_day)
del ex_raw
# Date-parsing helper
def convert_to_datetime(x):
    """Parse a YYYYMMDD value into a pandas Timestamp.

    Parameters
    ----------
    x : int or str
        Date written as eight digits, e.g. ``20170131``.

    Returns
    -------
    pandas.Timestamp, or ``pandas.NaT`` when ``x`` does not match ``%Y%m%d``.
    """
    # 'coerce' instead of 'ignore': 'ignore' hands the unparsed input back, which
    # then fails when the two date columns are subtracted; NaT propagates as a
    # missing duration instead.
    return pd.to_datetime(x, format='%Y%m%d', errors='coerce')

# Replace both raw date columns with their parsed counterparts.
members['registration_init_time'] = members['registration_init_time'].map(convert_to_datetime)
members['expiration_date'] = members['expiration_date'].map(convert_to_datetime)

# Membership length: expiration minus registration, expressed in days.
members['duration'] = members['expiration_date'] - members['registration_init_time']
members['duration'] = members['duration'].map(
    lambda delta: delta.total_seconds() / (60*60*24)
)

# The raw dates are no longer needed once the derived columns exist.
members = members.drop(columns=['registration_init_time', 'expiration_date'])

## 歌曲信息处理

In [4]:
def isrc_to_year(x, pivot=17):
    """Derive a four-digit year from the two-digit year field of an ISRC code.

    Parameters
    ----------
    x : str or other
        ISRC string; characters 5-6 (0-based) are read as the two-digit year.
        Any non-string value (e.g. NaN for a missing code) yields NaN.
    pivot : int, default 17
        Two-digit years greater than ``pivot`` are placed in the 1900s,
        all others in the 2000s.

    Returns
    -------
    int year, or ``np.nan`` when ``x`` is not a string or its year field is
    not an integer (too short / malformed code).
    """
    if not isinstance(x, str):
        return np.nan
    try:
        two_digit = int(x[5:7])
    except ValueError:
        # Malformed code: report "unknown" instead of aborting the whole apply().
        return np.nan
    century = 1900 if two_digit > pivot else 2000
    return century + two_digit

def isrc_to_country(x):
    """Return the first two characters of an ISRC string; NaN for any non-``str`` value."""
    return x[:2] if type(x) == str else np.nan
    
# Derive release year and country code from the ISRC column, then discard the
# raw 'isrc' and 'name' columns.
songs_extra['songs_year'] = songs_extra['isrc'].map(isrc_to_year)
songs_extra['songs_country'] = songs_extra['isrc'].map(isrc_to_country)
songs_extra = songs_extra.drop(columns=['isrc', 'name'])

## Merging

In [5]:
# Left-join member attributes onto every interaction row (key: msno).
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

# Left-join song metadata (key: song_id).
train = train.merge(songs, on='song_id', how='left')
test = test.merge(songs, on='song_id', how='left')

# Left-join the ISRC-derived year / country columns (key: song_id).
train = train.merge(songs_extra, on='song_id', how='left')
test = test.merge(songs_extra, on='song_id', how='left')

# Rows with no song_length after the joins get the constant 200000.
# The result is assigned back: calling fillna(inplace=True) on the selected
# column is a chained in-place update, which leaves the frame unchanged when
# pandas Copy-on-Write is active.
train['song_length'] = train['song_length'].fillna(200000)
test['song_length'] = test['song_length'].fillna(200000)

# The side tables are fully merged; release them. (gc is imported in the
# first cell, so it is not re-imported here.)
del members, songs, songs_extra
gc.collect()

133

## New Feature 

In [6]:
def genre_id_count(x):
    """Count the '|'-separated entries in a genre string; the 'no_genre_id' placeholder counts as 0."""
    return 0 if x == 'no_genre_id' else x.count('|') + 1

# Replace missing genre strings with the placeholder genre_id_count() expects.
# Assigned back rather than filled in place on the selected column, which is a
# chained in-place update and is a no-op under pandas Copy-on-Write.
train['genre_ids'] = train['genre_ids'].fillna('no_genre_id')
test['genre_ids'] = test['genre_ids'].fillna('no_genre_id')
# Number of genres per row, stored compactly as int8.
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)

def lyricist_count(x):
    """Return one more than the number of '|', '/', '\\' and ';' characters in ``x``.

    The 'no_lyricist' placeholder yields 0.
    """
    if x == 'no_lyricist':
        return 0
    separators = ['|', '/', '\\', ';']
    return sum(x.count(sep) for sep in separators) + 1

# Replace missing lyricists with the placeholder lyricist_count() expects.
# Assigned back rather than filled in place on the selected column, which is a
# chained in-place update and is a no-op under pandas Copy-on-Write.
train['lyricist'] = train['lyricist'].fillna('no_lyricist')
test['lyricist'] = test['lyricist'].fillna('no_lyricist')
# Number of lyricists per row, stored compactly as int8.
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].apply(lyricist_count).astype(np.int8)

def composer_count(x):
    """Return one more than the number of '|', '/', '\\' and ';' characters in ``x``.

    The 'no_composer' placeholder yields 0.
    """
    if x == 'no_composer':
        return 0
    total = 1
    for sep in ['|', '/', '\\', ';']:
        total += x.count(sep)
    return total

# Replace missing composers with the placeholder composer_count() expects.
# Assigned back rather than filled in place on the selected column, which is a
# chained in-place update and is a no-op under pandas Copy-on-Write.
train['composer'] = train['composer'].fillna('no_composer')
test['composer'] = test['composer'].fillna('no_composer')
# Number of composers per row, stored compactly as int8.
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)
test['composer_count'] = test['composer'].apply(composer_count).astype(np.int8)


def is_featured(x):
    """Return 1 when the text form of ``x`` contains the substring 'feat' (case-sensitive), else 0."""
    return 1 if 'feat' in str(x) else 0

# Replace missing artist names with the 'no_artist' placeholder.
# Assigned back rather than filled in place on the selected column, which is a
# chained in-place update and is a no-op under pandas Copy-on-Write.
train['artist_name'] = train['artist_name'].fillna('no_artist')
test['artist_name'] = test['artist_name'].fillna('no_artist')

# 1 when the artist string mentions 'feat', else 0; stored as int8.
train['is_featured'] = train['artist_name'].apply(is_featured).astype(np.int8)
test['is_featured'] = test['artist_name'].apply(is_featured).astype(np.int8)

def artist_count(x):
    """Count the joining tokens ('and', ',', 'feat', '&') found in an artist string.

    Three exact values are special-cased: 'no_artist' and '佚名' give 0,
    '群星' gives -1.

    NOTE(review): the tokens are matched as plain substrings, so a name that
    merely contains 'and' is counted too, and a single plain name yields 0
    rather than 1.
    """
    fixed_values = {'no_artist': 0, '佚名': 0, '群星': -1}
    if x in fixed_values:
        return fixed_values[x]
    return sum(x.count(token) for token in ('and', ',', 'feat', '&'))

# Joining-token count per artist string, stored as int8.
train['artist_count'] = train['artist_name'].map(artist_count).astype(np.int8)
test['artist_count'] = test['artist_name'].map(artist_count).astype(np.int8)

# 1 when the artist string is identical to the composer string.
train['artist_composer'] = train['artist_name'].eq(train['composer']).astype(np.int8)
test['artist_composer'] = test['artist_name'].eq(test['composer']).astype(np.int8)

# 1 when artist, composer and lyricist strings are all identical.
train['artist_composer_lyricist'] = (
    train['artist_name'].eq(train['composer'])
    & train['artist_name'].eq(train['lyricist'])
    & train['composer'].eq(train['lyricist'])
).astype(np.int8)
test['artist_composer_lyricist'] = (
    test['artist_name'].eq(test['composer'])
    & test['artist_name'].eq(test['lyricist'])
    & test['composer'].eq(test['lyricist'])
).astype(np.int8)

In [7]:
def song_lang_boolean(x):
    """Return 1 when the text form of ``x`` contains '17.0' or '45.0', else 0."""
    as_text = str(x)
    return int(any(code in as_text for code in ('17.0', '45.0')))


# Flag rows whose language code matches one of the two codes tested by
# song_lang_boolean(); stored as int8.
train['song_lang_boolean'] = train['language'].map(song_lang_boolean).astype(np.int8)
test['song_lang_boolean'] = test['language'].map(song_lang_boolean).astype(np.int8)

# Mean song length over the training rows only; also used as the threshold
# for the test rows.
_mean_song_length = train['song_length'].mean()
def smaller_song(x):
    """Return 1 when ``x`` is strictly below the training-set mean song length, else 0."""
    return 1 if x < _mean_song_length else 0


# Shorter-than-average flag per row, stored as int8.
train['smaller_song'] = train['song_length'].map(smaller_song).astype(np.int8)
test['smaller_song'] = test['song_length'].map(smaller_song).astype(np.int8)


# Number of rows in which each song_id appears, computed separately for
# train and for test.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
_dict_count_song_played_train = dict(train['song_id'].value_counts().items())
_dict_count_song_played_test = dict(test['song_id'].value_counts().items())
def count_song_played(x):
    """Return how many rows contain song ``x``.

    The train count is used whenever the song occurs in train; otherwise the
    test count is used; a song seen in neither yields 0.
    """
    if x in _dict_count_song_played_train:
        return _dict_count_song_played_train[x]
    return _dict_count_song_played_test.get(x, 0)
    

# Per-row song occurrence count, stored as int64.
train['count_song_played'] = train['song_id'].map(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].map(count_song_played).astype(np.int64)

# Number of rows in which each artist_name appears, computed separately for
# train and for test.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
_dict_count_artist_played_train = dict(train['artist_name'].value_counts().items())
_dict_count_artist_played_test = dict(test['artist_name'].value_counts().items())
def count_artist_played(x):
    """Return how many rows contain artist ``x``.

    The train count is used whenever the artist occurs in train; otherwise the
    test count is used; an artist seen in neither yields 0.
    """
    if x in _dict_count_artist_played_train:
        return _dict_count_artist_played_train[x]
    return _dict_count_artist_played_test.get(x, 0)

# Per-row artist occurrence count, stored as int64.
train['count_artist_played'] = train['artist_name'].map(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].map(count_artist_played).astype(np.int64)



## Training

In [None]:
print ("Train test and validation sets")
# Every column that is object-typed in train is cast to 'category' in both
# frames; all other columns are left as they are.
for col in train.columns:
    if train[col].dtype != object:
        continue
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

# Labels and feature matrix for training.
y_train = train['target'].values
X_train = train.drop(columns=['target'])

# Row ids and feature matrix for scoring.
ids = test['id'].values
X_test = test.drop(columns=['id'])

# The merged frames are no longer needed once the matrices exist.
del train, test; gc.collect()

# Hold out as many trailing training rows as there are test rows.
m = X_test.shape[0]

# Validation split: everything except the last m rows vs. the last m rows.
d_train_final = lgb.Dataset(X_train[:-m], y_train[:-m])
watchlist_final = lgb.Dataset(X_train[-m:], y_train[-m:])

# Full training set, used for the final fit.
d_train = lgb.Dataset(X_train, y_train)

Train test and validation sets


In [None]:
params = {
    'objective': 'binary',
    'learning_rate': 0.3,
    'num_leaves': 226,
    'max_depth': 25,
    'num_rounds': 200,
    'metric' : 'auc',
    'lambda_l2': 0.5,
}

%time model_f1 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=1)



[1]	valid_0's auc: 0.647958
[2]	valid_0's auc: 0.654179
[3]	valid_0's auc: 0.657716


In [None]:
# Settings for the final fit: lower learning rate, log-loss as the reported metric.
params = dict(
    objective='binary',
    learning_rate=0.03,
    num_leaves=226,
    max_depth=25,
    num_rounds=200,
    metric='binary_logloss',
    lambda_l2=0.5,
)

# Refit on the complete training set; this rebinds model_f1. The only set in
# valid_sets is the training data itself, so the value logged every 10 rounds
# is the training loss, not a held-out score.
model_f1 = lgb.train(
    params,
    train_set=d_train,
    valid_sets=[d_train],
    verbose_eval=10,
)

In [None]:
# Score the test feature matrix with whichever model was last bound to model_f1.
p_test_1 = model_f1.predict(X_test)

In [12]:
# Start from the sample submission file and overwrite its target column.
submission = pd.read_csv(data_path + 'sample_submission.csv')
# Item assignment always writes a column; attribute assignment
# (submission.target = ...) would only set a Python attribute, leaving the
# written file unchanged, if the 'target' column were ever absent.
# NOTE(review): this assumes the sample file's rows are in the same order as
# test.csv; the `ids` array collected earlier is not used here - confirm.
submission['target'] = p_test_1
# Gzip-compressed CSV, predictions rounded to five decimals, no index column.
submission.to_csv('xentropy7.csv.gz', compression='gzip', index=False, float_format='%.5f')