In [1]:
# https://www.kaggle.com/asmitavikas/feature-engineered-0-68310
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math

In [3]:
print('Loading data...')
data_path = './kaggle/'
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
songs = pd.read_csv(data_path + 'songs.csv')
members = pd.read_csv(data_path + 'members.csv',
                     parse_dates=['registration_init_time','expiration_date'])
songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')
print('Done loading...')

Loading data...
Done loading...


In [5]:
# Convert to categorical
def to_categorical(X):
    print("Before convert to categorical")
    print(X.info())
    X_cat = pd.concat([
        X.select_dtypes([], ['object']),
        X.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex_axis(X.columns, axis=1)
    print("After convertion")
    print(X_cat.info())
    return X_cat

In [7]:
train_cat=to_categorical(train)
test_cat=to_categorical(test)
songs_cat=to_categorical(songs)
members_cat=to_categorical(members)

Before convert to categorical
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
target                int64
dtypes: int64(1), object(5)
memory usage: 337.7+ MB
None
After convertion
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
msno                  category
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                int64
dtypes: category(5), int64(1)
memory usage: 122.6 MB
None
Before convert to categorical
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556790 entries, 0 to 2556789
Data columns (total 6 columns):
id                    int64
msno                  object
song_id               object
source_system_tab     object
s

In [8]:
# Merge songs
train_cat = train_cat.merge(songs, on='song_id', how='left')
test_cat = test_cat.merge(songs, on='song_id', how='left')

In [9]:
# Engineering on members
members_cat['membership_days'] = members_cat['expiration_date'].subtract(members_cat['registration_init_time']).dt.days.astype(int)

members_cat['registration_year'] = members_cat['registration_init_time'].dt.year
members_cat['registration_month'] = members_cat['registration_init_time'].dt.month
members_cat['registration_date'] = members_cat['registration_init_time'].dt.day

members_cat['expiration_year'] = members_cat['expiration_date'].dt.year
members_cat['expiration_month'] = members_cat['expiration_date'].dt.month
members_cat['expiration_date'] = members_cat['expiration_date'].dt.day
members_cat = members_cat.drop(['registration_init_time'], axis=1)

In [11]:
# Merge members
train_cat = train_cat.merge(members_cat, on='msno', how='left')
test_cat = test_cat.merge(members_cat, on='msno', how='left')

In [10]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

In [12]:
# Process songs_extra
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

In [13]:
# Merge songs_extra
train_cat = train_cat.merge(songs_extra, on = 'song_id', how = 'left')
train_cat.song_length.fillna(200000,inplace=True)
train_cat.song_length = train_cat.song_length.astype(np.uint32)
train_cat.song_id = train_cat.song_id.astype('category')


test_cat = test_cat.merge(songs_extra, on = 'song_id', how = 'left')
test_cat.song_length.fillna(200000,inplace=True)
test_cat.song_length = test_cat.song_length.astype(np.uint32)
test_cat.song_id = test_cat.song_id.astype('category')

In [14]:
train_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 24 columns):
msno                  object
song_id               category
source_system_tab     object
source_screen_name    object
source_type           object
target                int64
song_length           uint32
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
city                  int64
bd                    int64
gender                object
registered_via        int64
expiration_date       int64
membership_days       int64
registration_year     int64
registration_month    int64
registration_date     int64
expiration_year       int64
expiration_month      int64
song_year             float64
dtypes: category(1), float64(2), int64(11), object(9), uint32(1)
memory usage: 1.3+ GB


In [15]:
test_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 24 columns):
id                    int64
msno                  object
song_id               category
source_system_tab     object
source_screen_name    object
source_type           object
song_length           uint32
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
city                  int64
bd                    int64
gender                object
registered_via        int64
expiration_date       int64
membership_days       int64
registration_year     int64
registration_month    int64
registration_date     int64
expiration_year       int64
expiration_month      int64
song_year             float64
dtypes: category(1), float64(2), int64(11), object(9), uint32(1)
memory usage: 469.9+ MB


In [16]:
# Convert to categorical due to merging songs_extra
train_1=to_categorical(train_cat)
test_1=to_categorical(test_cat)

Before convert to categorical
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 24 columns):
msno                  object
song_id               category
source_system_tab     object
source_screen_name    object
source_type           object
target                int64
song_length           uint32
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
city                  int64
bd                    int64
gender                object
registered_via        int64
expiration_date       int64
membership_days       int64
registration_year     int64
registration_month    int64
registration_date     int64
expiration_year       int64
expiration_month      int64
song_year             float64
dtypes: category(1), float64(2), int64(11), object(9), uint32(1)
memory usage: 1.3+ GB
None
After convertion
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 e

In [17]:
# Output to csv
train_1.to_csv(data_path+"train_engineered.csv")
test_1.to_csv(data_path+"test_engineered.csv")

Phase 1 feature engineering:
0.66770 without cv

# # Phase 2 feature engineering
# See if there will be improvements

In [18]:
def genre_id_count(x):
    if x == 'no_genre_id':
        return 0
    else:
        return x.count('|') + 1

In [19]:
train_1['genre_ids'].fillna('no_genre_id',inplace=True)
test_1['genre_ids'].fillna('no_genre_id',inplace=True)
train_1['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test_1['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)

ValueError: fill value must be in categories

In [None]:
def lyricist_count(x):
    if x == 'no_lyricist':
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
    return sum(map(x.count, ['|', '/', '\\', ';']))

In [None]:
train['lyricist'].fillna('no_lyricist',inplace=True)
test['lyricist'].fillna('no_lyricist',inplace=True)
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].apply(lyricist_count).astype(np.int8)