In [3]:
# https://www.kaggle.com/asmitavikas/feature-engineered-0-68310
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math

In [4]:
print('Loading data...')
data_path = './raw/'
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
songs = pd.read_csv(data_path + 'songs.csv')
members = pd.read_csv(data_path + 'members.csv',
                     parse_dates=['registration_init_time','expiration_date'])
songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')
print('Done loading...')

Loading data...
Done loading...


In [5]:
songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
song_id        object
song_length    int64
genre_ids      object
artist_name    object
composer       object
lyricist       object
language       float64
dtypes: float64(1), int64(1), object(5)
memory usage: 122.6+ MB


In [6]:
songs['genre_ids']

0                      465
1                      444
2                      465
3                      465
4                      726
5          864|857|850|843
6                      458
7                      465
8                      465
9                 352|1995
10                    2157
11                     465
12                     726
13                     458
14                     359
15                     359
16                     458
17                     465
18                     726
19                     465
20                     465
21                     465
22                     465
23                     465
24                    1609
25                    1609
26                     465
27                     139
28                    1609
29                     465
                ...       
2296290               1259
2296291                880
2296292                465
2296293                958
2296294                465
2296295        139|125|109
2

In [7]:
songs['artist_name']

0                                           張信哲 (Jeff Chang)
1                                                  BLACKPINK
2                                               SUPER JUNIOR
3                                                      S.H.E
4                                                       貴族精選
5                                                       貴族精選
6                                            伍佰 & China Blue
7                                          光良 (Michael Wong)
8                                               林俊傑 (JJ Lin)
9                                                   Kodaline
10                                       D.L 羅時豐 (Daniel Lo)
11                                                  白安 (Ann)
12                                                Littlesong
13                                             蔡旻佑 (Evan Yo)
14                                                  Coldplay
15                                             Maggie Rogers
16                      

In [8]:
# Convert to lower case
# NaN values are kept for replacement
songs['artist_name']=songs['artist_name'].str.lower()
songs['composer']=songs['composer'].str.lower()
songs['lyricist']=songs['lyricist'].str.lower()

In [9]:
# Convert to categorical
def to_categorical(X):
    print("Before convert to categorical")
    print(X.info())
    X_cat = pd.concat([
        X.select_dtypes([], ['object']),
        X.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex_axis(X.columns, axis=1)
    print("After convertion")
    print(X_cat.info())
    return X_cat

In [10]:
train_cat=to_categorical(train)
test_cat=to_categorical(test)
songs_cat=to_categorical(songs)
members_cat=to_categorical(members)

Before convert to categorical
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
target                int64
dtypes: int64(1), object(5)
memory usage: 337.7+ MB
None


  


After convertion
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
msno                  category
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                int64
dtypes: category(5), int64(1)
memory usage: 133.8 MB
None
Before convert to categorical
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556790 entries, 0 to 2556789
Data columns (total 6 columns):
id                    int64
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
dtypes: int64(1), object(5)
memory usage: 117.0+ MB
None
After convertion
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556790 entries, 0 to 2556789
Data columns (total 6 columns):
id                    int64
msno                  category
song_id               category
source_system_tab     category
source_s

In [11]:
# Merge songs
train_cat = train_cat.merge(songs, on='song_id', how='left')
test_cat = test_cat.merge(songs, on='song_id', how='left')

In [12]:
# Engineering on members
members_cat['membership_days'] = members_cat['expiration_date'].subtract(members_cat['registration_init_time']).dt.days.astype(int)

members_cat['registration_year'] = members_cat['registration_init_time'].dt.year
members_cat['registration_month'] = members_cat['registration_init_time'].dt.month
members_cat['registration_date'] = members_cat['registration_init_time'].dt.day

members_cat['expiration_year'] = members_cat['expiration_date'].dt.year
members_cat['expiration_month'] = members_cat['expiration_date'].dt.month
members_cat['expiration_date'] = members_cat['expiration_date'].dt.day
members_cat = members_cat.drop(['registration_init_time'], axis=1)

In [8]:
# Merge members
train_cat = train_cat.merge(members_cat, on='msno', how='left')
test_cat = test_cat.merge(members_cat, on='msno', how='left')

In [9]:
train_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 23 columns):
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
target                int64
song_length           float64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
city                  int64
bd                    int64
gender                object
registered_via        int64
expiration_date       int64
membership_days       int64
registration_year     int64
registration_month    int64
registration_date     int64
expiration_year       int64
expiration_month      int64
dtypes: float64(2), int64(11), object(10)
memory usage: 1.3+ GB


In [10]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

In [11]:
songs_extra.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2295971 entries, 0 to 2295970
Data columns (total 3 columns):
song_id    object
name       object
isrc       object
dtypes: object(3)
memory usage: 52.6+ MB


In [12]:
# Process songs_extra
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

In [13]:
# Merge songs_extra
train_cat = train_cat.merge(songs_extra, on = 'song_id', how = 'left')
train_cat.song_length.fillna(200000,inplace=True)
train_cat.song_length = train_cat.song_length.astype(np.uint32)
train_cat.song_id = train_cat.song_id.astype('category')


test_cat = test_cat.merge(songs_extra, on = 'song_id', how = 'left')
test_cat.song_length.fillna(200000,inplace=True)
test_cat.song_length = test_cat.song_length.astype(np.uint32)
test_cat.song_id = test_cat.song_id.astype('category')

In [13]:
train_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 12 columns):
msno                  category
song_id               object
source_system_tab     category
source_screen_name    category
source_type           category
target                int64
song_length           float64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
dtypes: category(4), float64(2), int64(1), object(5)
memory usage: 543.2+ MB


In [14]:
test_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 12 columns):
id                    int64
msno                  category
song_id               object
source_system_tab     category
source_screen_name    category
source_type           category
song_length           float64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
dtypes: category(4), float64(2), int64(1), object(5)
memory usage: 188.6+ MB


In [15]:
# Convert to categorical due to merging songs_extra
train_1=to_categorical(train_cat)
test_1=to_categorical(test_cat)

Before convert to categorical
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 12 columns):
msno                  category
song_id               object
source_system_tab     category
source_screen_name    category
source_type           category
target                int64
song_length           float64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
dtypes: category(4), float64(2), int64(1), object(5)
memory usage: 543.2+ MB
None


  


After convertion
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 12 columns):
msno                  category
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                int64
song_length           float64
genre_ids             category
artist_name           category
composer              category
lyricist              category
language              float64
dtypes: category(9), float64(2), int64(1)
memory usage: 407.4 MB
None
Before convert to categorical
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 12 columns):
id                    int64
msno                  category
song_id               object
source_system_tab     category
source_screen_name    category
source_type           category
song_length           float64
genre_ids             object
artist_name           object
composer             

In [16]:
# Output to csv
train_1.to_csv("train_engineered.csv")
test_1.to_csv("test_engineered.csv")

Phase 1 feature engineering:
0.66770 without cv

# # Phase 2 feature engineering
# See if there will be improvements

In [29]:
# Copy to new start
train_2=train_1.copy()
test_2=test_1.copy()

In [27]:
# GC old df
import gc
del train, test
del songs, members
gc.collect()

NameError: name 'train' is not defined

In [30]:
def genre_id_count(x):
    if x == NO_GENRE_FILL:
        return 0
    else:
        return x.count('|') + 1

In [31]:
NO_GENRE_FILL="empty"
# this add_categories can only be run once
train_2['genre_ids']=train_2['genre_ids'].cat.add_categories([NO_GENRE_FILL])
test_2['genre_ids']=test_2['genre_ids'].cat.add_categories([NO_GENRE_FILL])

train_2['genre_ids'].fillna(NO_GENRE_FILL,inplace=True)
test_2['genre_ids'].fillna(NO_GENRE_FILL,inplace=True)
train_2['genre_ids_count'] = train_2['genre_ids'].apply(genre_id_count).astype(np.int8)
test_2['genre_ids_count'] = test_2['genre_ids'].apply(genre_id_count).astype(np.int8)

In [32]:
def lyricist_count(x):
    if x == NO_LYRICIST_FILL:
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
    return sum(map(x.count, ['|', '/', '\\', ';']))

In [36]:
train_2['lyricist'].cat.categories

Index([' ', ' al stillman|robert allen ', ' bruno martino|bruno brighetti',
       ' don black', ' ivan lins',
       ' korean lyrics by 100%seojung (12.5%) albi albertsson',
       ' korean lyrics by cho| yun kyoung (12.5%) albi albertsson / yuka otsuki / fabian strangl',
       ' korean lyrics by cho| yun kyoung (12.5%) command freaks / andreas stone johansson',
       ' korean lyrics by cho| yun kyoung (12.5%) paul 'marz' thompson (17%) / noah nwachukwu (14.32%) / adrian mckinnon / jeremy tyrone jasper / leven kali / jamil "digi" chammas / otha ‘vakseen' davis iii (3.58%) / mzmc (0.9%)',
       ' korean lyrics by cho| yun kyoung (5%) / january 8th (2%) / kim| dong hyun (3%) teddy riley| dom| lee| hyun seung for (trx) / j.sol (jason j lopez) / dantae johnson',
       ...
       'ｔｅｒｒｙ　ｃａｌｌｉｅｒ', 'ｔｈｅ　ｓｕｐｅｒ　ｂａｌｌ', 'ｔｓｕｒｕ', 'ｕ－ｑｙｏ', 'ｲﾄｦｶｼ', 'ｹﾂﾒｲｼ',
       'ﾊﾔｼｹｲ', 'ﾋﾞｯｹﾌﾞﾗﾝｶ', 'ﾛｰﾄﾞｵﾌﾞﾒｼﾞｬｰ', 'empty'],
      dtype='object', length=33635)

In [38]:
NO_LYRICIST_FILL="empty"
# this add_categories can only be run once
train_2['lyricist']=train_2['lyricist'].cat.add_categories([NO_LYRICIST_FILL])
test_2['lyricist']=test_2['lyricist'].cat.add_categories([NO_LYRICIST_FILL])

train_2['lyricist'].fillna(NO_LYRICIST_FILL,inplace=True)
test_2['lyricist'].fillna(NO_LYRICIST_FILL,inplace=True)
train_2['lyricists_count'] = train_2['lyricist'].apply(lyricist_count).astype(np.int8)
test_2['lyricists_count'] = test_2['lyricist'].apply(lyricist_count).astype(np.int8)

In [39]:
def composer_count(x):
    if x == NO_COMPOSER_FILL:
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

In [40]:
NO_COMPOSER_FILL="empty"
# this add_categories can only be run once
train_2['composer']=train_2['composer'].cat.add_categories([NO_COMPOSER_FILL])
test_2['composer']=test_2['composer'].cat.add_categories([NO_COMPOSER_FILL])

train_2['composer'].fillna(NO_COMPOSER_FILL,inplace=True)
test_2['composer'].fillna(NO_COMPOSER_FILL,inplace=True)
train_2['composer_count'] = train_2['composer'].apply(composer_count).astype(np.int8)
test_2['composer_count'] = test_2['composer'].apply(composer_count).astype(np.int8)

In [41]:
NO_ARTIST_FILL='empty'
# this add_categories can only be run once
train_2['artist_name']=train_2['artist_name'].cat.add_categories([NO_ARTIST_FILL])
test_2['artist_name']=test_2['artist_name'].cat.add_categories([NO_ARTIST_FILL])

train_2['artist_name'].fillna(NO_ARTIST_FILL,inplace=True)
test_2['artist_name'].fillna(NO_ARTIST_FILL,inplace=True)

In [48]:
# if artist is same as composer
train_2['artist_is_composer'] = (train_2['artist_name'] == train_2['composer']).astype(np.int8)
test_2['artist_is_composer'] = (test_2['artist_name'] == test_2['composer']).astype(np.int8)

TypeError: Categoricals can only be compared if 'categories' are the same

In [None]:
# if artist, lyricist and composer are all three same
train_2['artist_is_composer_is_lyricist'] = ((train_2['artist_name'] == train['composer']) & (train['artist_name'] == train['lyricist']) & (train['composer'] == train['lyricist'])).astype(np.int8)
test_2['artist_is_composer_is_lyricist'] = ((test_2['artist_name'] == test['composer']) & (test['artist_name'] == test['lyricist']) & (test['composer'] == test['lyricist'])).astype(np.int8)

In [42]:
def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0

In [43]:
# Find if the song is featured
train_2['is_featured'] = train_2['artist_name'].apply(is_featured).astype(np.int8)
test_2['is_featured'] = test_2['artist_name'].apply(is_featured).astype(np.int8)

In [44]:
# Look at artist names that contains 'feat'
check_feat=train_2.loc[train_2['is_featured'] == 1]
check_feat['artist_name']

207                alex goot feat. kurt hugo schneider & atc
2217                            mark ronson feat. bruno mars
4291              a great big world feat. christina aguilera
5702                         madilyn bailey feat. josh evans
5824                            mark ronson feat. bruno mars
6139                               doriko feat. hatsune miku
6325                            mark ronson feat. bruno mars
6530                           ruslan-set featuring eva kade
6574                            mark ronson feat. bruno mars
6806                                       madeon feat. kyan
7323                            the chainsmokers feat. rozes
7511                            mark ronson feat. bruno mars
8089                            mark ronson feat. bruno mars
8099                        armin van buuren feat. mr. probz
9074                            mark ronson feat. bruno mars
9348                            the chainsmokers feat. rozes
10144                   

In [45]:
def feat_artist_name(x):
    if 'feat.' in str(x) :
        idx=x.find('feat.')
        feat_name=x[idx+5:].strip()
#         print("Feating names %s from %s" % (feat_name, x))
        return feat_name
    elif 'feat' in str(x) :
        idx=x.find('feat')
        feat_name=x[idx+5:].strip()
#         print("Feating names %s from %s" % (feat_name, x))
        return feat_name
    return "no_feat"

In [46]:
def main_artist_name(x):
    if 'feat' in str(x) :
        idx=x.find('feat.')
        main_name=x[:idx].strip()
#         print("Main artist name %s from %s" % (main_name, x))
        return main_name

    return x

In [47]:
# Split featuring artist names
train_2['feat_artist'] = train_2['artist_name'].apply(feat_artist_name)
test_2['feat_artist'] = test_2['artist_name'].apply(feat_artist_name)

train_2['main_artist'] = train_2['artist_name'].apply(main_artist_name)
test_2['main_artist'] = test_2['artist_name'].apply(main_artist_name)


In [106]:
def artist_count(x):
    if x == 'no_artist':
        return 0
    else:
        return x.count('and') + x.count(',') + x.count('feat') + x.count('&')

In [107]:
train_2['artist_count'] = train_2['artist_name'].apply(artist_count).astype(np.int8)
test_2['artist_count'] = test_2['artist_name'].apply(artist_count).astype(np.int8)

In [112]:
def is_remix(x):
    if 'remix' in str(x):
        return 1
    return 0

In [113]:
train_2['is_remix']=train_2['artist_name'].apply(is_remix)
test_2['is_remix']=test_2['artist_name'].apply(is_remix)

In [114]:
# Look at artist names that contains 'remix'
check_remix=train_2.loc[train_2['is_remix'] == 1]
check_remix['artist_name']

2901                        dj remix factory
26418                         群星 (remix 101)
30596      various artists (bhangra remixes)
38607                       dj remix factory
53650                         remix radio dj
55112      various artists (bhangra remixes)
110496                        remix radio dj
110499                   dance remix factory
110500                      dance remix inc.
112951                      dj remix factory
112952                      dj remix factory
112954                      dj remix factory
112958                      dj remix factory
117139     various artists (bhangra remixes)
119215                      dj remix factory
119216                      dj remix factory
119222                      dj remix factory
127261                      dj remix factory
146015                      dj remix factory
146028                      dj remix factory
175439                        群星 (remix 101)
177061                 workout remix factory
180077    

In [118]:
def is_live(x):
    # some sings name start with 'live'
    if ' live' in str(x):
        return 1
    return 0

In [119]:
# if the song is live
train_2['is_live']=train_2['artist_name'].apply(is_live)
test_2['is_live']=test_2['artist_name'].apply(is_live)

In [120]:
# Look at artist names that contains 'remix'
check_live=train_2.loc[train_2['is_live'] == 1]
check_live['artist_name']

37025                 hillsong live
37027                 hillsong live
37028                 hillsong live
39879                 hillsong live
39880                 hillsong live
39881                 hillsong live
50863      romeo & juliette-en live
52756      romeo & juliette-en live
52788      romeo & juliette-en live
52792      romeo & juliette-en live
52818      romeo & juliette-en live
77410      romeo & juliette-en live
111619                hillsong live
126992     romeo & juliette-en live
129796        chuckie & jetlag live
141406               vh1 divas live
141415               vh1 divas live
141417               vh1 divas live
141418               vh1 divas live
141420               vh1 divas live
141423               vh1 divas live
141424               vh1 divas live
141429               vh1 divas live
141430               vh1 divas live
141435               vh1 divas live
141439               vh1 divas live
141445               vh1 divas live
141449               vh1 div

In [121]:
def is_acoustic(x):
    if 'acoustic' in str(x):
        return 1
    return 0

In [122]:
train_2['is_acoustic']=train_2['artist_name'].apply(is_acoustic)
test_2['is_acoustic']=test_2['artist_name'].apply(is_acoustic)

In [123]:
# Look at artist names that contains 'remix'
check_acoustic=train_2.loc[train_2['is_acoustic'] == 1]
check_acoustic['artist_name']

4080                             jacoustic
29654              10cm & acoustic collabo
44723                        the acoustics
101718                    vanilla acoustic
101719                    vanilla acoustic
101720                    vanilla acoustic
101721                    vanilla acoustic
105210                    vanilla acoustic
105211                    vanilla acoustic
105212                    vanilla acoustic
149393                       acoustic hits
165049                    vanilla acoustic
179507                    acoustic alchemy
206928             10cm & acoustic collabo
226086     acoustic guitar tribute players
226087     acoustic guitar tribute players
226089     acoustic guitar tribute players
226090     acoustic guitar tribute players
226092     acoustic guitar tribute players
227186     acoustic guitar tribute players
227188     acoustic guitar tribute players
227191     acoustic guitar tribute players
227192     acoustic guitar tribute players
228016     

In [127]:
def is_instrumental(x):
    if 'instrumental' in str(x) or 'instru' in str(x):
        return 1
    return 0

In [128]:
train_2['is_instrumental']=train_2['artist_name'].apply(is_instrumental)
test_2['is_instrumental']=test_2['artist_name'].apply(is_instrumental)

In [129]:
# Look at artist names that contains 'instrumental'
check_instrumental=train_2.loc[train_2['is_instrumental'] == 1]
check_instrumental['artist_name']

22075      piano music for christmas & instrumental chris...
22076      piano music for christmas & instrumental chris...
22077      piano music for christmas & instrumental chris...
41854                                the art of instrumental
46749                                the art of instrumental
46750                                the art of instrumental
65832      the mortal instruments: city of bones (origina...
79827                                           instrumental
107346     the mortal instruments: city of bones (origina...
108134                               the art of instrumental
110581     the mortal instruments: city of bones (origina...
110583     the mortal instruments: city of bones (origina...
110585     the mortal instruments: city of bones (origina...
153033     the mortal instruments: city of bones (origina...
165544     the mortal instruments: city of bones (origina...
174091         be still & know - instrumental songs of faith
185500                  

In [130]:
# Magic feature
# is song language 17 or 45. 
def song_lang_boolean(x):
    if '17.0' in str(x) or '45.0' in str(x):
        return 1
    return 0

train_2['song_lang_magic'] = train_2['language'].apply(song_lang_boolean).astype(np.int8)
test_2['song_lang_magic'] = test_2['language'].apply(song_lang_boolean).astype(np.int8)

In [134]:
# Song length
mean_song_length = np.mean(train_2['song_length'])
std_song_length = np.std(train_2['song_length'])
print("Song length mean: %f  std: %f" % (mean_song_length, std_song_length))

def shorter_song(x):
    if x < mean_song_length:
        return 1
    return 0

train_2['short_song'] = train_2['song_length'].apply(shorter_song).astype(np.int8)
test_2['short_song'] = test_2['song_length'].apply(shorter_song).astype(np.int8)

Song length mean: 245120.272126  std: 67344.419765


In [135]:
# How many std is the song length from mean
def mean_length_dist(x):
    num_std=(x-mean_song_length)/std_song_length
    return num_std

train_2['mean_length_distance'] = train_2['song_length'].apply(shorter_song).astype(np.float16)
test_2['mean_length_distance'] = test_2['song_length'].apply(shorter_song).astype(np.float16)

In [136]:
# number of times a song has been played before
_dict_count_song_played_train = {k: v for k, v in train_2['song_id'].value_counts().iteritems()}
_dict_count_song_played_test = {k: v for k, v in test_2['song_id'].value_counts().iteritems()}
def count_song_played(x):
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        try:
            return _dict_count_song_played_test[x]
        except KeyError:
            return 0
    

train_2['count_song_played'] = train_2['song_id'].apply(count_song_played).astype(np.int64)
test_2['count_song_played'] = test_2['song_id'].apply(count_song_played).astype(np.int64)

In [137]:
# number of times the artist has been played
_dict_count_artist_played_train = {k: v for k, v in train_2['artist_name'].value_counts().iteritems()}
_dict_count_artist_played_test = {k: v for k, v in test_2['artist_name'].value_counts().iteritems()}
def count_artist_played(x):
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        try:
            return _dict_count_artist_played_test[x]
        except KeyError:
            return 0

train_2['count_artist_played'] = train_2['artist_name'].apply(count_artist_played).astype(np.int64)
test_2['count_artist_played'] = test_2['artist_name'].apply(count_artist_played).astype(np.int64)


End of phase 2


In [138]:
# Check categories
train_2['artist_name'].cat

<pandas.core.categorical.CategoricalAccessor object at 0x7fb2f250ada0>

In [139]:
# Output to csv
train_2.to_csv("train_engineered.csv")
test_2.to_csv("test_engineered.csv")

# Phase 3