In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn import metrics
from sklearn.metrics import accuracy_score
import re
import random
import gc
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [21]:
print('start reading...')
prefix = "/Users/isabelniu/Documents/HKUST-20Spring/ML/Project/CompleteDataSet/KKBOX/"

# train.csv: identifier and source-context columns are read as strings, the
# label as an unsigned byte.
train = pd.read_csv(
    prefix + 'train.csv',
    dtype={
        **dict.fromkeys(
            ['msno', 'source_system_tab', 'source_screen_name', 'source_type', 'song_id'],
            'object'),
        'target': np.uint8,
    },
)

# test.csv: same string columns as train.csv, but no 'target' dtype is given.
test = pd.read_csv(
    prefix + 'test.csv',
    dtype=dict.fromkeys(
        ['msno', 'source_system_tab', 'source_screen_name', 'source_type', 'song_id'],
        'object'),
)

# songs.csv: every listed metadata column is read as a string.
songs = pd.read_csv(
    prefix + 'songs.csv',
    dtype=dict.fromkeys(
        ['genre_ids', 'language', 'artist_name', 'composer', 'lyricist', 'song_id'],
        'object'),
)

# members.csv: the two date columns are parsed into datetimes on load.
members = pd.read_csv(
    prefix + 'members.csv',
    dtype={
        **dict.fromkeys(['city', 'gender', 'registered_via'], 'object'),
        'bd': np.uint8,
    },
    parse_dates=['registration_init_time', 'expiration_date'],
)

# song_extra_info.csv is read with pandas' inferred dtypes.
songs_extra = pd.read_csv(prefix + 'song_extra_info.csv')
print('done reading')

start reading...
done reading


In [22]:
# Convert object-dtype (string) columns to pandas categoricals.
def object2cat(df):
    """Cast every object-dtype column of ``df`` to ``category``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Columns of any other dtype are left untouched.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.
    """
    for name in df.select_dtypes(include=['object']).columns.tolist():
        df[name] = df[name].astype('category')
# object2cat mutates its argument, so each frame's object columns are
# converted to categoricals in place.
object2cat(train)
object2cat(test)
object2cat(songs)
object2cat(members)

In [23]:
# Sanity check for object2cat: one representative string column per frame
# must now have the category dtype.
assert train['msno'].dtype == 'category'
assert test['source_system_tab'].dtype == 'category'
assert songs['language'].dtype == 'category'
assert members['city'].dtype == 'category'

In [24]:
# Preview the extra song info (song_id, name, isrc).
songs_extra.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


In [25]:
# Song release years are derived from the ISRC codes further down in this cell.
print('Data merging...')

# Left-join the song metadata onto the train and test interaction rows.
train = pd.merge(train, songs, how='left', on='song_id')
test = pd.merge(test, songs, how='left', on='song_id')

# Derive membership features from the two member date columns. Both Series are
# captured up front because 'expiration_date' is overwritten with its
# day-of-month at the end of this section.
_registered = members['registration_init_time']
_expires = members['expiration_date']

# Whole days between registration and expiration.
members['membership_days'] = (_expires - _registered).dt.days.astype(int)

members['registration_year'] = _registered.dt.year
members['registration_month'] = _registered.dt.month
members['registration_date'] = _registered.dt.day

members['expiration_year'] = _expires.dt.year
members['expiration_month'] = _expires.dt.month
members['expiration_date'] = _expires.dt.day
members = members.drop(columns=['registration_init_time'])
del _registered, _expires


# Convert an ISRC code to a four-digit year.
def isrc_to_year(isrc):
    """Return the year encoded in characters 5-6 of an ISRC string.

    The two digits at ``isrc[5:7]`` are read as a year: values above 17 are
    placed in the 1900s, values up to and including 17 in the 2000s.

    Parameters
    ----------
    isrc : str or any
        ISRC code. Anything whose type is not exactly ``str`` (for example a
        missing value) yields ``np.nan``.

    Returns
    -------
    int or float
        The four-digit year, or ``np.nan`` for non-string input.

    Raises
    ------
    ValueError
        If ``isrc[5:7]`` cannot be parsed as an integer.
    """
    if type(isrc) != str:
        return np.nan
    two_digit_year = int(isrc[5:7])
    century = 1900 if two_digit_year > 17 else 2000
    return century + two_digit_year
    

# Derive each song's year from its ISRC code.
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)

# isrc and name are not used after this point.
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

# Attach member features; msno is re-cast to category after each merge.
train = train.merge(members, on='msno', how='left')
train['msno'] = train['msno'].astype('category')
test = test.merge(members, on='msno', how='left')
test['msno'] = test['msno'].astype('category')

# Attach song_year. Missing song lengths are replaced by 200000 before the
# unsigned-integer cast. The filled Series is assigned back to the frame:
# calling fillna(inplace=True) on a column selection is chained assignment,
# which does not update the parent frame under pandas Copy-on-Write.
train = train.merge(songs_extra, on='song_id', how='left')
train['song_length'] = train['song_length'].fillna(200000).astype(np.uint32)
train['song_id'] = train['song_id'].astype('category')


test = test.merge(songs_extra, on='song_id', how='left')
test['song_length'] = test['song_length'].fillna(200000).astype(np.uint32)
test['song_id'] = test['song_id'].astype('category')

# The member and song frames are fully merged in; release their memory.

del members, songs
gc.collect();

print('Done merged')

Data merging...
Done merged


In [26]:
# Inspect the column dtypes of the merged training frame.
train.dtypes

msno                  category
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                   uint8
song_length             uint32
genre_ids             category
artist_name           category
composer              category
lyricist              category
language              category
city                  category
bd                       uint8
gender                category
registered_via        category
expiration_date          int64
membership_days          int64
registration_year        int64
registration_month       int64
registration_date        int64
expiration_year          int64
expiration_month         int64
song_year              float64
dtype: object

In [None]:
#Add category and fill NA
# Placeholder label written into each song-metadata column where the value is
# missing. The label text doubles as the sentinel tested by the *_count
# feature functions in the next cell.
_missing_labels = {
    'genre_ids': 'no_genre_id',
    'lyricist': 'no_lyricist',
    'composer': 'no_composer',
    'artist_name': 'no_artist',
}
for _col, _label in _missing_labels.items():
    # The label is added to the column's categories first, then used as the
    # fill value. The result is assigned back rather than using
    # fillna(inplace=True) on a column selection, which is chained assignment
    # and does not update the frame under pandas Copy-on-Write.
    train[_col] = train[_col].cat.add_categories(_label).fillna(_label)
    test[_col] = test[_col].cat.add_categories(_label).fillna(_label)

In [None]:
# New features
# Number of genre ids attached to a song.
def genre_id_count(x):
    """Count the '|'-separated genre ids in ``x``.

    Parameters
    ----------
    x : str
        Genre id string, or the placeholder ``'no_genre_id'``.

    Returns
    -------
    int
        0 for the placeholder, otherwise the number of '|' characters plus one.
    """
    return 0 if x == 'no_genre_id' else x.count('|') + 1

# Apply genre_id_count to both frames; the result is stored as int8.
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)


# Count lyricists by counting the separators '|', '/', '\\' and ';'.
def lyricist_count(x):
    """Return the number of lyricists named in ``x``.

    Parameters
    ----------
    x : str
        Lyricist string, or the placeholder ``'no_lyricist'``.

    Returns
    -------
    int
        0 for the placeholder, otherwise the total number of separator
        characters ('|', '/', backslash, ';') plus one.
    """
    if x == 'no_lyricist':
        return 0
    # One more name than there are separators between names.
    return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

# Apply lyricist_count to both frames; the result is stored as int8.
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].apply(lyricist_count).astype(np.int8)


# Count composers by counting the separators '|', '/', '\\' and ';'.
def composer_count(x):
    """Return the number of composers named in ``x``.

    Parameters
    ----------
    x : str
        Composer string, or the placeholder ``'no_composer'``.

    Returns
    -------
    int
        0 for the placeholder, otherwise the total number of separator
        characters ('|', '/', backslash, ';') plus one.
    """
    if x == 'no_composer':
        return 0
    separators = 0
    for mark in ('|', '/', '\\', ';'):
        separators += x.count(mark)
    return separators + 1

# Apply composer_count to both frames; the result is stored as int8.
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)
test['composer_count'] = test['composer'].apply(composer_count).astype(np.int8)

# Flag values whose text contains 'feat'.
def is_featured(x):
    """Return 1 if the substring ``'feat'`` occurs in ``str(x)``, else 0.

    The match is a case-sensitive substring test on the string form of ``x``.
    """
    return int('feat' in str(x))

# Apply is_featured to the artist names of both frames; stored as int8.
train['is_featured'] = train['artist_name'].apply(is_featured).astype(np.int8)
test['is_featured'] = test['artist_name'].apply(is_featured).astype(np.int8)

# Count artist separators: 'and', ',', 'feat' and '&'.
def artist_count(x):
    """Return how many artist separators occur in ``x``.

    Parameters
    ----------
    x : str
        Artist name string, or the placeholder ``'no_artist'``.

    Returns
    -------
    int
        0 for the placeholder, otherwise the summed occurrences of the
        substrings 'and', ',', 'feat' and '&'. Unlike the lyricist and
        composer counts, no +1 is added, so a single artist name with no
        separator yields 0.
    """
    if x == 'no_artist':
        return 0
    return sum(x.count(token) for token in ('and', ',', 'feat', '&'))

# Apply artist_count to both frames; the result is stored as int8.
train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
test['artist_count'] = test['artist_name'].apply(artist_count).astype(np.int8)
print('finished adding genre id count, composer count, lyricist count, feated artist and artist count')

In [None]:
#testing features

# Every engineered column must exist and be non-empty in BOTH frames.
# (The composer_count check previously tested train twice and never test.)
for _col in ('genre_ids_count', 'composer_count', 'lyricists_count',
             'is_featured', 'artist_count'):
    assert len(train[_col]) != 0
    assert len(test[_col]) != 0

In [None]:
# 1 when the artist value equals the composer value, else 0.
# Both columns are cast to object before the element-wise comparison.
train['artist_composer'] = (
    train['artist_name'].astype("object")
    .eq(train['composer'].astype("object"))
    .astype(np.int8)
)
test['artist_composer'] = (
    test['artist_name'].astype("object")
    .eq(test['composer'].astype("object"))
    .astype(np.int8)
)

In [None]:
# 1 when artist, composer and lyricist all hold the same value, else 0.
# Each pair is compared element-wise after casting to object, and the three
# pairwise results are AND-ed together.
train['artist_composer_lyricist'] = (
    train['artist_name'].astype("object").eq(train['composer'].astype("object"))
    & train['artist_name'].astype("object").eq(train['lyricist'].astype("object"))
    & train['composer'].astype("object").eq(train['lyricist'].astype("object"))
).astype(np.int8)
test['artist_composer_lyricist'] = (
    test['artist_name'].astype("object").eq(test['composer'].astype("object"))
    & test['artist_name'].astype("object").eq(test['lyricist'].astype("object"))
    & test['composer'].astype("object").eq(test['lyricist'].astype("object"))
).astype(np.int8)



In [None]:
# Count the remaining missing values per column of the training frame.
train.isnull().sum()

In [None]:
# Missing language codes get the placeholder category '0'. The filled Series is
# assigned back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train['language'] = train['language'].cat.add_categories('0').fillna('0')
test['language'] = test['language'].cat.add_categories('0').fillna('0')

In [None]:
# Flag songs whose language code is 17 or 45.
def song_lang_boolean(x):
    """Return 1 if ``str(x)`` contains ``'17.0'`` or ``'45.0'``, else 0.

    This is a substring test on the string form of ``x``, not a numeric
    comparison.
    """
    text = str(x)
    return int('17.0' in text or '45.0' in text)

# Apply song_lang_boolean to both frames; the result is stored as uint8.
train['song_lang_boolean'] = train['language'].apply(song_lang_boolean).astype(np.uint8)
test['song_lang_boolean'] = test['language'].apply(song_lang_boolean).astype(np.uint8)

In [None]:
# Mean song length over the training rows; smaller_song reads this global
# as its threshold.
_mean_song_length = np.mean(train['song_length'])
def smaller_song(x):
    """Return 1 if ``x`` is strictly below the global ``_mean_song_length``, else 0."""
    return 1 if x < _mean_song_length else 0

# Both frames are compared against the same threshold (_mean_song_length,
# computed from train); the flag is stored as int8.
train['smaller_song'] = train['song_length'].apply(smaller_song).astype(np.int8)
test['smaller_song'] = test['song_length'].apply(smaller_song).astype(np.int8)

# Number of rows in which each song_id appears, per frame.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0;
# it yields the same (value, count) pairs.
_dict_count_song_played_train = dict(train['song_id'].value_counts().items())
_dict_count_song_played_test = dict(test['song_id'].value_counts().items())

def count_song_played(x):
    """Look up the play count for song id ``x``.

    The training counts are consulted first; the test counts are used only
    when ``x`` is not a key of the training dict. Returns 0 when ``x`` is in
    neither dict.
    """
    if x in _dict_count_song_played_train:
        return _dict_count_song_played_train[x]
    if x in _dict_count_song_played_test:
        return _dict_count_song_played_test[x]
    return 0
    
# Apply count_song_played to both frames; the result is stored as int64.
train['count_song_played'] = train['song_id'].apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].apply(count_song_played).astype(np.int64)

# Number of rows in which each artist_name appears, per frame.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0;
# it yields the same (value, count) pairs.
_dict_count_artist_played_train = dict(train['artist_name'].value_counts().items())
_dict_count_artist_played_test = dict(test['artist_name'].value_counts().items())

def count_artist_played(x):
    """Look up the play count for artist name ``x``.

    The training counts are consulted first; the test counts are used only
    when ``x`` is not a key of the training dict. Returns 0 when ``x`` is in
    neither dict.
    """
    if x in _dict_count_artist_played_train:
        return _dict_count_artist_played_train[x]
    if x in _dict_count_artist_played_test:
        return _dict_count_artist_played_test[x]
    return 0

# Apply count_artist_played to both frames; the result is stored as int64.
train['count_artist_played'] = train['artist_name'].apply(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].apply(count_artist_played).astype(np.int64)


print("Done adding features")

In [None]:
# Sanity checks for the features added in the previous cells.

# song_lang_boolean: spot-check two rows of train by index label.
# NOTE(review): the expected values for labels 0 and 58 depend on the loaded
# data; confirm they still hold if the input files change.
assert train['song_lang_boolean'][0] == 0
assert train['song_lang_boolean'][58] == 1

# smaller_song: column exists and is non-empty in both frames.
assert len(test['smaller_song']) != 0
assert len(train['smaller_song']) != 0

# count_song_played is present as a column, and count_artist_played is
# non-empty, in both frames.
assert 'count_song_played' in train
assert len(train['count_artist_played']) != 0
assert 'count_song_played' in test
assert len(test['count_artist_played']) != 0

In [None]:
# songs_extra has already been merged into train and test; drop the name.
del songs_extra

In [None]:
# The path is relative to the current working directory. No index argument is
# passed, so the row index is written as the first CSV column (a later cell
# drops that column after re-reading the file).
train.to_csv("CompleteDataSet/processed_train_1.csv") #Saving file to local

In [None]:
# The path is relative to the current working directory. No index argument is
# passed, so the row index is written as the first CSV column (a later cell
# drops that column after re-reading the file).
test.to_csv("CompleteDataSet/processed_test_1.csv") #Saving test file to local

In [4]:
# Reload the processed frames from disk. No dtype mapping is passed, so column
# dtypes are whatever read_csv infers.
# NOTE(review): the files were written with a path relative to the working
# directory but are read from this absolute prefix -- confirm both point to
# the same folder.
prefix = "/Users/isabelniu/Documents/HKUST-20Spring/ML/Project/CompleteDataSet/"
train = pd.read_csv(prefix + "processed_train_1.csv")
test = pd.read_csv(prefix + "processed_test_1.csv")

In [5]:
# Drop the first column of each reloaded frame (the index written by to_csv).
# The axis is given via the `columns=` keyword: passing it positionally, as in
# drop(label, 1), is no longer accepted since pandas 2.0.
train = train.drop(columns=train.columns[0])
test = test.drop(columns=test.columns[0])

In [6]:
# Stack train on top of test so the imputation and label encoding below are
# applied to both at once. ignore_index is not set, so each part keeps its
# own row labels.
all_data = pd.concat([train, test])

In [7]:
# Count missing values per column of the combined frame.
all_data.isnull().sum()

artist_composer                   0
artist_composer_lyricist          0
artist_count                      0
artist_name                       0
bd                                0
city                              0
composer                          0
composer_count                    0
count_artist_played               0
count_song_played                 0
expiration_date                   0
expiration_month                  0
expiration_year                   0
gender                      4013703
genre_ids                         0
genre_ids_count                   0
id                          7377418
is_featured                       0
language                          0
lyricist                          0
lyricists_count                   0
membership_days                   0
msno                              0
registered_via                    0
registration_date                 0
registration_month                0
registration_year                 0
smaller_song                

In [8]:
# Impute missing values with sentinels: -1 for song_year and gender, the
# literal string 'NaN' for the three source-context columns.
na_continuous_col = ['song_year', 'gender']
na_category_col = ['source_system_tab', 'source_screen_name', 'source_type']
for sentinel, columns in ((-1, na_continuous_col), ('NaN', na_category_col)):
    for col in columns:
        all_data[col] = all_data[col].fillna(sentinel)

In [9]:
# Label encoding for the categorical variables
def encoder(x_train):
    """Replace every category/object column of ``x_train`` with integer codes, in place.

    Each qualifying column is cast to ``str`` and passed through
    ``LabelEncoder.fit_transform``. A single encoder object is re-fitted for
    every column, so the fitted mappings are not retained.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame to encode; columns of other dtypes are left untouched.

    Returns
    -------
    None
    """
    label_encoder = preprocessing.LabelEncoder()
    text_like = [name for name in x_train.columns
                 if x_train[name].dtype.name in ('category', 'object')]
    for name in text_like:
        x_train[name] = label_encoder.fit_transform(x_train[name].astype(str))

# Encode the combined frame in place, so train and test rows of a column are
# fitted by the same encoder call.
encoder(all_data)

In [10]:
# Undo the concat: the first len(train) rows of all_data are the training
# rows, everything after them is test.
n = len(train)
train, test = all_data.iloc[:n], all_data.iloc[n:]

In [14]:
# Drop the combined frame's name and run the garbage collector.
# NOTE(review): the execution count (In [14]) shows this cell was run after
# the two cells below it -- confirm the notebook still works top to bottom.
del all_data
gc.collect()

20

In [11]:
# Report the row and column counts of the two frames after the split.
print(train.shape,test.shape)

(7377418, 36) (2556790, 36)


In [12]:
# Remove 'id' from train and 'target' from test.
# The axis is given via the `columns=` keyword: passing it positionally, as in
# drop(labels, 1), is no longer accepted since pandas 2.0.
train = train.drop(columns=['id'])
test = test.drop(columns=['target'])

In [15]:
# Path is relative to the current working directory; the row index is written
# as the first CSV column because no index argument is passed.
train.to_csv("CompleteDataSet/FinalEncodedTrain.csv") #Saving file to local

In [16]:
# Path is relative to the current working directory; the row index is written
# as the first CSV column because no index argument is passed.
test.to_csv("CompleteDataSet/FinalEncodedTest.csv") #Saving file to local