In [30]:
from sklearn import tree, metrics, ensemble
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Show the kernel's current working directory (sanity check before the
# data files are read in the following cell).
import os
print(os.getcwd())

/Users/isabelniu/Documents/HKUST-20Spring/ML/Project


In [57]:
# Read the raw KKBOX tables: play events (train/test), song metadata and
# member profiles.
print('start reading...')

# Data directory. Set the KKBOX_DATA_DIR environment variable to point at a
# different copy of the dataset; otherwise the original location is used.
prefix = os.environ.get(
    "KKBOX_DATA_DIR",
    "/Users/isabelniu/Documents/HKUST-20Spring/ML/Project/DataSet/KKBOX/",
)
# Later cells build file names as `prefix + name`, so make sure the value
# always ends with a path separator.
prefix = os.path.join(prefix, "")

train = pd.read_csv(prefix + "train.csv")
test = pd.read_csv(prefix + "test.csv")
songs = pd.read_csv(prefix + "songs.csv")
#song_extra = pd.read_csv(prefix + "song_extra_info.csv")
members = pd.read_csv(prefix + "members.csv")

print('done reading')

start reading...
done reading


In [58]:
# Enrich both event tables with song metadata (joined on song_id) and then
# with member profiles (joined on msno). Left joins keep every event row even
# when the lookup table has no matching entry.
print('start merging...')
train = train.merge(songs, how='left', on='song_id')
test = test.merge(songs, how='left', on='song_id')
del songs  # lookup table is no longer needed once joined

train = train.merge(members, how='left', on='msno')
test = test.merge(members, how='left', on='msno')
del members  # lookup table is no longer needed once joined

print('done merging')

start merging...
done merging


In [59]:
# (rows, columns) of the training frame after the song/member merges.
train.shape

(7377418, 18)

In [62]:
# (rows, columns) of the test frame after the song/member merges.
test.shape

(2556790, 18)

In [63]:
# Column labels available in the merged training frame.
train.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'genre_ids', 'artist_name',
       'composer', 'lyricist', 'language', 'city', 'bd', 'gender',
       'registered_via', 'registration_init_time', 'expiration_date'],
      dtype='object')

In [64]:
# Number of missing values in each column of the merged training frame.
train.isnull().sum()
# Unused alternative that would report the share of missing values in percent:
#df.isnull().sum()/df.isnull().count()*100

msno                            0
song_id                         0
source_system_tab           24849
source_screen_name         414804
source_type                 21539
target                          0
song_length                   114
genre_ids                  118455
artist_name                   114
composer                  1675706
lyricist                  3178798
language                      150
city                            0
bd                              0
gender                    2961479
registered_via                  0
registration_init_time          0
expiration_date                 0
dtype: int64

In [65]:
# Number of missing values in each column of the merged test frame.
test.isnull().sum()

id                              0
msno                            0
song_id                         0
source_system_tab            8442
source_screen_name         162883
source_type                  7297
song_length                    25
genre_ids                   42110
artist_name                    25
composer                   619304
lyricist                  1224744
language                       42
city                            0
bd                              0
gender                    1052224
registered_via                  0
registration_init_time          0
expiration_date                 0
dtype: int64

In [66]:
# Column dtypes of the merged training frame, before any date parsing or encoding.
train.dtypes

msno                       object
song_id                    object
source_system_tab          object
source_screen_name         object
source_type                object
target                      int64
song_length               float64
genre_ids                  object
artist_name                object
composer                   object
lyricist                   object
language                  float64
city                        int64
bd                          int64
gender                     object
registered_via              int64
registration_init_time      int64
expiration_date             int64
dtype: object

In [67]:
#set NA of objects to be 'unknown'
#set NA of integers and floats to be 0
#for i in train.select_dtypes(include=['object']).columns:
#    train[i][train[i].isnull()] = 'unknown'
#train = train.fillna(value=0)

In [68]:
#set NA of objects to be 'unknown'
#set NA of integers and floats to be 0
#for i in test.select_dtypes(include=['object']).columns:
#    test[i][test[i].isnull()] = 'unknown'
#test = test.fillna(value=0)
#print('NA filled.')

In [69]:
# Parse the two YYYYMMDD date columns (registration_init_time and
# expiration_date) into datetimes and derive <column>_year, <column>_month and
# <column>_day from each, for both the training and the test frame.
#
# Parsing uses the default errors='raise': a value that does not match the
# format stops the cell immediately instead of being passed through unparsed
# and failing later on the `.dt` accessor.
for frame in (train, test):
    for date_col in ('registration_init_time', 'expiration_date'):
        parsed = pd.to_datetime(frame[date_col], format='%Y%m%d')
        frame[date_col] = parsed
        frame[date_col + '_year'] = parsed.dt.year
        frame[date_col + '_month'] = parsed.dt.month
        frame[date_col + '_day'] = parsed.dt.day

In [70]:
# convert date items and non-numeric items into categorical values
#train['registration_init_time'] = train['registration_init_time'].astype('category')
#train['expiration_date'] = train['expiration_date'].astype('category')
#for col in train.select_dtypes(include=['object']).columns:
#    train[col] = train[col].astype('category')

In [71]:
#do the same for test data
#test['registration_init_time'] = test['registration_init_time'].astype('category')
#test['expiration_date'] = test['expiration_date'].astype('category')
#for col in test.select_dtypes(include=['object']).columns:
#    test[col] = test[col].astype('category')

In [72]:
# Column dtypes of the training frame after the date columns were parsed and
# the year/month/day columns were added.
train.dtypes

msno                                    object
song_id                                 object
source_system_tab                       object
source_screen_name                      object
source_type                             object
target                                   int64
song_length                            float64
genre_ids                               object
artist_name                             object
composer                                object
lyricist                                object
language                               float64
city                                     int64
bd                                       int64
gender                                  object
registered_via                           int64
registration_init_time          datetime64[ns]
expiration_date                 datetime64[ns]
registration_init_time_year              int64
registration_init_time_month             int64
registration_init_time_day               int64
expiration_da

In [73]:
# Column dtypes of the test frame after the date columns were parsed and the
# year/month/day columns were added.
test.dtypes

id                                       int64
msno                                    object
song_id                                 object
source_system_tab                       object
source_screen_name                      object
source_type                             object
song_length                            float64
genre_ids                               object
artist_name                             object
composer                                object
lyricist                                object
language                               float64
city                                     int64
bd                                       int64
gender                                  object
registered_via                           int64
registration_init_time          datetime64[ns]
expiration_date                 datetime64[ns]
registration_init_time_year              int64
registration_init_time_month             int64
registration_init_time_day               int64
expiration_da

In [74]:
# Stack the training rows on top of the test rows so the encoders below are
# fitted on the values of both sets at once. Columns that exist in only one
# frame ('target' in train, 'id' in test) are NaN for the other frame's rows.
# NOTE(review): the row index is not reset here, so index labels from train
# and test repeat in all_data; later cells rely on positional slicing only.
all_data = pd.concat([train, test])

In [75]:
# Number of missing values in each column of the combined train + test frame.
all_data.isnull().sum()

artist_name                         139
bd                                    0
city                                  0
composer                        2295010
expiration_date                       0
expiration_date_day                   0
expiration_date_month                 0
expiration_date_year                  0
gender                          4013703
genre_ids                        160565
id                              7377418
language                            192
lyricist                        4403542
msno                                  0
registered_via                        0
registration_init_time                0
registration_init_time_day            0
registration_init_time_month          0
registration_init_time_year           0
song_id                               0
song_length                         139
source_screen_name               577687
source_system_tab                 33291
source_type                       28836
target                          2556790


In [76]:
# encoding for categorical values on training and test data set
#for col in all_data.select_dtypes(include=['category']).columns:
#    all_data[col] = all_data[col].cat.codes
#train = train.drop(['expiration_date', 'lyricist'], 1)

In [77]:
# Replace the listed columns with integer codes. A single LabelEncoder is
# re-fitted for every column, so no per-column mapping is kept afterwards.
enc = LabelEncoder()

# Text-valued columns: missing entries become the literal string 'nan'.
string_cols = (
    'msno', 'song_id', 'source_screen_name',
    'source_system_tab', 'source_type', 'genre_ids',
    'artist_name', 'composer', 'lyricist', 'gender',
)
# Numeric-valued columns: missing entries become the sentinel -2.
numeric_cols = ('language', 'city', 'registered_via', 'song_length')

for columns, placeholder in ((string_cols, 'nan'), (numeric_cols, -2)):
    for col in columns:
        filled = all_data[col].fillna(placeholder)
        all_data[col] = enc.fit_transform(filled)
    

In [78]:
# Column dtypes of the combined frame after label encoding.
# Optional preview of the first rows, left disabled:
#display(all_data[:10])
all_data.dtypes

artist_name                              int64
bd                                       int64
city                                     int64
composer                                 int64
expiration_date                 datetime64[ns]
expiration_date_day                      int64
expiration_date_month                    int64
expiration_date_year                     int64
gender                                   int64
genre_ids                                int64
id                                     float64
language                                 int64
lyricist                                 int64
msno                                     int64
registered_via                           int64
registration_init_time          datetime64[ns]
registration_init_time_day               int64
registration_init_time_month             int64
registration_init_time_year              int64
song_id                                  int64
song_length                              int64
source_screen

In [1]:
# Missing-value counts of the combined frame.
# NOTE(review): the recorded output of this cell is a NameError and its
# execution count is 1, so it appears to have been run on a fresh kernel
# before all_data was built. Re-run the notebook top to bottom.
all_data.isnull().sum()

NameError: name 'all_data' is not defined

In [84]:
# Undo the concatenation by position: the first len(train) rows of all_data
# are the training rows, everything after them belongs to the test set.
n = train.shape[0]
train_data = all_data.iloc[:n]
test_data = all_data.iloc[n:]

In [85]:
# Shapes of the two slices taken from all_data (train rows, then test rows).
print(train_data.shape,test_data.shape)

(7377418, 25) (2556790, 25)


In [86]:
# Remove the column each slice inherited from the other set during the
# concatenation: 'id' is only defined for test rows, 'target' only for
# training rows. The columns are named through the `columns=` keyword because
# passing the axis as a second positional argument is rejected by current
# pandas releases.
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['target'])


In [87]:
# Missing-value counts per column for the encoded training and test frames,
# printed one after the other.
print(train_data.isnull().sum(),test_data.isnull().sum())

artist_name                     0
bd                              0
city                            0
composer                        0
expiration_date                 0
expiration_date_day             0
expiration_date_month           0
expiration_date_year            0
gender                          0
genre_ids                       0
language                        0
lyricist                        0
msno                            0
registered_via                  0
registration_init_time          0
registration_init_time_day      0
registration_init_time_month    0
registration_init_time_year     0
song_id                         0
song_length                     0
source_screen_name              0
source_system_tab               0
source_type                     0
target                          0
dtype: int64 artist_name                     0
bd                              0
city                            0
composer                        0
expiration_date                 0
e

In [88]:
# Persist the encoded frames as csv files next to the raw data
# (row index included, as with a plain to_csv call).
for frame, filename in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(prefix + filename)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: L2-regularised logistic regression (liblinear solver).
#
# The model is fitted on train_data, the label-encoded frame, because `train`
# still contains raw string and datetime columns that the estimator cannot
# consume. The label, the test-only 'id' column and the two raw datetime
# columns are excluded from the features; the dates are still represented by
# their *_year / *_month / *_day columns.
non_feature_cols = ['target', 'id', 'registration_init_time', 'expiration_date']
X_train = train_data.drop(columns=non_feature_cols, errors='ignore')
# 'target' became float when train and test were concatenated; cast it back
# to integer class labels.
y_train = train_data['target'].astype(int)

model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', max_iter=100)
model.fit(X_train, y_train)

In [None]:
# Hard class predictions (0.0 / 1.0) for every test row.
#
# Features come from test_data, the label-encoded frame; `test` still holds
# raw string and datetime columns. The same non-feature columns are removed
# as for the training features so both column sets line up.
X_test = test_data.drop(
    columns=['target', 'id', 'registration_init_time', 'expiration_date'],
    errors='ignore',
)
# Kept as a float array of length len(test), matching the previous
# zeros-plus-predictions result.
Toprediction = model.predict(X_test).astype(float)

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve and AUC of the fitted model.
#
# The test frame has no 'target' column, so the curve is computed on the
# labelled training rows. This is an IN-SAMPLE figure and therefore
# optimistic; for an honest estimate, hold out part of train_data before
# fitting and score that part instead.
X_eval = train_data.drop(
    columns=['target', 'id', 'registration_init_time', 'expiration_date'],
    errors='ignore',
)
y_eval = train_data['target'].astype(int)

# A ROC curve needs a continuous score, so use the predicted probability of
# class 1 rather than the hard 0/1 labels returned by predict().
predictions_roc = model.predict_proba(X_eval)[:, 1]

FP, TP, thresholds = roc_curve(y_eval, predictions_roc)
roc_auc = auc(FP, TP)
print('in-sample ROC AUC:', roc_auc)

fig, ax = plt.subplots()
ax.plot(FP, TP, label='logistic regression (AUC = %.3f)' % roc_auc)
ax.plot([0, 1], [0, 1], linestyle='--', label='chance')
ax.set(xlabel='False positive rate', ylabel='True positive rate',
       title='ROC curve (in-sample)')
ax.legend();

In [None]:
# Column labels of the merged test frame (it has 'id' and no 'target').
test.columns

In [65]:
#train1=train
#train1.shape

(7377418, 24)

In [83]:
#standardize all numeric columns except target first
#x=train1.loc[:,train1.columns != 'target']
#y=train1.loc[:,'target']
#for col in x.select_dtypes(exclude=['category']).columns:
#    x[col] = StandardScaler().fit_transform(x[[col]])

In [30]:
#implementing PCA to variables except target
#pca = PCA(n_components=2)
#pca.fit(train[train.columns[train.columns != 'target']])

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [None]:
print('Making predictions')

print