In [1]:
import pickle as pkl
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from sklearn.decomposition import PCA

In [2]:
def read_from_pkl(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only use this on
    files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)

In [3]:
# Load both training feature sets. Each pickle is turned into a DataFrame
# and transposed so that every sample becomes one row.
train_data1 = pd.DataFrame(read_from_pkl('./data/train_19_features.pkl')).T
train_data2 = pd.DataFrame(read_from_pkl('./data/train_13_features.pkl')).T

# The first column of the 19-feature frame is used as the target.
labels = train_data1.iloc[:, 0]

In [4]:
# Preview the first five rows of the 19-feature training frame.
train_data1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
train-0,1,3595427,1040456,57,5,57,2,2,0,59,7,57,0.0,7.92515,0.0,0.0,114,59,0.0
train-1,1,3595427,3799692,57,5,57,12,4,12,69,9,69,0.825197,87.3779,0.0298507,0.132453,684,80,0.340852
train-2,1,3595427,3695000,57,5,57,2,2,0,59,7,57,0.0,7.92515,0.0,0.0,114,59,0.0
train-3,1,3595427,1629019,57,5,57,5942,351,5823,5999,356,5880,0.220624,42024.7,0.000166722,0.00706983,338694,6038,0.010989
train-4,1,3595427,983493,57,5,57,1,1,0,58,6,57,0.0,3.875,0.0,0.0,57,57,0.0


In [5]:
# Drop the leading non-feature columns from each frame: column 0 of the
# 19-feature frame is the label; the other dropped columns look like the
# pair identifiers -- TODO confirm against the feature extraction code.
train1 = train_data1.iloc[:, 3:]
train2 = train_data2.iloc[:, 2:]

# Put both feature sets side by side; ignore_index renumbers the columns 0..N-1.
samples = pd.concat(
    [train1, train2],
    axis=1,
    ignore_index=True,
)
samples.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
train-0,57,5,57,2,2,0,59,7,57,0.0,...,0.504525,1.70529,6355.5,18.4085,0.252094,0.322956,0.357477,0.0,230.316,0.0177552
train-1,57,5,57,12,4,12,69,9,69,0.825197,...,0.269746,0.915855,5785.5,9.28548,0.415673,0.0553785,0.0912004,0.264945,1381.89,0.108107
train-2,57,5,57,2,2,0,59,7,57,0.0,...,0.505882,1.73502,4902.0,18.4105,0.252614,0.32299,0.357548,0.0,230.316,0.017849
train-3,57,5,57,5942,351,5823,5999,356,5880,0.220624,...,0.00488729,0.025359,314247.0,0.105516,41.0383,0.00011231,0.0178255,0.0839517,684268.0,18.3549
train-4,57,5,57,1,1,0,58,6,57,0.0,...,1.0,3.37639,3249.0,36.3094,0.245246,0.637007,0.637007,0.0,115.158,0.0175439


In [27]:
# Sanity check: (rows, columns) of the combined feature matrix.
samples.shape

(360000, 29)

In [28]:
# Make every feature numeric first, then turn the first nine columns
# (which hold whole numbers in the preview above) back into integers.
integer_columns = [0, 1, 2, 3, 4, 5, 6, 7, 8]
samples = samples.astype(float)
samples[integer_columns] = samples[integer_columns].astype(int)

# Labels are class ids, so store them as integers as well.
labels = labels.astype(int)

print('x:', samples.head(3))
print('y:', labels.head(3))

x:          0   1   2   3   4   5   6   7   8         9   ...        19  \
train-0  57   5  57   2   2   0  59   7  57  0.000000  ...  0.504525   
train-1  57   5  57  12   4  12  69   9  69  0.825197  ...  0.269746   
train-2  57   5  57   2   2   0  59   7  57  0.000000  ...  0.505882   

               20      21         22        23        24        25        26  \
train-0  1.705294  6355.5  18.408468  0.252094  0.322956  0.357477  0.000000   
train-1  0.915855  5785.5   9.285476  0.415673  0.055379  0.091200  0.264945   
train-2  1.735024  4902.0  18.410456  0.252614  0.322990  0.357548  0.000000   

                  27        28  
train-0   230.315789  0.017755  
train-1  1381.894737  0.108107  
train-2   230.315789  0.017849  

[3 rows x 29 columns]
y: train-0    1
train-1    1
train-2    1
Name: 0, dtype: int64


In [29]:
# split train & val
# Hold out 20% of the samples for validation. A fixed random_state makes the
# shuffle deterministic, so the split (and every metric computed from it) is
# the same after Restart Kernel -> Run All.
train_x, val_x, train_y, val_y = train_test_split(
    samples,
    labels,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Report the size of each split.
for shape_label, split_frame in (('train shape:', train_x), ('val shape:', val_x)):
    print(shape_label, split_frame.shape)

train shape: (288000, 29)
val shape: (72000, 29)


In [31]:
# standardize
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no validation statistics leak in.
# NOTE: this overwrites train_x / val_x, so re-running the cell without
# re-running the split would refit the scaler on already-scaled data.
scaler = StandardScaler().fit(train_x)
train_x, val_x = scaler.transform(train_x), scaler.transform(val_x)

# Show one standardized row from each split.
print(train_x[0])
print(val_x[0])

[-0.11164019 -0.14939821 -0.11212663 -0.06085698 -0.27140262 -0.05808053
 -0.10015842 -0.31003598 -0.09779108 -0.10904631 -0.03651876 -0.20830327
 -0.11213555 -0.03839756 -0.146      -0.06205586  0.19596413 -0.02681889
  0.03437082  0.55817056  0.08467001 -0.05026432 -0.01877235 -0.03352122
  0.00346433  0.03934303 -0.24867892 -0.0294674  -0.02739651]
[ 0.01868876  0.55001582  0.01767562 -0.0610257  -0.2827121  -0.05808053
 -0.04878763  0.02108129 -0.04647439 -0.11064301 -0.0365052  -0.34466983
 -0.19960538 -0.03836366 -0.14029504 -0.06209096  0.00666832  0.00802315
 -0.2368662  -0.42188529 -0.06703174  0.11466129 -0.01681455 -0.03397333
 -0.22056081 -0.48457407 -0.24867892 -0.02948031 -0.02741443]


In [13]:
# pca
# Fit a 5-component PCA on the standardized training features and report
# the variance share and singular value of each component.
pca = PCA(n_components=5).fit(train_x)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

[0.35696341 0.1713236  0.14767233 0.11179041 0.07933521]
[1282.53163168  888.51513545  824.90854689  717.72572937  604.62934056]


In [15]:
# Show the fitted principal axes (one row per component).
pca.components_

array([[ 0.12983203,  0.1071521 ,  0.12935959,  0.35038673,  0.24462974,
         0.34955101,  0.37396373,  0.265978  ,  0.37298887,  0.18147165,
         0.2724716 ,  0.02420308,  0.06857808,  0.27859793,  0.29857471,
         0.14556333],
       [-0.31595328, -0.17504413, -0.31573334, -0.0907299 , -0.008587  ,
        -0.09083099, -0.20732993, -0.09270781, -0.20732991,  0.43074322,
         0.29885782,  0.23340934,  0.2801096 ,  0.29036373,  0.07365915,
         0.40148436],
       [ 0.47219232,  0.40048368,  0.47014562, -0.30053705, -0.1220387 ,
        -0.30059687, -0.09237917,  0.08826466, -0.09322331,  0.22852966,
         0.0018651 ,  0.17135453,  0.21178961, -0.00291537,  0.112617  ,
         0.18942851],
       [ 0.11606904, -0.05558918,  0.11802052, -0.06629277, -0.30802219,
        -0.06301887, -0.01571504, -0.29628895, -0.0119295 ,  0.09843534,
         0.3061616 , -0.53637231, -0.51691563,  0.29925095,  0.09296438,
         0.14688024],
       [-0.13443292,  0.2138171 , -0

In [16]:
# Keep the PCA projections in their own variables instead of overwriting
# train_x / val_x. The network below is declared with input_dim=29 and the
# test matrix is scored without the PCA step, so replacing the 29-column
# arrays with 5-column projections makes model.fit fail on a fresh
# top-to-bottom run.
train_x_pca = pca.transform(train_x)
val_x_pca = pca.transform(val_x)

In [48]:
# modeling
# Feed-forward binary classifier: two 128-unit sigmoid hidden layers, each
# followed by 50% dropout, and a single sigmoid output unit.
model = Sequential([
    Dense(128, input_dim=29, activation='sigmoid'),
    Dropout(0.5),
    Dense(128, activation='sigmoid'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 128)               3840      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 129       
Total params: 20,481
Trainable params: 20,481
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Train for 150 epochs with mini-batches of 128 samples.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=128,
    epochs=150,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x7fbbcefaa880>

In [50]:
# Score the held-out split silently; evaluate returns the loss followed by
# the compiled metrics (accuracy only here).
val_scores = model.evaluate(val_x, val_y, verbose=False)
loss, acc = val_scores
print('val acc:', acc)

val acc: 0.9991388916969299


In [51]:
# prediction
# Load both test feature sets the same way as the training data: build a
# DataFrame from each pickle and transpose it so every sample is one row.
test_data1 = pd.DataFrame(read_from_pkl('./data/test_19_features.pkl')).T
test_data2 = pd.DataFrame(read_from_pkl('./data/test_13_features.pkl')).T

In [52]:
# Preview the first five rows of the 19-feature test frame.
test_data1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
1,,3563811,3600160,23,3,21,29,29,0,52,32,21,0.0,47.7849,0.0,0.0,667,27,0.0
2,,2052043,1401960,74,13,71,9,9,0,83,22,71,0.0,49.5286,0.0,0.0,666,31,0.0
3,,4517994,1690636,255,80,205,17,17,0,272,97,205,0.462508,356.919,0.0111524,0.0508183,4335,356,0.00462379
4,,1660006,4349447,511,32,506,36,36,0,547,68,506,0.178805,1223.15,0.00366972,0.0148185,18396,1847,2.80927e-05
5,,581111,1882617,21,5,18,46,46,0,67,51,18,0.0,75.4785,0.0,0.0,966,54,0.0


In [53]:
# Preview the first five rows of the 13-feature test frame.
test_data2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
1,3563811,3600160,0.0,0.166192,3.06738e-05,0.00059617,0.0113831,1013070.0,0.0234397,0.139785,0.00110861,0.00475634,0,7611.81,0.000973567
2,2052043,1401960,0.0,0.471572,0.000228642,0.00788548,0.0599194,191422.0,0.00362802,0.0680789,5.10988e-05,0.000454739,0,9233.75,0.00214473
3,4517994,1690636,0.000692042,3.45737,0.000186637,0.0242947,0.146893,1569870.0,0.117964,0.206271,0.000554185,0.0069107,0,12038.8,0.00319814
4,1660006,4349447,0.000108719,8.61822,7.29712e-06,0.00315172,0.082303,18462300.0,0.321158,0.336062,0.00063456,0.0142708,0,141540.0,0.000248849
5,581111,1882617,0.0,0.183293,9.72129e-07,2.04037e-05,0.0078426,1158810.0,0.00601201,0.235388,0.000329829,0.0063242,0,21287.8,4.65595e-05


In [54]:
# Use the same column slices as for the training frames so the test
# feature matrix has the same column layout as `samples`.
test1 = test_data1.iloc[:, 3:]
test2 = test_data2.iloc[:, 2:]

# Side-by-side concatenation; ignore_index renumbers the columns 0..N-1.
test_x = pd.concat(
    [test1, test2],
    axis=1,
    ignore_index=True,
)
test_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
1,23,3,21,29,29,0,52,32,21,0.0,...,0.00059617,0.0113831,1013070.0,0.0234397,0.139785,0.00110861,0.00475634,0,7611.81,0.000973567
2,74,13,71,9,9,0,83,22,71,0.0,...,0.00788548,0.0599194,191422.0,0.00362802,0.0680789,5.10988e-05,0.000454739,0,9233.75,0.00214473
3,255,80,205,17,17,0,272,97,205,0.462508,...,0.0242947,0.146893,1569870.0,0.117964,0.206271,0.000554185,0.0069107,0,12038.8,0.00319814
4,511,32,506,36,36,0,547,68,506,0.178805,...,0.00315172,0.082303,18462300.0,0.321158,0.336062,0.00063456,0.0142708,0,141540.0,0.000248849
5,21,5,18,46,46,0,67,51,18,0.0,...,2.04037e-05,0.0078426,1158810.0,0.00601201,0.235388,0.000329829,0.0063242,0,21287.8,4.65595e-05


In [55]:
# Standardize the test features with the scaler fitted on the training
# split (transform only, no refit on test data).
# NOTE: this overwrites test_x, so re-running the cell scales it twice.
test_x = scaler.transform(test_x)
# PCA stays disabled here: the network is built with input_dim=29, so it
# expects all 29 standardized features rather than the 5 PCA components.
#test_x = pca.transform(test_x)
print('test shape:',test_x.shape)
print(test_x[0])

test shape: (2000, 29)
[-0.12715788 -0.46869592 -0.12373016 -0.05984464 -0.2035457  -0.05808053
 -0.10536151 -0.40640593 -0.10237848 -0.11638881 -0.03653261 -0.36373847
 -0.39446366 -0.03841094 -0.14905209 -0.06209322 -0.27743236 -0.03015115
 -0.23095686 -0.42722668 -0.40587359 -0.05027571 -0.01976704 -0.03375037
 -0.2929221  -0.50310229 -0.24867892 -0.02946998 -0.02741194]


In [56]:
# Score the standardized test matrix; the final layer is a single sigmoid
# unit, so each row gets one value between 0 and 1.
test_pred = model.predict(test_x)

In [57]:
# Inspect the raw predictions.
test_pred

array([[8.5284926e-13],
       [2.9971219e-15],
       [2.7336865e-12],
       ...,
       [1.8116693e-17],
       [3.3041207e-17],
       [3.3947151e-18]], dtype=float32)

In [43]:
# Pair every prediction with a 1-based running id, in the order the test
# rows were scored. Each prediction row holds a single value, hence [0].
results = [
    (row_id, prediction[0])
    for row_id, prediction in enumerate(test_pred, start=1)
]

In [44]:
def save_as_csv(result, path):
    """Write prediction rows to a CSV file with an ``id,Predicted`` header.

    Parameters
    ----------
    result : iterable of row sequences
        Rows to write below the header, e.g. ``(id, predicted)`` tuples.
    path : str
        Destination file. It is overwritten if it exists; the parent
        directory must already exist.
    """
    headers = ['id', 'Predicted']
    # newline='' is required by the csv module: the writer emits its own
    # line terminators, and without it Windows turns them into '\r\r\n',
    # which shows up as a blank line after every row.
    with open(path, 'w', encoding='utf8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)
    print(path, 'saved...')

In [45]:
# Write the (id, prediction) pairs to disk. save_as_csv opens the path
# directly and does not create directories, so ./results must already exist.
save_as_csv(results,'./results/ann_13_features_prediction_dropout_1.csv')

./results/ann_13_features_prediction_dropout_1.csv saved...
