In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold,StratifiedKFold
import re



In [2]:
import keras

Using Theano backend.


In [4]:
# Root folder of the competition data; `store` is the sub-folder this
# notebook writes its neural-network outputs into.
data_path = '/data/kaggleData/2sigma/'
store = data_path + 'nn/'

# Read the train and test frames (files named normalized_*.json) in one go.
train_df, test_df = (
    pd.read_json(data_path + json_name)
    for json_name in ('normalized_train.json', 'normalized_test.json')
)

In [5]:
# Every column of the test frame is used as a model input. The length of this
# list fixes the network's input width and its order fixes the column order of
# the matrices built from train_df / test_df.
# NOTE(review): test_df.listing_id is read further down, so `listing_id`
# appears to be among these inputs - confirm that is intended.
features = test_df.columns.tolist()

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from keras import optimizers
from keras.callbacks import EarlyStopping

In [7]:
def nn_model(features,num_classes=3,lr=0.1):
    """Build and compile a small fully connected softmax classifier.

    Layout: Dense(64, softplus) -> Dropout(0.2) -> Dense(16, softplus)
    -> Dropout(0.1) -> Dense(num_classes, softmax). Every dense layer uses
    the 'he_normal' initializer; the two hidden layers additionally carry an
    L2 kernel penalty of 0.000025.

    Parameters
    ----------
    features : sized collection
        Only ``len(features)`` is used; it sets the input width.
    num_classes : int
        Number of units in the softmax output layer.
    lr : float
        Accepted but NOT used: the Adadelta optimizer below is always
        created with ``lr=1`` regardless of this argument.

    Returns
    -------
    The compiled ``Sequential`` model (sparse categorical cross-entropy
    loss, accuracy metric).
    """
    l2_strength = 0.000025
    # (units, dropout rate applied right after the layer)
    hidden_plan = [(64, 0.2), (16, 0.1)]

    net = Sequential()
    for position, (width, drop_rate) in enumerate(hidden_plan):
        dense_kwargs = dict(activation='softplus',
                            kernel_initializer='he_normal',
                            kernel_regularizer=l2(l2_strength))
        if position == 0:
            # Only the first layer declares the input width.
            dense_kwargs['input_shape'] = (len(features),)
        net.add(Dense(width, **dense_kwargs))
        net.add(Dropout(drop_rate))

    net.add(Dense(units=num_classes,
                  activation='softmax',
                  kernel_initializer='he_normal'))

    net.compile(loss='sparse_categorical_crossentropy',
                optimizer=optimizers.Adadelta(lr=1),
                metrics=['accuracy'])
    return net

In [8]:
# Prepare for training: encode the interest level as an integer class id.
target_num_map = {'high': 0, 'medium': 1, 'low': 2}

interest_codes = train_df['interest_level'].apply(lambda level: target_num_map[level])
train_y = np.array(interest_codes)

# Five shuffled, stratified folds over the encoded labels, with a fixed seed.
KF = StratifiedKFold(train_y, n_folds=5, shuffle=True, random_state=2333)

In [9]:
# Cross-validate the network over the folds in KF. The outer loop allows
# several batch sizes to be tried; only 64 is listed at the moment.
for batch_size in [64]:

    i = 0            # fold counter: 0-based in the file name, 1-based in the log
    cv_scores = []   # validation log-loss of each fold
    cv_result = []   # object returned by model.fit() for each fold
    for dev_index, val_index in KF:
        dev_set, val_set = train_df.iloc[dev_index, :], train_df.iloc[val_index, :]
        # .values replaces the deprecated DataFrame.as_matrix()
        dev_X, val_X = dev_set[features].values, val_set[features].values
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        # Not passed to fit() right now: it only takes effect if the
        # commented-out validation_data / callbacks arguments are restored.
        early_stopping = EarlyStopping(monitor='val_loss', patience=20)

        # Reset numpy's RNG to the same seed before building each fold's model.
        seed = 0
        np.random.seed(seed)
        model = nn_model(features, lr=0.1)
        history = model.fit(dev_X, dev_y, epochs=40, batch_size=batch_size, verbose=2
          #,validation_data=[val_X, val_y], callbacks=[early_stopping]
                            )
        preds = model.predict_proba(val_X)

        # Save the out-of-fold predictions for future use. The `with` block
        # closes the file even if pickling raises.
        pickl_file = store+'nn-5fold-out-'+str(i)+'.pickle'
        with open(pickl_file, 'wb') as fileObject:
            pickle.dump(preds, fileObject)

        lls = log_loss(val_y, preds)
        cv_scores.append(lls)
        cv_result.append(history)
        i += 1
        # Single-argument print(...) produces the same output on Python 2
        # and is also valid on Python 3.
        print('the cv_score for the '+str(i)+' turn is:')
        print(lls)

    print(np.mean(cv_scores))

Epoch 1/40
4s - loss: 0.7134 - acc: 0.6990
Epoch 2/40
4s - loss: 0.6346 - acc: 0.7194
Epoch 3/40
4s - loss: 0.6253 - acc: 0.7216
Epoch 4/40
4s - loss: 0.6189 - acc: 0.7250
Epoch 5/40
4s - loss: 0.6157 - acc: 0.7267
Epoch 6/40
4s - loss: 0.6124 - acc: 0.7277
Epoch 7/40
4s - loss: 0.6102 - acc: 0.7300
Epoch 8/40
4s - loss: 0.6081 - acc: 0.7293
Epoch 9/40
4s - loss: 0.6061 - acc: 0.7323
Epoch 10/40
4s - loss: 0.6015 - acc: 0.7329
Epoch 11/40
4s - loss: 0.6038 - acc: 0.7333
Epoch 12/40
4s - loss: 0.6005 - acc: 0.7349
Epoch 13/40
4s - loss: 0.5992 - acc: 0.7363
Epoch 14/40
4s - loss: 0.5978 - acc: 0.7350
Epoch 15/40
4s - loss: 0.5955 - acc: 0.7374
Epoch 16/40
4s - loss: 0.5955 - acc: 0.7375
Epoch 17/40
4s - loss: 0.5929 - acc: 0.7385
Epoch 18/40
4s - loss: 0.5922 - acc: 0.7379
Epoch 19/40
4s - loss: 0.5917 - acc: 0.7392
Epoch 20/40
5s - loss: 0.5912 - acc: 0.7379
Epoch 21/40
4s - loss: 0.5904 - acc: 0.7376
Epoch 22/40
4s - loss: 0.5888 - acc: 0.7405
Epoch 23/40
3s - loss: 0.5879 - acc: 0.74

In [10]:
"""trainX testX for et and rf """
# Refit the network on every training row and predict the test set.
# .values replaces the deprecated DataFrame.as_matrix().
train_X, test_X = train_df[features].values, test_df[features].values

seed = 0
np.random.seed(seed)
model = nn_model(features, lr=0.1)
history = model.fit(train_X, train_y, epochs=40, batch_size=64, verbose=2)

preds = model.predict_proba(test_X)

out_df = pd.DataFrame(preds)
# Column k of `preds` is the probability of class id k, so label the columns
# by ordering the names in target_num_map by their id
# (currently "high", "medium", "low").
out_df.columns = sorted(target_num_map, key=target_num_map.get)
out_df["listing_id"] = test_df.listing_id.values
out_df.to_json(store+'nn-bulk-out.json')

Epoch 1/40
6s - loss: 0.6994 - acc: 0.7010
Epoch 2/40
4s - loss: 0.6329 - acc: 0.7182
Epoch 3/40
4s - loss: 0.6251 - acc: 0.7214
Epoch 4/40
5s - loss: 0.6190 - acc: 0.7238
Epoch 5/40
5s - loss: 0.6163 - acc: 0.7250
Epoch 6/40
5s - loss: 0.6110 - acc: 0.7278
Epoch 7/40
6s - loss: 0.6092 - acc: 0.7299
Epoch 8/40
5s - loss: 0.6060 - acc: 0.7316
Epoch 9/40
5s - loss: 0.6043 - acc: 0.7299
Epoch 10/40
6s - loss: 0.6025 - acc: 0.7335
Epoch 11/40
5s - loss: 0.6008 - acc: 0.7334
Epoch 12/40
5s - loss: 0.6003 - acc: 0.7347
Epoch 13/40
6s - loss: 0.5980 - acc: 0.7353
Epoch 14/40
5s - loss: 0.5962 - acc: 0.7374
Epoch 15/40
6s - loss: 0.5956 - acc: 0.7367
Epoch 16/40
5s - loss: 0.5949 - acc: 0.7358
Epoch 17/40
5s - loss: 0.5936 - acc: 0.7355
Epoch 18/40
6s - loss: 0.5933 - acc: 0.7376
Epoch 19/40
5s - loss: 0.5932 - acc: 0.7372
Epoch 20/40
5s - loss: 0.5905 - acc: 0.7385
Epoch 21/40
5s - loss: 0.5895 - acc: 0.7393
Epoch 22/40
5s - loss: 0.5905 - acc: 0.7377
Epoch 23/40
5s - loss: 0.5879 - acc: 0.73