In [1]:
import os
# Set before Keras/TensorFlow is imported further down; '-1' exposes no CUDA
# device to this process (presumably to force CPU execution — confirm intent).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [1]:
from zipfile import ZipFile
import pandas as pd
import numpy as np

In [2]:
import keras
from keras.layers import Dense, Dropout, Flatten, Input, LSTM, Embedding, Reshape
from keras import optimizers
from keras.models import Model, Sequential
from keras.layers.merge import Concatenate

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Read train.csv straight out of its zip archive, then keep an independent
# copy of the regression target so it survives the later `del train_df`.
with ZipFile('../data/train.csv.zip') as archive, archive.open('train.csv') as csv_file:
    train_df = pd.read_csv(csv_file)
y = train_df['deal_probability'].copy()

In [4]:
# Read test.csv straight out of its zip archive.
with ZipFile('../data/test.csv.zip') as archive, archive.open('test.csv') as csv_file:
    test_df = pd.read_csv(csv_file)

In [5]:
# Stack train on top of test so the encoders and scalers are fitted on both
# at once; the first len(train_df) rows of joint_df are the training rows.
joint_df = pd.concat([train_df, test_df])

In [6]:
del train_df, test_df

In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, Imputer

In [8]:
#joint_df['region_city'] = joint_df['region'] + '_' + joint_df['city']
#del joint_df['region'], joint_df['city']
# Boolean flag: True for rows whose `image` value is missing.
joint_df['no_image'] = joint_df['image'].isnull()

In [9]:
# Cast the text-valued categorical columns to str so that missing entries
# become the literal string 'nan' rather than a float NaN.
for _col in ('category_name', 'city', 'region', 'param_1',
             'parent_category_name', 'user_type'):
    joint_df[_col] = joint_df[_col].astype(str)

In [10]:
def generate_features(df):
    """Label-encode the categorical columns and impute + scale the numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cat_feature_names`` and
        ``num_feature_names`` below.

    Returns
    -------
    cat_features : list of numpy.ndarray
        One ``(len(df), 1)`` integer-code array per categorical column, in
        the order of ``cat_feature_names``.
    cat_encoders : list of LabelEncoder
        The fitted encoder for each categorical column (same order).
    num_features : list of numpy.ndarray
        One imputed and standardised 2-D array per numeric column, in the
        order of ``num_feature_names``.
    num_encoders : list of StandardScaler
        The fitted scaler for each numeric column (same order).
    """
    cat_feature_names = ['category_name', 'region', 'city', 'param_1', 'parent_category_name', 'user_type', 'image_top_1']

    cat_features = []
    cat_encoders = []
    for name in cat_feature_names:
        encoder = LabelEncoder()
        # Always encode the string form of the column. 'image_top_1' is not
        # cast to str anywhere else in the notebook; if it holds float NaN
        # (presumably for ads without an image), NaN != NaN means that,
        # depending on the NumPy / scikit-learn version, every missing row
        # can become its own class (inflating the embedding vocabulary) or
        # the encoder can raise. As strings, all missing values collapse
        # into the single class 'nan'. Columns that are already str are
        # unaffected.
        codes = encoder.fit_transform(df[name].astype(str))
        cat_features.append(codes.reshape(-1, 1))
        cat_encoders.append(encoder)

    num_feature_names = ['price', 'item_seq_number', 'no_image']
    num_features = []
    num_encoders = []
    for name in num_feature_names:
        scaler = StandardScaler()
        imputer = Imputer()
        # scikit-learn transformers expect a 2-D (n_samples, 1) column.
        feature = imputer.fit_transform(df[name].values.reshape(-1, 1))
        num_features.append(scaler.fit_transform(feature))
        num_encoders.append(scaler)
    return cat_features, cat_encoders, num_features, num_encoders

In [11]:
cat_features, cat_encoders, num_features, num_encoders = generate_features(joint_df)

In [12]:
# Column-stack the per-feature arrays into one (n_rows, n_numeric) matrix.
# NOTE: this rebinds `num_features` from a list to an array, so re-running
# this cell on its own (without re-running generate_features) is not safe.
num_features = np.hstack(num_features)

In [13]:
num_features.shape

(2011862, 3)

In [14]:
train_len = len(y)

In [15]:
# Train rows were concatenated first, so the first train_len rows of every
# encoded categorical column belong to the training set.
train_cat_features = [x[:train_len] for x in cat_features]

In [16]:
# Everything after the first train_len rows belongs to the test set.
test_cat_features = [x[train_len:] for x in cat_features]

In [17]:
# Training rows of the stacked numeric matrix (train precedes test).
train_num_features = num_features[:train_len]

In [18]:
# Test rows of the stacked numeric matrix.
test_num_features = num_features[train_len:]

In [19]:
# Model inputs for training: one code array per categorical feature followed
# by the numeric matrix — the same order in which the model inputs are built
# (categorical inputs first, numeric input last).
train_features = train_cat_features + [train_num_features]

In [20]:
# Model inputs for prediction, in the same order as train_features.
test_features = test_cat_features + [test_num_features]

In [21]:
def cat_model(encoder):
    """Build the input branch for one label-encoded categorical feature.

    The branch is Input(1) -> Embedding -> Reshape -> Dense(20, relu).

    Parameters
    ----------
    encoder : fitted LabelEncoder
        Only ``encoder.classes_`` is read, to size the embedding table.

    Returns
    -------
    (input_tensor, output_tensor) of the branch.
    """
    # One more row than there are known classes.
    n_tokens = len(encoder.classes_) + 1
    # Embedding width: half the vocabulary size rounded up, capped at 7.
    width = int(min(np.ceil(n_tokens / 2), 7))

    inp = Input(shape=(1, ))
    embedded = Embedding(n_tokens, width)(inp)
    # Collapse the embedding output to a flat vector of length `width`.
    flat = Reshape((width,))(embedded)
    return inp, Dense(20, activation='relu')(flat)

In [22]:
def cat_models(cat_encoders):
    """Build one categorical branch per encoder via ``cat_model``.

    Returns two lists, ``(inputs, outputs)``, holding each branch's input
    tensor and output tensor in the same order as ``cat_encoders``.
    """
    branches = [cat_model(enc) for enc in cat_encoders]
    inputs = [branch_in for branch_in, _ in branches]
    outputs = [branch_out for _, branch_out in branches]
    return inputs, outputs


In [23]:
def num_model(n_features):
    """Build the numeric branch: Input(n_features) -> Dense(20, relu).

    Returns the branch's ``(input_tensor, output_tensor)``.
    """
    inp = Input(shape=(n_features,))
    hidden = Dense(20, activation='relu')
    return inp, hidden(inp)

In [24]:
cat_inputs, cat_outputs = cat_models(cat_encoders)

In [25]:
num_inputs, num_outputs = num_model(num_features.shape[1])

In [26]:
# Collect branch tensors with the categorical branches first and the numeric
# branch last; the feature lists fed to fit/predict use this same order.
inputs = cat_inputs + [num_inputs]
outputs = cat_outputs + [num_outputs]

## Concatenation

In [27]:
merge = Concatenate()(outputs)

In [28]:
# Shared regression head on top of the concatenated branch outputs.
res = Dense(100, activation='relu')(merge)
res = Dropout(0.5)(res)
#res = Dense(75, activation='relu')(res)
#res = Dropout(0.5)(res)
res = Dense(50, activation='relu')(res)
res = Dropout(0.5)(res)
# The model is compiled with binary cross-entropy, which needs predictions
# strictly inside (0, 1). A relu output can be exactly 0 or exceed 1, giving
# degenerate or undefined loss, so the output unit uses a sigmoid (as the
# one-hot model in this notebook already does).
res = Dense(1, activation='sigmoid')(res)

In [29]:
model = Model(inputs, res)

In [30]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [36]:
# Optimise binary cross-entropy with Adam; mean squared error is only
# reported as a monitoring metric, it does not drive the updates.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['mean_squared_error'])

In [37]:
batch_size = 1024
epochs = 10

In [38]:
from keras.callbacks import LearningRateScheduler

In [39]:
# Decaying schedule: lr = 0.001 / (1 + 5 * x), where x is the value Keras
# passes to the schedule function (the epoch index).
# NOTE: currently unused — the callbacks argument of the fit call is commented out.
learning_rate = LearningRateScheduler(lambda x: 0.001 / (1 + x * 5), verbose=1)

In [40]:
model.fit(train_features, y, batch_size, epochs, validation_split=0.1)#, callbacks=[learning_rate])

Train on 1353081 samples, validate on 150343 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [None]:
# Continue training with a smaller batch size. initial_epoch=10 tells Keras
# to number this run from epoch 11 up to `epochs` (50).
# NOTE(review): the recorded output of the previous fit shows it was
# interrupted during epoch 4, not after 10 — confirm the intended start epoch.
batch_size = 512
epochs = 50
model.fit(train_features, y, batch_size, epochs, initial_epoch=10, validation_split=0.1)

In [51]:
from features import load_features

In [53]:
from scipy import sparse

In [85]:
_, cat, _ = load_features('train', ['categorical_one_hot'])

In [86]:
cat = sparse.hstack(cat)

In [87]:
cat = cat.tocsr()

In [72]:
scaler = StandardScaler(with_mean=False)

In [73]:
imputer = Imputer()

In [74]:
# Fill missing entries of the one-hot matrix using Imputer's default strategy.
# NOTE(review): the execution counts show this cell (In [74]) ran before `cat`
# was re-loaded in In [85]-[87]. On a top-to-bottom run the training matrix is
# imputed and scaled while `cat_test` further down is not — confirm which
# preprocessing is intended and apply it to both.
cat = imputer.fit_transform(cat)

In [75]:
# Scale each column to unit variance; the scaler was created with
# with_mean=False, so values are not centred.
cat = scaler.fit_transform(cat)

In [88]:
cat.shape

(1503424, 5601)

In [89]:
# Fully connected regressor over the 5601 one-hot columns:
# two relu blocks (400 and 200 units, each followed by 50% dropout),
# then a single sigmoid output unit.
inp = Input(shape=(5601,))
x = inp
for _units in (400, 200):
    x = Dense(_units, activation='relu')(x)
    x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

In [90]:
model_onehot = Model(inp, x)

In [91]:
model_onehot.compile(optimizer='adam', loss='mean_squared_error')

In [92]:
batch_size = 512
epochs = 10
model_onehot.fit(cat, y, batch_size, epochs, validation_split=0.1)

Train on 1353081 samples, validate on 150343 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2cf0aa6dc50>

In [93]:
_, cat_test, _ = load_features('test', ['categorical_one_hot'])

In [94]:
# Stack the test feature blocks and convert to CSR, exactly as is done for
# the training matrix. hstack may return a COO matrix, which cannot be
# row-sliced when predictions are computed batch by batch.
cat_test = sparse.hstack(cat_test).tocsr()

In [95]:
from submission import create_submission

In [96]:
# Re-read test.csv (the earlier test_df was deleted) for the submission step.
with ZipFile('../data/test.csv.zip') as archive, archive.open('test.csv') as csv_file:
    test_df = pd.read_csv(csv_file)

In [97]:
# Predict on the test one-hot matrix and pass the predictions, an output
# directory, a run name and the test frame to the project's create_submission
# helper (presumably it writes the submission file — see submission.py).
create_submission(model_onehot.predict(cat_test), '../submissions/', 'nn_on_onehot_cat', test_df)

## Network 1 for categorical and numerical features

In [3]:
# First branch of the two-input model. The input width must be 100: that is
# the width of the first array yielded by useless_generator and the width
# shown in the recorded model summary (input_1: (None, 100), dense_1: 10100
# params). A width of 2540 would make fit_generator fail on a shape mismatch.
inp1 = Input(shape=(100,))
x1 = Dense(100, activation='relu')(inp1)
x1 = Dense(30, activation='relu')(x1)

## Network 2

In [4]:
# Second branch: 20 input features through relu layers of 10 then 50 units.
inp2 = Input(shape=(20,))
x2 = inp2
for _units in (10, 50):
    x2 = Dense(_units, activation='relu')(x2)

## Concatenation

In [5]:
merge = Concatenate()([x1, x2])

In [6]:
output = Dense(1, activation='sigmoid')(merge)

In [7]:
model = Model([inp1, inp2], [output])

## Data generator

In [8]:
from numpy.random import normal

In [9]:
def useless_generator(features, labels, batch_size):
    """Yield an endless stream of Gaussian-noise batches for a two-input model.

    ``features`` and ``labels`` are accepted but never read. Every item is
    ``([a, b], target)`` where ``a`` has shape ``(batch_size, 100)``, ``b``
    has shape ``(batch_size, 20)`` and ``target`` has shape ``(batch_size,)``;
    all three are drawn from a standard normal distribution.
    """
    while True:
        noise_wide = normal(size=(batch_size, 100))
        noise_narrow = normal(size=(batch_size, 20))
        noise_target = normal(size=batch_size)
        yield [noise_wide, noise_narrow], noise_target
            

In [14]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 100)          10100       input_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 10)           210         input_2[0][0]                    
__________________________________________________________________________________________________
dense_2 (D

In [None]:
# Smoke-test the two-input model on pure noise: 10 epochs, each consisting of
# 100000 generated batches of 64 samples.
model.fit_generator(useless_generator(None, None, 64), steps_per_epoch=100000, epochs=10)