In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Add
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from sklearn.model_selection import KFold
import gc
import math
from sklearn import metrics

Using TensorFlow backend.


In [2]:
def get_coefs(word, *arr):
    """Pair a vocabulary token with its embedding vector.

    Args:
        word: The token (first field of a line in the vectors file).
        *arr: The remaining fields of that line, i.e. the vector components
            (strings or numbers convertible to float).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Load the pre-trained word vectors into a {word: float32 vector} dict.
# - `with` guarantees the file handle is closed once the dict is built.
# - The encoding is explicit so the Russian vocabulary decodes identically on
#   every platform instead of depending on the locale default.
with open('./wiki.ru.vec', encoding='utf-8') as vec_file:
    # fastText .vec files begin with a "<vocab_size> <dim>" header line. Parsing
    # it as a word would add a bogus one-component "vector" to the index, so
    # skip it; if the first line is not a two-field header, rewind and keep it.
    first_line = vec_file.readline()
    if len(first_line.split()) != 2:
        vec_file.seek(0)
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in vec_file
    )

In [56]:
# Load the training set; the 'deal_probability' column is kept separately as
# the target that the fold models are fitted against.
train = pd.read_csv('train.csv')
y_train = train['deal_probability']

In [57]:
# Load the test set; its 'title' and 'description' columns are tokenised
# together with the training text further down.
test = pd.read_csv('test.csv')

In [48]:
# Placeholder columns for the predictions of the title-only ('t') and
# description-only ('d') models. They are created as floats (0.0, not 0)
# because they later receive float predictions; writing floats into an int64
# column forces a dtype change that recent pandas versions warn about.
train['t'] = 0.0
train['d'] = 0.0
test['t'] = 0.0
test['d'] = 0.0

In [58]:
# Placeholder column for the predictions of the joint title+description model.
# Created as float (0.0, not 0) because it later receives float predictions;
# an int64 column would have to change dtype on assignment.
train['mix'] = 0.0
test['mix'] = 0.0

In [61]:
# random_state only has an effect when shuffle=True; without it the seed is
# ignored (and newer scikit-learn versions reject the combination outright).
kf = KFold(n_splits=3, shuffle=True, random_state=2018)
# kf.split() returns a one-shot generator. Materialise it so that every
# training loop in the notebook (and any re-run of a loop cell) iterates the
# same (train_idx, valid_idx) folds instead of finding it already exhausted.
idtotal = list(kf.split(train))

In [50]:
# Hyper-parameters shared by the tokeniser and the CNN builders.
max_features = 100000      # vocabulary cap: Tokenizer num_words and Embedding input size
maxlen_t = 15              # titles are padded/truncated to this many tokens
maxlen_d = 60              # descriptions are padded/truncated to this many tokens
embed_size = 300           # embedding width; presumably the dimension of wiki.ru.vec -- confirm
filter_sizes = [1,2,3,4]   # heights (in tokens) of the parallel convolution kernels
num_filters = 32           # number of filters per convolution
batch_size = 256           # mini-batch size passed to model.fit
epochs = 3                 # intended number of training epochs per model fit

In [51]:
# Build one vocabulary shared by titles and descriptions (train + test) and
# convert every text into a fixed-length sequence of word indices.
tokenizer = text.Tokenizer(num_words=max_features)

# Missing values are replaced by the placeholder token 'fillna'.
# For descriptions, fillna must run BEFORE astype(str): astype(str) converts
# NaN into the literal string 'nan', after which fillna has nothing to replace
# and the placeholder is never applied.
input_t = train['title'].fillna('fillna').values
input_d = train['description'].fillna('fillna').astype(str).values
test_t = test['title'].fillna('fillna').values
test_d = test['description'].fillna('fillna').astype(str).values

# fit_on_texts accumulates counts, so the two calls yield a single vocabulary.
tokenizer.fit_on_texts(list(input_t) + list(test_t))
tokenizer.fit_on_texts(list(input_d) + list(test_d))

# Texts -> index sequences -> arrays padded/truncated to a fixed length.
input_t = sequence.pad_sequences(tokenizer.texts_to_sequences(input_t), maxlen=maxlen_t)
test_t = sequence.pad_sequences(tokenizer.texts_to_sequences(test_t), maxlen=maxlen_t)
input_d = sequence.pad_sequences(tokenizer.texts_to_sequences(input_d), maxlen=maxlen_d)
test_d = sequence.pad_sequences(tokenizer.texts_to_sequences(test_d), maxlen=maxlen_d)

In [52]:
def data_cleaning(ids, titles=None, descriptions=None, targets=None):
    """Slice the padded sequences and targets for one cross-validation fold.

    Args:
        ids: ``(train_positions, valid_positions)`` pair of positional index
            arrays, as produced by ``KFold.split``.
        titles: Optional 2-D array of padded title sequences. Defaults to the
            notebook-level ``input_t``.
        descriptions: Optional 2-D array of padded description sequences.
            Defaults to the notebook-level ``input_d``.
        targets: Optional pandas Series of target values. Defaults to the
            notebook-level ``y_train``.

    Returns:
        dict with keys
        ``'x_train_t'``, ``'x_test_t'``: title sequences of the fit / held-out rows;
        ``'x_train_d'``, ``'x_test_d'``: description sequences of the same rows;
        ``'y_train'``: float array of shape (n_fit, 2) with columns
        ``[y, 1 - y]``, matching the 2-unit output of the models;
        ``'y_test'``: Series holding the raw targets of the held-out rows.
    """
    # Fall back to the notebook-level data when no explicit inputs are given.
    titles = input_t if titles is None else titles
    descriptions = input_d if descriptions is None else descriptions
    targets = y_train if targets is None else targets

    fit_pos, valid_pos = ids[0], ids[1]

    # KFold yields *positions*, so select with .iloc. Using .loc would only
    # work by coincidence when the Series carries a default 0..n-1 index.
    y_fit = targets.iloc[fit_pos].values
    y_fit_two_col = np.column_stack([y_fit, 1 - y_fit])
    y_valid = targets.iloc[valid_pos]

    return {
        'x_train_t': titles[fit_pos],
        'x_test_t': titles[valid_pos],
        'x_train_d': descriptions[fit_pos],
        'x_test_d': descriptions[valid_pos],
        'y_train': y_fit_two_col,
        'y_test': y_valid,
    }

In [53]:
def CNN(dic):
    """Build two independent text-CNN models: one for titles, one for descriptions.

    NOTE(review): this notebook defines ``CNN`` twice. A later cell defines a
    joint two-input variant under the same name, and whichever cell was
    executed last is the one in effect. This variant returns a dict and is the
    one required by the loop that indexes ``models['t']`` / ``models['d']``.

    Each model embeds its token sequence with the pre-trained vectors, applies
    one convolution per entry of ``filter_sizes`` spanning the full embedding
    width, max-pools each feature map over all positions, and feeds the
    concatenated features to a 2-unit softmax trained with mean squared error.

    Args:
        dic: Unused; kept so existing ``CNN(a)`` calls keep working.

    Returns:
        dict ``{'t': title_model, 'd': description_model}`` of compiled Keras
        models.
    """
    word_index = tokenizer.word_index
    # Tokenizer indices start at 1 (0 is reserved for padding), so the table
    # needs len(word_index) + 1 rows to hold every word when the vocabulary is
    # smaller than max_features. The Embedding layers below are sized from the
    # same number so the weight matrix always matches the layer shape.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        # Words without a pre-trained vector keep an all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    def conv_bank(x):
        # One convolution per kernel height, each spanning the whole embedding width.
        return [
            Conv2D(num_filters, kernel_size=(size, embed_size),
                   kernel_initializer='normal', activation='elu')(x)
            for size in filter_sizes
        ]

    def pool_bank(convs, maxlen):
        # Pool window = full height of each feature map, i.e. max over all positions.
        return [
            MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv)
            for size, conv in zip(filter_sizes, convs)
        ]

    inp_t = Input(shape=(maxlen_t, ))
    inp_d = Input(shape=(maxlen_d, ))
    x_t = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp_t)
    x_d = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp_d)
    x_t = SpatialDropout1D(0.3)(x_t)
    x_d = SpatialDropout1D(0.3)(x_d)
    # Add a trailing channel axis so the sequences can go through Conv2D.
    x_t = Reshape((maxlen_t, embed_size, 1))(x_t)
    x_d = Reshape((maxlen_d, embed_size, 1))(x_d)

    convs_t = conv_bank(x_t)
    convs_d = conv_bank(x_d)
    pools_t = pool_bank(convs_t, maxlen_t)
    pools_d = pool_bank(convs_d, maxlen_d)

    z_t = Concatenate(axis=1)(pools_t)
    z_d = Concatenate(axis=1)(pools_d)
    z_t = Flatten()(z_t)
    z_t = Dropout(0.2)(z_t)
    z_d = Flatten()(z_d)
    z_d = Dropout(0.2)(z_d)

    # Two softmax units regress [p, 1 - p]; training code reads column 0 as p.
    out_t = Dense(2, activation="softmax")(z_t)
    out_d = Dense(2, activation="softmax")(z_d)

    model_t = Model(inputs=[inp_t], outputs=out_t)
    model_d = Model(inputs=[inp_d], outputs=out_d)
    model_t.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
    model_d.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

    return {'t': model_t, 'd': model_d}

In [60]:
def CNN(dic):
    """Build one joint text-CNN model fed by both the title and the description.

    NOTE(review): this notebook defines ``CNN`` twice. An earlier cell defines
    a variant under the same name that returns a dict of two separate models,
    and whichever cell was executed last is the one in effect. This variant
    returns a single model and is the one required by the loop that calls
    ``models.fit([titles, descriptions], ...)``.

    Both inputs are embedded with the pre-trained vectors, passed through one
    convolution per entry of ``filter_sizes`` spanning the full embedding
    width, and max-pooled over all positions. The pooled features of both
    branches are concatenated and fed to a 2-unit softmax trained with mean
    squared error.

    Args:
        dic: Unused; kept so existing ``CNN(a)`` calls keep working.

    Returns:
        A compiled Keras ``Model`` taking ``[title_seq, description_seq]``.
    """
    word_index = tokenizer.word_index
    # Tokenizer indices start at 1 (0 is reserved for padding), so the table
    # needs len(word_index) + 1 rows to hold every word when the vocabulary is
    # smaller than max_features. The Embedding layers below are sized from the
    # same number so the weight matrix always matches the layer shape.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        # Words without a pre-trained vector keep an all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    def conv_bank(x):
        # One convolution per kernel height, each spanning the whole embedding width.
        return [
            Conv2D(num_filters, kernel_size=(size, embed_size),
                   kernel_initializer='normal', activation='elu')(x)
            for size in filter_sizes
        ]

    def pool_bank(convs, maxlen):
        # Pool window = full height of each feature map, i.e. max over all positions.
        return [
            MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv)
            for size, conv in zip(filter_sizes, convs)
        ]

    inp_t = Input(shape=(maxlen_t, ))
    inp_d = Input(shape=(maxlen_d, ))
    x_t = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp_t)
    x_d = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp_d)
    x_t = SpatialDropout1D(0.3)(x_t)
    x_d = SpatialDropout1D(0.3)(x_d)
    # Add a trailing channel axis so the sequences can go through Conv2D.
    x_t = Reshape((maxlen_t, embed_size, 1))(x_t)
    x_d = Reshape((maxlen_d, embed_size, 1))(x_d)

    convs_t = conv_bank(x_t)
    convs_d = conv_bank(x_d)
    pools_t = pool_bank(convs_t, maxlen_t)
    pools_d = pool_bank(convs_d, maxlen_d)

    # Title features first, then description features.
    z = Concatenate(axis=1)(pools_t + pools_d)
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    # Two softmax units regress [p, 1 - p]; training code reads column 0 as p.
    out = Dense(2, activation="softmax")(z)

    model = Model(inputs=[inp_t,inp_d], outputs=out)
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

    return model

In [None]:
# Cross-validated training of the joint title+description model.
# For each fold: fit on the training part, store out-of-fold predictions in
# train['mix'], and accumulate test predictions to average across folds.
# NOTE(review): this loop expects CNN() to return a single two-input model;
# the notebook defines CNN twice, so make sure that variant ran last.
k = 0
result = None
for ids in idtotal:
    k += 1
    print('Fold ' + str(k) + ' Start!')

    print('Data Cleaning Start!')
    a = data_cleaning(ids)

    print('CNN Construction Start!')
    models = CNN(a)

    print('Title Trainning Start!')
    # Use the `epochs` config constant rather than a repeated literal.
    hist = models.fit([a['x_train_t'], a['x_train_d']], a['y_train'],
                      batch_size=batch_size, epochs=epochs, verbose=2)
    valid_pred = models.predict([a['x_test_t'], a['x_test_d']], batch_size=1024)
    # .at is a scalar accessor; assigning a whole fold needs .loc. The fold
    # holds positions, so translate them to index labels first.
    train.loc[train.index[ids[1]], 'mix'] = valid_pred[:, 0]
    test_pred = pd.Series(models.predict([test_t, test_d], batch_size=1024)[:, 0])
    result = test_pred if result is None else result + test_pred

    print('Total RMSE: %f' % math.sqrt(metrics.mean_squared_error(a['y_test'], valid_pred[:, 0])))

if result is None:
    # Happens when `idtotal` is an already-consumed generator: fail loudly
    # instead of silently reusing a stale `result` from an earlier run.
    raise RuntimeError('No folds were processed: `idtotal` is empty or already exhausted.')

# Average over the folds actually run instead of a hard-coded 3.
result = result / k

print('Finish!')

Fold 1 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 5931s - loss: 0.0558 - mean_squared_error: 0.0558
Epoch 2/3
 - 41075s - loss: 0.0518 - mean_squared_error: 0.0518
Epoch 3/3


In [38]:
# Cross-validated training of the separate title-only and description-only
# models. For each fold: fit each model on the training part, store its
# out-of-fold predictions in train['t'] / train['d'], and accumulate test
# predictions to average across folds.
# NOTE(review): this loop expects CNN() to return {'t': model, 'd': model};
# the notebook defines CNN twice, so make sure that variant ran last.
k = 0
result_t = None
result_d = None
for ids in idtotal:
    k += 1
    print('Fold ' + str(k) + ' Start!')

    print('Data Cleaning Start!')
    a = data_cleaning(ids)

    print('CNN Construction Start!')
    models = CNN(a)

    # The fold holds positions; translate them to index labels once for the
    # .loc assignments below (.at is a scalar accessor and cannot take an array).
    valid_labels = train.index[ids[1]]

    print('Title Trainning Start!')
    # Use the `epochs` config constant rather than a repeated literal.
    hist_t = models['t'].fit(a['x_train_t'], a['y_train'], batch_size=batch_size, epochs=epochs, verbose=2)
    valid_t_pred = models['t'].predict(a['x_test_t'], batch_size=1024)
    train.loc[valid_labels, 't'] = valid_t_pred[:, 0]
    test_t_pred = pd.Series(models['t'].predict(test_t, batch_size=1024)[:, 0])
    result_t = test_t_pred if result_t is None else result_t + test_t_pred

    print('Title RMSE: %f' % math.sqrt(metrics.mean_squared_error(a['y_test'], valid_t_pred[:, 0])))

    print('Description Trainning Start!')
    hist_d = models['d'].fit(a['x_train_d'], a['y_train'], batch_size=batch_size, epochs=epochs, verbose=2)
    valid_d_pred = models['d'].predict(a['x_test_d'], batch_size=1024)
    train.loc[valid_labels, 'd'] = valid_d_pred[:, 0]
    test_d_pred = pd.Series(models['d'].predict(test_d, batch_size=1024)[:, 0])
    result_d = test_d_pred if result_d is None else result_d + test_d_pred

    print('Description RMSE: %f' % math.sqrt(metrics.mean_squared_error(a['y_test'], valid_d_pred[:, 0])))

if result_t is None or result_d is None:
    # Happens when `idtotal` is an already-consumed generator: fail loudly
    # instead of silently reusing stale results from an earlier run.
    raise RuntimeError('No folds were processed: `idtotal` is empty or already exhausted.')

# Average over the folds actually run instead of a hard-coded 3.
result_t = result_t / k
result_d = result_d / k

print('Finish!')

Fold 1 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 8239s - loss: 0.0577 - mean_squared_error: 0.0577
Epoch 2/3
 - 2344s - loss: 0.0546 - mean_squared_error: 0.0546
Epoch 3/3
 - 2362s - loss: 0.0534 - mean_squared_error: 0.0534
Title RMSE: 0.234528
Description Trainning Start!
Epoch 1/3
 - 4343s - loss: 0.0594 - mean_squared_error: 0.0594
Epoch 2/3
 - 4503s - loss: 0.0547 - mean_squared_error: 0.0547
Epoch 3/3
 - 40787s - loss: 0.0525 - mean_squared_error: 0.0525
Description RMSE: 0.235712
Fold 2 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 2479s - loss: 0.0581 - mean_squared_error: 0.0581
Epoch 2/3
 - 2274s - loss: 0.0546 - mean_squared_error: 0.0546
Epoch 3/3
 - 4719s - loss: 0.0534 - mean_squared_error: 0.0534
Title RMSE: 0.234534
Description Trainning Start!
Epoch 1/3
 - 4095s - loss: 0.0580 - mean_squared_error: 0.0580
Epoch 2/3
 - 4104s - loss: 0.0546 - mean_squared_error: 0.0546
Epoch 3/3
 - 70

In [43]:
# Persist the training frame (including the prediction columns filled in by the
# training loops) without the index column.
train.to_csv('trainforuse.csv',index=False)

In [44]:
# Attach the fold-averaged test predictions of the title-only and
# description-only models to the test frame.
test['t'] = result_t
test['d'] = result_d

In [45]:
# Persist the test frame, with its prediction columns, without the index column.
test.to_csv('testforuse.csv',index=False)