In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Add
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import gc
import math
from sklearn import metrics

In [18]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients packed into a float32 vector.

    Parameters
    ----------
    word : the token; returned unchanged.
    *arr : the remaining positional values, converted to a 1-D float32 array.

    Returns
    -------
    tuple of (word, numpy.ndarray of dtype float32)
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Load the pretrained word vectors into a {token: float32 vector} dict.
# - The file is opened in a `with` block so the handle is closed after loading.
# - UTF-8 is requested explicitly so decoding does not depend on the platform
#   default encoding.
# - Lines with fewer than two coefficients are skipped: a fastText-style header
#   ("<vocab_size> <dim>") would otherwise be stored as a one-element "vector",
#   which NumPy would silently broadcast across a whole embedding row later on.
embeddings_index = {}
with open('./wiki.ru.vec', encoding='utf-8') as vec_file:
    for line in vec_file:
        fields = line.rstrip().rsplit(' ')
        if len(fields) <= 2:
            continue
        token, vector = get_coefs(*fields)
        embeddings_index[token] = vector

In [19]:
# Read the training table from the working directory into a DataFrame.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [20]:
# Placeholder columns that the training loop fills with held-out predictions
# ('t' from the title model, 'd' from the description model).
# They are created as floats: the predictions written into them are floats, and
# starting from an integer column would force an implicit dtype upcast on
# assignment (deprecated behaviour in recent pandas).
train['t'] = 0.0
train['d'] = 0.0

In [21]:
# K-fold splitter for the out-of-fold predictions.
# `random_state` only has an effect when shuffling is enabled; recent
# scikit-learn versions raise a ValueError if it is set while shuffle=False,
# so shuffling is switched on explicitly.
kf = KFold(n_splits=2, shuffle=True, random_state=2018)
# `split` returns a one-shot generator. Materialising it lets the training cell
# be re-run without silently iterating over nothing.
idtotal = list(kf.split(train))

In [22]:
# --- Vocabulary / sequence shape ---
max_features = 100000          # vocabulary cap given to the Tokenizer and Embedding layers
maxlen_t = 15                  # padded length of title sequences
maxlen_d = 60                  # padded length of description sequences
embed_size = 300               # width of each embedding vector

# --- Convolution settings ---
filter_sizes = list(range(1, 5))   # kernel heights, in tokens: 1, 2, 3, 4
num_filters = 32                   # filters per kernel height

# --- Training settings ---
batch_size = 256
epochs = 3

In [23]:
def data_cleaning(ids):
    """Build padded token sequences and targets for one cross-validation fold.

    Parameters
    ----------
    ids : pair (train_positions, test_positions) of positional row indices into
        the module-level `train` DataFrame, as produced by `KFold.split`.

    Returns
    -------
    dict with keys
        'tokenizer' : the fitted keras Tokenizer (titles + descriptions of both
                      partitions),
        'x_train_t', 'x_test_t' : title sequences padded to `maxlen_t`,
        'x_train_d', 'x_test_d' : description sequences padded to `maxlen_d`,
        'y_train' : array of shape (n_train, 2) holding [p, 1 - p] per row,
        'y_test'  : 1-D array of `deal_probability` for the held-out rows.
    """
    def _as_text(column):
        # Fill missing values BEFORE casting to str: casting first turns NaN
        # into the literal string 'nan', after which fillna never fires.
        return column.fillna('fillna').astype(str).values

    # KFold yields positions, not labels, so index positionally.
    t_train = train.iloc[ids[0]].reset_index(drop=True)
    t_test = train.iloc[ids[1]].reset_index(drop=True)

    x_input_t = _as_text(t_train['title'])
    x_input_d = _as_text(t_train['description'])
    test_t = _as_text(t_test['title'])
    test_d = _as_text(t_test['description'])

    y_test = t_test['deal_probability'].values
    y_train = t_train['deal_probability'].values
    # Two-column target [p, 1 - p] to match the 2-unit softmax output of the models.
    y_train = np.column_stack([y_train, 1 - y_train])

    # One shared vocabulary for titles and descriptions.
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(x_input_t) + list(test_t))
    tokenizer.fit_on_texts(list(x_input_d) + list(test_d))

    def _encode(texts, maxlen):
        # Map texts to integer ids, then pad/truncate to a fixed length.
        return sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)

    x_train_t = _encode(x_input_t, maxlen_t)
    x_test_t = _encode(test_t, maxlen_t)
    x_train_d = _encode(x_input_d, maxlen_d)
    x_test_d = _encode(test_d, maxlen_d)

    return {'tokenizer': tokenizer, 'x_train_t': x_train_t, 'x_test_t': x_test_t, 'x_train_d': x_train_d,
           'x_test_d': x_test_d, 'y_train': y_train, 'y_test': y_test }

In [24]:
def _build_text_cnn(maxlen, vocab_size, embedding_matrix):
    """Build and compile one single-input text CNN over sequences of length `maxlen`.

    The network embeds the token ids, applies one Conv2D per entry of
    `filter_sizes` (each kernel spans the full embedding width), max-pools each
    feature map over the whole sequence, concatenates the pooled maps and ends
    in a 2-unit softmax. It is compiled with mean-squared-error loss and Adam.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.3)(x)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    x = Reshape((maxlen, embed_size, 1))(x)

    pooled = []
    for size in filter_sizes:
        conv = Conv2D(num_filters, kernel_size=(size, embed_size),
                      kernel_initializer='normal', activation='elu')(x)
        # A "valid" convolution leaves maxlen - size + 1 positions; pool over all of them.
        pooled.append(MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv))

    z = Concatenate(axis=1)(pooled)
    z = Flatten()(z)
    z = Dropout(0.2)(z)
    out = Dense(2, activation="softmax")(z)

    model = Model(inputs=[inp], outputs=out)
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
    return model


def CNN(dic):
    """Create the title and description CNN models for one fold.

    Parameters
    ----------
    dic : dict containing at least the key 'tokenizer', a fitted keras
        Tokenizer whose `word_index` is used to build the embedding matrix.

    Returns
    -------
    dict {'t': title model, 'd': description model}, both compiled.
    """
    word_index = dic['tokenizer'].word_index
    # Tokenizer indices start at 1 (0 is the padding id), so the matrix needs
    # len(word_index) + 1 rows when the vocabulary is smaller than max_features.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        # Ignore entries of the wrong length: NumPy would otherwise broadcast a
        # one-element vector across the whole row without complaint.
        if embedding_vector is not None and embedding_vector.shape == (embed_size,):
            embedding_matrix[i] = embedding_vector

    # The Embedding input dimension must match the matrix row count, hence nb_words.
    model_t = _build_text_cnn(maxlen_t, nb_words, embedding_matrix)
    model_d = _build_text_cnn(maxlen_d, nb_words, embedding_matrix)

    return {'t': model_t, 'd': model_d}

In [None]:
# Out-of-fold training: for every fold, fit the title and description models on
# the training partition and write their predictions for the held-out rows into
# train['t'] / train['d'].
k = 0
lst_t = []
lst_d = []

# The prediction columns receive floats; make sure they can hold them without
# an implicit dtype upcast on assignment.
train[['t', 'd']] = train[['t', 'd']].astype('float64')

for k, ids in enumerate(idtotal, start=1):
    print('Fold ' + str(k) + ' Start!')

    print('Data Cleaning Start!')
    a = data_cleaning(ids)

    print('CNN Construction Start!')
    models = CNN(a)

    # ids[1] holds positions; translate them to index labels for .loc.
    # (.at is a scalar accessor and must not be used with an array of rows.)
    held_out_rows = train.index[ids[1]]

    print('Title Trainning Start!')
    hist_t = models['t'].fit(a['x_train_t'], a['y_train'], batch_size=batch_size, epochs=epochs, verbose=2)
    test_t_pred = models['t'].predict(a['x_test_t'], batch_size=1024)
    # Column 0 of the softmax output is trained against deal_probability.
    train.loc[held_out_rows, 't'] = test_t_pred[:,0]
    print('Title RMSE: %f' % math.sqrt(metrics.mean_squared_error(a['y_test'], test_t_pred[:,0])))

    print('Description Trainning Start!')
    hist_d = models['d'].fit(a['x_train_d'], a['y_train'], batch_size=batch_size, epochs=epochs, verbose=2)
    test_d_pred = models['d'].predict(a['x_test_d'], batch_size=1024)
    train.loc[held_out_rows, 'd'] = test_d_pred[:,0]
    print('Description RMSE: %f' % math.sqrt(metrics.mean_squared_error(a['y_test'], test_d_pred[:,0])))

print('Finish!')

Fold 1 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 2417s - loss: 0.0580 - mean_squared_error: 0.0580
Epoch 2/3


In [14]:
# Write the training table (with the 't' and 'd' columns) to CSV, without the index.
train.to_csv(path_or_buf='trainforuse2.csv', index=False)

In [None]:
Fold 1 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 2524s - loss: 0.0572 - mean_squared_error: 0.0572
Epoch 2/3
 - 8825s - loss: 0.0545 - mean_squared_error: 0.0545
Epoch 3/3
 - 2313s - loss: 0.0532 - mean_squared_error: 0.0532
Description Trainning Start!
Epoch 1/3
 - 4245s - loss: 0.0581 - mean_squared_error: 0.0581
Epoch 2/3
 - 4131s - loss: 0.0545 - mean_squared_error: 0.0545
Epoch 3/3
 - 11996s - loss: 0.0521 - mean_squared_error: 0.0521
Fold 2 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 5926s - loss: 0.0577 - mean_squared_error: 0.0577
Epoch 2/3
 - 4620s - loss: 0.0545 - mean_squared_error: 0.0545
Epoch 3/3
 - 3662s - loss: 0.0532 - mean_squared_error: 0.0532
Description Trainning Start!
Epoch 1/3
 - 7157s - loss: 0.0594 - mean_squared_error: 0.0594
Epoch 2/3
 - 7127s - loss: 0.0545 - mean_squared_error: 0.0545
Epoch 3/3
 - 6648s - loss: 0.0521 - mean_squared_error: 0.0521
Fold 3 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 3665s - loss: 0.0575 - mean_squared_error: 0.0575
Epoch 2/3
 - 5162s - loss: 0.0543 - mean_squared_error: 0.0543
Epoch 3/3
 - 8013s - loss: 0.0530 - mean_squared_error: 0.0530
Description Trainning Start!
Epoch 1/3
 - 20662s - loss: 0.0600 - mean_squared_error: 0.0600
Epoch 2/3
 - 4869s - loss: 0.0543 - mean_squared_error: 0.0543
Epoch 3/3
 - 6203s - loss: 0.0519 - mean_squared_error: 0.0519
Finish!

In [None]:
Fold 1 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 4620s - loss: 0.0581 - mean_squared_error: 0.0581
Epoch 2/3
 - 3155s - loss: 0.0547 - mean_squared_error: 0.0547
Epoch 3/3
 - 3323s - loss: 0.0534 - mean_squared_error: 0.0534
Title RMSE: 0.234442
Description Trainning Start!
Epoch 1/3
 - 6731s - loss: 0.0580 - mean_squared_error: 0.0580
Epoch 2/3
 - 4412s - loss: 0.0546 - mean_squared_error: 0.0546
Epoch 3/3
 - 4420s - loss: 0.0524 - mean_squared_error: 0.0524
Description RMSE: 0.236288
Fold 2 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 2436s - loss: 0.0578 - mean_squared_error: 0.0578
Epoch 2/3
 - 2414s - loss: 0.0546 - mean_squared_error: 0.0546
Epoch 3/3
 - 2281s - loss: 0.0534 - mean_squared_error: 0.0534
Title RMSE: 0.235876
Description Trainning Start!
Epoch 1/3
 - 4878s - loss: 0.0600 - mean_squared_error: 0.0600
Epoch 2/3
 - 4345s - loss: 0.0547 - mean_squared_error: 0.0547
Epoch 3/3
 - 9811s - loss: 0.0526 - mean_squared_error: 0.0526
Description RMSE: 0.235445
Fold 3 Start!
Data Cleaning Start!
CNN Construction Start!
Title Trainning Start!
Epoch 1/3
 - 3731s - loss: 0.0575 - mean_squared_error: 0.0575
Epoch 2/3
 - 3685s - loss: 0.0545 - mean_squared_error: 0.0545
Epoch 3/3
 - 3645s - loss: 0.0533 - mean_squared_error: 0.0533
Title RMSE: 0.235326
Description Trainning Start!
Epoch 1/3
 - 7152s - loss: 0.0579 - mean_squared_error: 0.0579
Epoch 2/3
 - 6924s - loss: 0.0544 - mean_squared_error: 0.0544
Epoch 3/3
 - 6815s - loss: 0.0523 - mean_squared_error: 0.0523
Description RMSE: 0.236635
Finish!