# CNN

In [20]:
from keras.preprocessing import sequence
from keras.models import Model, Input
from keras.layers import Dense, Embedding, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.layers import Dense, Dropout, Activation, BatchNormalization

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [21]:
train = pd.read_csv('preprocessed_train.csv', low_memory=False, encoding='latin1')
final_out=pd.read_csv('Devex_submission_format.csv', low_memory=False, encoding='latin1')
test = pd.read_csv("Devex_test_questions.csv",low_memory=False, encoding='latin1')

In [22]:
#clean_applied Text instead of using Text
X_train = train['Text'].fillna("fillna").values
y_train = train[['3.1.1', '3.1.2', '3.2.1',
       '3.2.2', '3.3.1', '3.3.2', '3.3.3', '3.3.4', '3.3.5', '3.4.1', '3.4.2',
       '3.5.1', '3.5.2', '3.6.1', '3.7.1', '3.7.2', '3.8.1', '3.8.2', '3.9.1',
       '3.9.2', '3.9.3', '3.a.1', '3.b.1', '3.b.2', '3.b.3', '3.c.1', '3.d.1']].astype(float).values #.astype(np.int64).values
X_test = test['Text'].fillna("fillna").values

In [23]:
max_features = 30000  # number of words we want to keep
maxlen = 400  # max length of the docs in the model
batch_size = 32  # batch size for the model
embedding_dims = 20  # dimension of the hidden variable, which is the embedding dimension

In [24]:
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(X_train) + list(X_test))
x_train = tok.texts_to_sequences(X_train)
x_test = tok.texts_to_sequences(X_test)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

2995 train sequences
998 test sequences
Average train sequence length: 493
Average test sequence length: 597


In [25]:
x_train = sequence.pad_sequences(x_train, maxlen=maxlen,padding='post')
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

x_train shape: (2995, 400)
x_test shape: (998, 400)


In [26]:
from sklearn.model_selection import train_test_split
#X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.7, random_state=233)
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.7, random_state=233)



In [27]:
#binary_crossentropy
comment_input = Input((maxlen,))
from keras import regularizers

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
comment_emb = Embedding(max_features, embedding_dims, input_length=maxlen, 
                        embeddings_initializer="uniform", embeddings_regularizer=regularizers.l2(0.01))(comment_input)

# we add a GlobalMaxPooling1D, which will extract features from the embeddings
# of all words in the documents
h = GlobalMaxPooling1D()(comment_emb)

# We project onto a 27-unit output layer, and squash it with a sigmoid:
output = Dense(27, activation='sigmoid')(h)
BatchNormalization()
model_bin = Model(inputs=comment_input, outputs=output)
BatchNormalization()
#mod.add()

model_bin.compile(loss='binary_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy'])

In [28]:
hist = model_bin.fit(x_train, y_train, batch_size=batch_size, epochs=5, validation_split=0.1)

Train on 2695 samples, validate on 300 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
y_pred = model_bin.predict(X_val, batch_size=32)
y_pred[0].sum

<function ndarray.sum>

In [30]:
from sklearn.metrics import hamming_loss
def avg_loss(threshold, y_pred, y_val):
    loss = []
    final_test =  y_pred > threshold
    ham_y = final_test.astype(int)
    ham_true = y_val.astype(int)
    for i in range(len(y_pred)):
        loss.append(hamming_loss(ham_y[i], ham_true[i]))
    return np.mean(loss)

In [31]:
optimal_threshold ={}
for i in np.linspace(.3, .6, num=20):
    optimal_threshold[i]= avg_loss(i, y_pred, y_val)
optimal_threshold

{0.3: 0.07320891525563382,
 0.3157894736842105: 0.07189057800848679,
 0.33157894736842103: 0.07156099369670002,
 0.34736842105263155: 0.07123140938491326,
 0.3631578947368421: 0.07114901330696657,
 0.3789473684210526: 0.07114901330696657,
 0.39473684210526316: 0.07114901330696657,
 0.4105263157894737: 0.07102541919004655,
 0.4263157894736842: 0.07102541919004655,
 0.4421052631578947: 0.07147859761875332,
 0.45789473684210524: 0.07151979565772669,
 0.4736842105263158: 0.07156099369670003,
 0.4894736842105263: 0.07139620154080667,
 0.5052631578947369: 0.07139620154080667,
 0.5210526315789474: 0.07139620154080667,
 0.5368421052631579: 0.07139620154080667,
 0.5526315789473684: 0.07139620154080667,
 0.5684210526315789: 0.07139620154080667,
 0.5842105263157895: 0.07139620154080667,
 0.6: 0.07139620154080667}

In [32]:
threshold = min(optimal_threshold.items(), key=lambda x: x[1]) 
threshold

(0.4105263157894737, 0.07102541919004655)

In [33]:
y_pred_submit = model_bin.predict(x_test)
