In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM
from keras.models import Model

Using TensorFlow backend.


In [2]:
train_data_source = './eng/eng/trac2_eng_train.csv'
test_data_source = './eng/eng/trac2_eng_dev.csv'

train_df = pd.read_csv(train_data_source,)
train_df.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C45.451,Next part,NAG,NGEN
1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN
2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN
3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN
4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN


In [3]:
test_df = pd.read_csv(test_data_source)
test_df.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C7.2589,U deserve more subscribers. U really great.,NAG,NGEN
1,C68.872,Nice video....,NAG,NGEN
2,C36.762,sorry if i bother somebody.. iam a defence asp...,NAG,GEN
3,C4.1540.1,Joker was amazing....it was not glamorised !.....,NAG,NGEN
4,C59.68,Nice baro,NAG,NGEN


In [4]:
# convert string to lower case
train_texts = train_df['Text'].values
train_texts = [s.lower() for s in train_texts]

test_texts = test_df['Text'].values
test_texts = [s.lower() for s in test_texts]

In [5]:
train_df['length'] = list(map(lambda x: len(x), train_df['Text']))
train_df['length'].describe()

count    4263.000000
mean       97.965752
std       187.789251
min         3.000000
25%        20.000000
50%        44.000000
75%       104.000000
max      4377.000000
Name: length, dtype: float64

In [6]:
len(train_df[train_df['length']> 150])/len(train_df)

0.1653764954257565

In [7]:


# =======================Convert string to index================
# Tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)
# If we already have a character list, then replace the tk.word_index
# If not, just skip below part

# -----------------------Skip part start--------------------------
# construct a new vocabulary
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1

# Use char_dict to replace the tk.word_index
tk.word_index = char_dict.copy()
# Add 'UNK' to the vocabulary
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
# -----------------------Skip part end----------------------------

# Convert string to index
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

In [8]:

# Padding
train_data = pad_sequences(train_sequences, maxlen=150, padding='post')
test_data = pad_sequences(test_texts, maxlen=150, padding='post')

# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

# =======================Get classes================
train_df['Sub-task A']= pd.Categorical(train_df['Sub-task A'])
train_df['agg_class'] = train_df['Sub-task A'].cat.codes
#train_class_list = [x - 1 for x in train_classes]

test_df['Sub-task A'] = pd.Categorical(test_df['Sub-task A'])
test_df['agg_class'] = test_df['Sub-task A'].cat.codes
#test_class_list = [x - 1 for x in test_classes]

from keras.utils import to_categorical
#Y = pd.get_dummies(train_data['Sub-task A']).values
train_classes = to_categorical(train_df['agg_class'])
test_classes = to_categorical(test_df['agg_class'])

In [9]:
test_df['agg_class'].value_counts()

1    836
0    117
2    113
Name: agg_class, dtype: int64

In [10]:
input_size = 150
vocab_size = len(tk.word_index)
embedding_size = 100
conv_layers = [[256, 7, 3],
               [256, 7, 3],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, 3]]

fully_connected_layers = [1024, 1024]
num_of_classes = 3
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'

In [11]:


# Embedding weights
embedding_weights = []  # (70, 69)
embedding_weights.append(np.zeros(vocab_size))  # (0, 69)

for char, i in tk.word_index.items():  # from index 1 to 69
    onehot = np.zeros(vocab_size)
    onehot[i - 1] = 1
    embedding_weights.append(onehot)

embedding_weights = np.array(embedding_weights)
print('Load')

# Embedding layer Initialization
embedding_layer = Embedding(vocab_size + 1,
                            embedding_size,
                            input_length=input_size,
                            #weights=[embedding_weights]
                           )

# Model Construction
# Input
inputs = Input(shape=(input_size,), name='input', dtype='int64')  # shape=(?, 1014)
# Embedding
x = embedding_layer(inputs)
# Conv
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x)
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)  # Final shape=(None, 34, 256)
x = Flatten()(x)  # (None, 8704)
# Fully connected layers
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x)  # dense_size == 1024
    x = Dropout(dropout_p)(x)
# Output Layer
predictions = Dense(num_of_classes, activation='softmax')(x)
# Build model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])  # Adam, categorical_crossentropy
model.summary()

# Shuffle


Load





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 100)          7000      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 144, 256)          179456    
_________________________________________________________________
activation_1 (Activation)    (None, 144, 256)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 48, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 42, 256)           459008    
_______________

In [23]:
indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)

x_train = train_data[indices]
y_train = train_classes[indices]

x_test = test_data
y_test = test_classes

# Training
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=1)


Train on 4263 samples, validate on 1066 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2ab93673748>

In [24]:
to_categorical(y_pred_bool)

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [25]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test, batch_size=64, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(y_test, to_categorical(y_pred_bool)))

              precision    recall  f1-score   support

           0       0.24      0.24      0.24       117
           1       0.84      0.87      0.85       836
           2       0.28      0.19      0.22       113

   micro avg       0.73      0.73      0.73      1066
   macro avg       0.45      0.43      0.44      1066
weighted avg       0.71      0.73      0.72      1066
 samples avg       0.73      0.73      0.73      1066

