In [1]:
import os

import numpy as np
import pandas as pd
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

Using TensorFlow backend.


In [2]:
# Experiment configuration: the language whose data is loaded and the label
# column (sub-task) that is classified.
lang = 'eng'
task = 'Sub-task B'

# (train CSV, dev CSV) for every supported language code.
data_sources = {
    'ben': ('./iben/iben/trac2_iben_train_transliterated.csv',
            './iben/iben/trac2_iben_dev_transliterated.csv'),
    'hin': ('./hin/hin/trac2_hin_train_transliterated.csv',
            './hin/hin/trac2_hin_dev_transliterated.csv'),
    'eng': ('./eng/eng/trac2_eng_train.csv',
            './eng/eng/trac2_eng_dev.csv'),
}
if lang not in data_sources:
    # Fail here with a clear message instead of a NameError on the
    # never-assigned path variables further down.
    raise ValueError('Unsupported lang %r; expected one of %s'
                     % (lang, sorted(data_sources)))
train_data_source, test_data_source = data_sources[lang]

# Load the training split and preview its first rows.
train_df = pd.read_csv(train_data_source)
train_df.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C45.451,Next part,NAG,NGEN
1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN
2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN
3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN
4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN


In [3]:
# Load the dev CSV chosen for `lang` (used as the test/validation split in
# the rest of the notebook) and preview its first rows.
test_df = pd.read_csv(test_data_source)
test_df.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C7.2589,U deserve more subscribers. U really great.,NAG,NGEN
1,C68.872,Nice video....,NAG,NGEN
2,C36.762,sorry if i bother somebody.. iam a defence asp...,NAG,GEN
3,C4.1540.1,Joker was amazing....it was not glamorised !.....,NAG,NGEN
4,C59.68,Nice baro,NAG,NGEN


In [4]:
# Pick the text column for the selected language and lower-case it.
# English uses the raw 'Text' column; every other language uses the
# 'transliterated' column.
text_column = 'Text' if lang == 'eng' else 'transliterated'

# pd.read_csv turns empty cells (and tokens such as "NA" or "null") into NaN,
# which is a float and has no .lower(); treat those rows as empty strings so
# a single missing comment does not abort the whole run.
train_texts = train_df[text_column].fillna('').astype(str).str.lower().tolist()
test_texts = test_df[text_column].fillna('').astype(str).str.lower().tolist()

In [5]:
# Character count of every training text exactly as it is fed to the
# tokenizer. `train_texts` comes from the language-specific column, so the
# statistics stay meaningful for non-English runs, where the model input is
# the 'transliterated' column rather than 'Text'.
train_df['length'] = [len(text) for text in train_texts]
train_df['length'].describe()

count    4263.000000
mean       97.965752
std       187.789251
min         3.000000
25%        20.000000
50%        44.000000
75%       104.000000
max      4377.000000
Name: length, dtype: float64

In [6]:
# Share of training texts longer than 150 characters.
int(train_df['length'].gt(150).sum()) / len(train_df)

0.1653764954257565

In [7]:
# Number of training rows per label of the selected sub-task (class balance).
train_df[task].value_counts()

NGEN    3954
GEN      309
Name: Sub-task B, dtype: int64

In [8]:


# =======================Convert string to index================
# Character-level tokenizer with 'UNK' registered as the out-of-vocabulary token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)
# The vocabulary learned by fit_on_texts is replaced below by a fixed
# character list. Remove the marked section to keep the learned vocabulary.

# -----------------------Skip part start--------------------------
# Fixed vocabulary: the characters of `alphabet` are numbered 1..len(alphabet)
# in order. Note that the alphabet contains no space character.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {symbol: position for position, symbol in enumerate(alphabet, start=1)}

# Install the fixed vocabulary on the tokenizer ...
tk.word_index = dict(char_dict)
# ... and give 'UNK' the next free index after the last character.
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
# -----------------------Skip part end----------------------------

# Map every text to its list of character indices.
# NOTE: `test_texts` is rebound from strings to index sequences here, so the
# lower-casing cell must be re-run before this cell is executed again.
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

In [9]:

# Padding: bring every index sequence to exactly 150 entries, zero-padded at
# the end. NOTE(review): `truncating` is left at the Keras default ('pre'),
# so texts longer than 150 characters keep their LAST 150 characters.
train_data = pad_sequences(train_sequences, maxlen=150, padding='post')
test_data = pad_sequences(test_texts, maxlen=150, padding='post')

# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

# =======================Get classes================
# Integer class codes are defined by the categories of the training labels.
train_df[task] = pd.Categorical(train_df[task])
train_df['target_class'] = train_df[task].cat.codes

# The test labels reuse exactly those categories, so a given code means the
# same label in both splits even when a label is missing from one of them.
# (Categorising each split on its own would silently shift the codes.)
test_df[task] = pd.Categorical(test_df[task],
                               categories=train_df[task].cat.categories)
test_df['target_class'] = test_df[task].cat.codes
if (test_df['target_class'] < 0).any():
    # A code of -1 marks a test label that is missing or never seen in training.
    raise ValueError('test split contains labels absent from the training '
                     'split for task %r' % task)

# One-hot targets with a fixed width, so both arrays always have one column
# per training category.
label_count = len(train_df[task].cat.categories)
train_classes = to_categorical(train_df['target_class'], num_classes=label_count)
test_classes = to_categorical(test_df['target_class'], num_classes=label_count)

In [10]:
# Number of test rows per integer-encoded label.
test_df['target_class'].value_counts()

1    993
0     73
Name: target_class, dtype: int64

In [11]:
# ---- Model hyper-parameters ----
input_size = 150                  # number of character indices per input sequence
vocab_size = len(tk.word_index)   # entries in the tokenizer vocabulary
embedding_size = 100              # width of each character embedding vector

# One row per convolution block: [filters, kernel size, pool size].
# A pool size of -1 means the block has no pooling layer.
conv_layers = [
    [256, 7, 3],
    [256, 7, 3],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, 3],
]

# Units of each dense layer placed between the conv stack and the output.
fully_connected_layers = [1024, 1024]
# Number of distinct label codes present in the training split.
num_of_classes = train_df['target_class'].nunique()
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'

In [12]:


# One-hot embedding table: row 0 is all zeros and the row appended for
# vocabulary index k has a single 1 in column k - 1.
# Resulting shape: (vocab_size + 1, vocab_size).
embedding_weights = [np.zeros(vocab_size)]
for char_index in tk.word_index.values():
    row = np.zeros(vocab_size)
    row[char_index - 1] = 1
    embedding_weights.append(row)
embedding_weights = np.array(embedding_weights)
print('Load')

# Embedding layer.
# NOTE: the one-hot table built above is NOT handed to the layer (the
# `weights` argument is commented out), so the layer starts from its default
# initialisation with embedding_size-dimensional vectors.
embedding_layer = Embedding(
    vocab_size + 1,
    embedding_size,
    input_length=input_size,
    # weights=[embedding_weights]
)

# ---- Model construction ----
# Input: one row of character indices per text, shape (None, input_size).
inputs = Input(shape=(input_size,), name='input', dtype='int64')
# Embedding: (None, input_size, embedding_size).
x = embedding_layer(inputs)
# Convolution stack: Conv1D -> ReLU, followed by max-pooling unless the
# block's pool size is -1.
for n_filters, kernel_width, pool_width in conv_layers:
    x = Conv1D(n_filters, kernel_width)(x)
    x = Activation('relu')(x)
    if pool_width == -1:
        continue
    x = MaxPooling1D(pool_size=pool_width)(x)
# With input_size=150 and the conv_layers above, the stack ends at
# (None, 2, 256), which flattens to (None, 512).
x = Flatten()(x)
# Fully connected layers, each followed by dropout.
for units in fully_connected_layers:
    x = Dense(units, activation='relu')(x)
    x = Dropout(dropout_p)(x)
# Output layer: one softmax unit per class.
predictions = Dense(num_of_classes, activation='softmax')(x)
# Assemble and compile the model, then print its layer summary.
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

# Shuffle


Load





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 100)          7000      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 144, 256)          179456    
_________________________________________________________________
activation_1 (Activation)    (None, 144, 256)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 48, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 42, 256)           459008    
_______________

In [13]:
# Shuffle the training rows once, with a fixed seed so the row order is the
# same after every kernel restart. A private RandomState is used so NumPy's
# global random state is left untouched.
# NOTE: this pins only the row order; nothing in this cell seeds Keras, so
# weight initialisation is still unseeded.
shuffle_rng = np.random.RandomState(42)
indices = shuffle_rng.permutation(train_data.shape[0])

# Apply the same permutation to inputs and one-hot targets so they stay aligned.
x_train = train_data[indices]
y_train = train_classes[indices]

# The test split is used as-is (no shuffling).
x_test = test_data
y_test = test_classes

In [14]:
# Training setup
epochs = 30
model_file = '{}_trans_{}.h5'.format(lang, task)
max_f1 = 0  # best weighted F1 so far; 0 when no checkpoint exists yet

# Resume: when a checkpoint from an earlier run is on disk, continue from it
# and use its weighted F1 on the test split as the score to beat.
if os.path.exists(model_file):
    model = load_model(model_file)
    y_pred = model.predict(x_test, batch_size=64, verbose=1)
    y_pred_bool = y_pred.argmax(axis=1)
    print(classification_report(test_df['target_class'], y_pred_bool))
    max_f1 = f1_score(test_df['target_class'], y_pred_bool, average='weighted')

In [15]:
# Train one epoch at a time. After every epoch the model is evaluated on the
# test split and saved to `model_file` whenever its weighted F1 beats the
# best score seen so far (`max_f1`).
for i in range(epochs):
    print(i)
    model.fit(x_train, y_train,
              validation_data=(x_test, y_test),
              batch_size=128,
              epochs=1,
              verbose=1)
    y_pred = model.predict(x_test, batch_size=64, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    print(classification_report(test_df['target_class'], y_pred_bool))
    # Weighted F1 via f1_score: the same call that sets the initial max_f1
    # when a checkpoint is loaded, and it avoids building a second full
    # classification report just to read one number out of it.
    f1s = f1_score(test_df['target_class'], y_pred_bool, average='weighted')
    if f1s > max_f1:
        print('saved at epoch ', i +1, ' with f1 ', f1s)
        model.save(model_file)
        max_f1 = f1s

0
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 4263 samples, validate on 1066 samples
Epoch 1/1





              precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.93      1.00      0.96       993

   micro avg       0.93      0.93      0.93      1066
   macro avg       0.47      0.50      0.48      1066
weighted avg       0.87      0.93      0.90      1066

saved at epoch  1  with f1  0.8984935035587139


  'precision', 'predicted', average, warn_for)


1
Train on 4263 samples, validate on 1066 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.93      1.00      0.96       993

   micro avg       0.93      0.93      0.93      1066
   macro avg       0.47      0.50      0.48      1066
weighted avg       0.87      0.93      0.90      1066

2
Train on 4263 samples, validate on 1066 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.93      1.00      0.96       993

   micro avg       0.93      0.93      0.93      1066
   macro avg       0.47      0.50      0.48      1066
weighted avg       0.87      0.93      0.90      1066

3
Train on 4263 samples, validate on 1066 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.93      1.00      0.96       993

  

              precision    recall  f1-score   support

           0       0.36      0.18      0.24        73
           1       0.94      0.98      0.96       993

   micro avg       0.92      0.92      0.92      1066
   macro avg       0.65      0.58      0.60      1066
weighted avg       0.90      0.92      0.91      1066

16
Train on 4263 samples, validate on 1066 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.38      0.38      0.38        73
           1       0.95      0.95      0.95       993

   micro avg       0.92      0.92      0.92      1066
   macro avg       0.67      0.67      0.67      1066
weighted avg       0.92      0.92      0.92      1066

saved at epoch  17  with f1  0.9155722326454033
17
Train on 4263 samples, validate on 1066 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.44      0.10      0.16        73
           1       0.94      0.99      0.96       993

   micro avg

              precision    recall  f1-score   support

           0       0.88      0.10      0.17        73
           1       0.94      1.00      0.97       993

   micro avg       0.94      0.94      0.94      1066
   macro avg       0.91      0.55      0.57      1066
weighted avg       0.93      0.94      0.91      1066



In [16]:
# Reload the checkpoint stored in `model_file` and print its per-class
# metrics on the test split.
model = load_model(model_file)
y_pred = model.predict(x_test, batch_size=64, verbose=1)
y_pred_bool = y_pred.argmax(axis=1)
print(classification_report(test_df['target_class'], y_pred_bool))

              precision    recall  f1-score   support

           0       0.57      0.27      0.37        73
           1       0.95      0.98      0.97       993

   micro avg       0.94      0.94      0.94      1066
   macro avg       0.76      0.63      0.67      1066
weighted avg       0.92      0.94      0.93      1066



In [17]:
# Best scores recorded per language (presumably dev-set weighted F1, in %):
#   hin: task A 66, task B 88
#   ben: task A 69, task B 86
#   eng: task A 75, task B 93