# Link to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/Colab Notebooks'

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks


# Data pre-processing

In [None]:
import re
import numpy as np

### Translation blocks are sparated by empty lines:

Block 1\n

\n

Block 2\n

\n

...

Block N\n
\n

In [None]:
# read the text file - 
train_file_path = "train.txt"
test_file_path = "test.txt"
blocks_train = open(train_file_path, "r").read().split("\n\n")
blocks_test = open(test_file_path, "r").read().split("\n\n")

# remove the trailing empty line at the end of each file
blocks_train[-1] = blocks_train[-1][:-1]
blocks_test[-1] = blocks_test[-1][:-1]

# check the last translation pair to make sure that there is no misreading
print(blocks_train[-1])
print('There are', len(blocks_train), 'training translation pairs.\n')
print(blocks_test[-1])
print('There are', len(blocks_test), 'testing translation pairs.')


发言人 表示 , 美国国会 众议院 国际 关系 委员会 主席 海德 先生 2 日 在 港 发表 的 演讲 , 对 中国 的 发展 与 进步 进行 了 恶意 攻击 , 并 以 冷战 思维 污蔑 中国 外交 政策 , 我们 表示 强烈 不满 .
the spokesman said the speech in hong kong on the 2 nd by mr. hyde , chairman of the us house of representatives ' international relations committee , was a malicious attack on china's development and progress as well as a slander on china's foreign policy evidencing a cold war mentality , with which we are strongly unsatisfied .
the spokesman claimed that china had expressed its strong displeasure with the speech delivered by mr. hyde , chairman of the u.s. house international relations committee , in hong kong on december 2 , which contained malicious attacks on china's development and progress , and tarnished china's foreign policy with his cold war mentality .
0.6332
H
There are 584 training translation pairs.

域 名 是 开展 电子 商务 , 电子 政务 等 一切 互 联 网 应用 的 通信 基础 , 目前 被 广泛 使用 作为 互 联 网 地址 .
domain names are the basis of communications on which all internet sites use for e

### Each translation block has 5 lines:
*   line 1: source (Chinese)
*   line 2: translation1 (English)
*   line 3: translation2 (English)
*   line 4: probablility? (float)
*   line 5: label (human/machine)

with one exception: the second English translation of the 83th block:

`although the arms embargo to the chinese 丨 shadow on the eu summit , but still in the two sides signed several trade and other agreements .`

has one Chinese character '丨' (U+4E28). I had to manually exclude it.

Also, I encode label 'H' as 0 and 'M' as 1.



In [None]:
# detects if texts have Chinese characters
def isChinese(texts):
  # exclude the '丨' character
  if re.search("[\u4e00-\u4e27\u4e29-\u9FFF]", texts):
      return True
  return False

# parse translation block strings
def parseBlocks(blocks):
  chinese = []
  reference = []
  english_translation = []
  probablilities = []
  labels_translation = []

  i = 0
  while i < len(blocks):
    lines = blocks[i].split('\n')
    # the first setence is Chinese
    chinese.append(lines[0])
    # the first Englisth translation is done human
    reference.append(lines[1])
    # the second English translation can be machine translation
    english_translation.append(lines[2])
    lines[4] = 0 if lines[4] == 'H' else 1
    labels_translation.append(lines[4])
    # append the quality score
    probablilities.append(float(lines[3])) 
    i += 1
  return chinese, english_translation, labels_translation, reference, probablilities

In [None]:
chinese_train,\
english_translation_train,\
labels_translation_train,\
reference_train,\
probablilities_train = parseBlocks(blocks_train)
labels_translation_train = np.array(labels_translation_train)
labels_reference_train = np.array([0] * len(reference_train))

# sanity check
print('******** Length check for the training info:')
print(len(chinese_train))
print(len(english_translation_train))
print(len(labels_translation_train))
print(len(reference_train))
print(len(probablilities_train), '\n')


print('******** Content check for the training info:')
print(blocks_train[99])
print(chinese_train[99])
print(english_translation_train[99])
print(labels_translation_train[99])
print(reference_train[99])
print(probablilities_train[99])

******** Length check for the training info:
584
584
584
584
584 

******** Content check for the training info:
奈 斯 说 , 计 画 怀孕 的 女性 仍 应 服用 叶酸 补品 .
ness says women planning to become pregnant should still take folic acid supplements .
ness said that women planning to become pregnant should continue to take folic acid supplements .
0.7500
H
奈 斯 说 , 计 画 怀孕 的 女性 仍 应 服用 叶酸 补品 .
ness said that women planning to become pregnant should continue to take folic acid supplements .
0
ness says women planning to become pregnant should still take folic acid supplements .
0.75


In [None]:
chinese_test,\
english_translation_test,\
labels_translation_test,\
reference_test,\
probablilities_test = parseBlocks(blocks_test)
labels_translation_test = np.array(labels_translation_test)
labels_reference_test = np.array([0] * len(reference_test))

# sanity check
print('******** Length check for the training info:')
print(len(chinese_test))
print(len(english_translation_test))
print(len(labels_translation_test))
print(len(reference_test))
print(len(probablilities_test), '\n')


print('******** Content check for the training info:')
print(blocks_test[99])
print(chinese_test[99])
print(english_translation_test[99])
print(labels_translation_test[99])
print(reference_test[99])
print(probablilities_test[99])

******** Length check for the training info:
174
174
174
174
174 

******** Content check for the training info:
国际 计划 1995 年 进入 中国 , 迄今 已 资助 儿童 21300 多 名 .
plan international entered china in 1995 and has helped over 213000 children so far .
plan international entered china in 1995 . to date , it has subsidized over 21300 children .
0.5882
H
国际 计划 1995 年 进入 中国 , 迄今 已 资助 儿童 21300 多 名 .
plan international entered china in 1995 . to date , it has subsidized over 21300 children .
0
plan international entered china in 1995 and has helped over 213000 children so far .
0.5882


## Word embedding

### I use SentenceBERT to transform sentences into vectors

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/6a/e2/84d6acfcee2d83164149778a33b6bdd1a74e1bcb59b2b2cd1b861359b339/sentence-transformers-0.4.1.2.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 3.2MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 5.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 21.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     

In [None]:
from sentence_transformers import SentenceTransformer
import pickle

In [None]:
# choose model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:46<00:00, 8.75MB/s]


In [None]:
# # vectorize sentences
# english_translation_embeddings = sbert_model.encode(english_translation_train + english_translation_test)
# english_translation_embeddings_train = english_translation_embeddings[:len(english_translation_train)] # 
# english_translation_embeddings_test = english_translation_embeddings[len(english_translation_train):] # 

# # save the embeddings
path_translation_embeddings_train = 'translation_embeddings_train_bert.p'
path_translation_embeddings_test = 'translation_embeddings_test_bert.p'
# pickle.dump(english_translation_embeddings_train, open(path_translation_embeddings_train, "wb"))
# pickle.dump(english_translation_embeddings_test, open(path_translation_embeddings_test, "wb"))

# load the pickled embeddings
english_translation_embeddings_train = pickle.load(open(path_translation_embeddings_train, "rb"))
english_translation_embeddings_test = pickle.load(open(path_translation_embeddings_test, "rb"))

# sanity check
print(english_translation_embeddings_train.shape)
print(english_translation_embeddings_test.shape)

(584, 768)
(174, 768)


## Discriminator

In [None]:
import random
import keras
import sklearn
from keras.layers import Input, Dense, Flatten, Dropout, Reshape, LSTM, Bidirectional
from keras.layers import Embedding, Concatenate
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU,ReLU
from keras.optimizers import RMSprop, Adam
import keras.backend as K
from keras.models import Model, Sequential

Use a neuro-network.

In [None]:
sentence_input = Input(shape=(768,1), name='sentence_input')

droprate = 0.3
dimension = 128

x = Dense(dimension)(sentence_input)
x = Dropout(droprate)(x)
# x = LeakyReLU(alpha=0.01)(x)
x = ReLU()(x)

x = Bidirectional(LSTM(dimension, return_sequences=True, activation='tanh'))(x) 
x = Bidirectional(LSTM(dimension, return_sequences=False, activation='tanh'))(x)

x = Dropout(droprate)(x)
x = Dense(dimension)(x)
x = ReLU()(x)

output = Dense(1, activation='softmax')(x)
discriminator = Model(sentence_input ,output)
discriminator.summary()

optimizer = Adam(lr=0.001)
discriminator.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sentence_input (InputLayer)  [(None, 768, 1)]          0         
_________________________________________________________________
dense (Dense)                (None, 768, 128)          256       
_________________________________________________________________
dropout (Dropout)            (None, 768, 128)          0         
_________________________________________________________________
re_lu (ReLU)                 (None, 768, 128)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 768, 256)          263168    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0     

Train a model.

In [None]:
def train(batch_size, iterations, training_data, training_label, testing_data, testing_label, model):
  batch_number = len(training_data) // batch_size
  avg_f1_score = 0
  for epoch in range(iterations):
    for i in range(batch_number):
      indices = np.random.choice(training_data.shape[0], batch_size, replace=False)
      batch_data = training_data[indices]
      batch_label = training_label[indices]
      # print(data.shape)
      # print(label_batch.shape)
      loss = model.train_on_batch(batch_data,batch_label)
      if i % 5 == 0:
        print('epoch', epoch, 'batch', i)
        print('loss & accuracy', loss)
    # print out progress every 5 epochs
    if epoch % 5 == 0:
      print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
      print('epoch', epoch)
      indices = np.random.choice(training_data.shape[0], batch_size, replace=False)
      batch_data = training_data[indices]
      batch_label = training_label[indices]
      predoctions = model.predict_on_batch(batch_data)
      # calculate F1 score
      f1 = sklearn.metrics.f1_score(batch_label, predoctions)
      print('f1 score', f1)
      avg_f1_score = (avg_f1_score * epoch / 5 + f1) / (epoch / 5 + 1)
      print('average f1 score', avg_f1_score)

In [16]:
train(32, 
   100, 
   english_translation_embeddings_train, 
   labels_translation_train, 
   english_translation_embeddings_test, 
   labels_translation_test, 
   discriminator)


epoch 0 batch 0
loss & accuracy [0.6927883625030518, 0.53125]
epoch 0 batch 5
loss & accuracy [0.6972246766090393, 0.5]
epoch 0 batch 10
loss & accuracy [0.689956784248352, 0.4375]
epoch 0 batch 15
loss & accuracy [0.6913466453552246, 0.46875]
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
epoch 0
f1 score 0.6666666666666666
average f1 score 0.6666666666666666
epoch 1 batch 0
loss & accuracy [0.6990156769752502, 0.40625]
epoch 1 batch 5
loss & accuracy [0.6980094313621521, 0.5]
epoch 1 batch 10
loss & accuracy [0.6923913955688477, 0.46875]
epoch 1 batch 15
loss & accuracy [0.68578040599823, 0.34375]
epoch 2 batch 0
loss & accuracy [0.7101274728775024, 0.5625]
epoch 2 batch 5
loss & accuracy [0.6889208555221558, 0.46875]
epoch 2 batch 10
loss & accuracy [0.6946236491203308, 0.5]
epoch 2 batch 15
loss & accuracy [0.7349269390106201, 0.5]
epoch 3 batch 0
loss & accuracy [0.7498829364776611, 0.53125]
epoch 3 batch 5
loss & accuracy [0.6870465278625488, 0.46875]
epoch 3 batch

TypeError: ignored

In [20]:
discriminator.save('keras_model.h5')
discriminator = keras.models.load_model('keras_model.h5')

Evaluation

In [18]:
# predict on testing set and calculate the f1 score
predictions_test = discriminator.predict_on_batch(english_translation_embeddings_test)
print('The f1 score on the testing set is:')
print(sklearn.metrics.f1_score(labels_translation_test, predictions_test))

The f1 score on the testing set is:
0.640625
