In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import os
import opennmt
import string
import yaml
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output
%matplotlib inline

### Read Files

In [2]:
%%time
def read_text(filepath):
    return pd.read_csv(filepath, header=None, names=["content"], sep="\n")

def read_align(filepath):
    return pd.read_csv(filepath, header=None, names=["align"])

es_data = read_text("../../../cliang/train/pipeline_test_v3/tokenized_data/NFPA_CS_Train_BPE_applied.es")
en_data = read_text("../../../cliang/train/pipeline_test_v3/tokenized_data/NFPA_CS_Train_BPE_applied.en")
align_data = read_align("../../../cliang/train/pipeline_test_v3/align_data/NFPA_CS_Train_Align_Corpus.gdfa")
print("English File Sentences: ", es_data.shape[0])
print("Spanish File Sentences: ", en_data.shape[0])
print("Align File Rows: ", align_data.shape[0])

English File Sentences:  93859
Spanish File Sentences:  93859
Align File Rows:  93859
CPU times: user 684 ms, sys: 48 ms, total: 732 ms
Wall time: 1.01 s


### Tokenization

In [3]:
%%time
def onmt_tokenize(sentence, config = {"mode": "space"}):
    tokenizer = opennmt.tokenizers.opennmt_tokenizer.create_tokenizer(config)
    tokenized_sentence = tokenizer.tokenize(sentence)
    return tokenized_sentence[0]

align_data["align_idx"] = align_data["align"].str.split(" ").apply(lambda x: [tuple(term.split("-")) for term in x])
merge_data = pd.concat([en_data.rename(columns={"content":"en_content"}), es_data.rename(columns={"content":"es_content"}), align_data["align_idx"]], axis = 1)
merge_data["en_tokens"] = merge_data["en_content"].apply(lambda x: onmt_tokenize(x))
merge_data["es_tokens"] = merge_data["es_content"].apply(lambda x: onmt_tokenize(x))
merge_data.head()

CPU times: user 17.9 s, sys: 460 ms, total: 18.4 s
Wall time: 18.4 s


### Set up Merged dataframe

In [4]:
%%time
merge_data_align = merge_data.reset_index().rename(columns = {"index": "sentence_id"}).loc[:,["align_idx", "en_tokens", "es_tokens", "sentence_id"]]
merge_data_align["align_list"] = merge_data_align.apply(lambda x: [(x["en_tokens"][int(term[0])], x["es_tokens"][int(term[1])]) for term in x["align_idx"]], axis = 1)
merge_data_align["align_len"] = merge_data_align["align_idx"].apply(lambda x: len(x))

CPU times: user 59.9 s, sys: 180 ms, total: 1min
Wall time: 1min


In [5]:
merge_data_align.head()

Unnamed: 0,align_idx,en_tokens,es_tokens,sentence_id,align_list,align_len
0,"[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5...","[nfpa, and, national, fire, protection, associ...","[nfpa, y, national, fire, protection, associat...",0,"[(nfpa, nfpa), (and, y), (national, national),...",19
1,"[(0, 0), (1, 1), (2, 2), (3, 3), (3, 4), (4, 4...","[copyright, ©, 2011, national, fire, protectio...","[copyright, ©, 2011, national, fire, protectio...",1,"[(copyright, copyright), (©, ©), (2011, 2011),...",14
2,"[(0, 1), (1, 0), (1, 2), (2, 3), (3, 4), (4, 5...","[this, edition, of, nfpa, 1,, fire, code,, was...","[la, presente, edición, de, nfpa, 1,, código, ...",2,"[(this, presente), (edition, la), (edition, ed...",47
3,"[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5...","[this, edition, of, nfpa, 1, was, approved, as...","[esta, edición, de, nfpa, 1, se, aprobó, como,...",3,"[(this, esta), (edition, edición), (of, de), (...",21
4,"[(0, 9), (1, 10), (2, 11), (3, 12), (4, 13), (...","[a, tentative, interim, amendment, (ti@@, a), ...","[el, 11, de, agosto, de, 2011, se, emiti@@, ó,...",4,"[(a, una), (tentative, enmienda), (interim, in...",64


In [6]:
# merge_data_align.to_csv("/home/ubuntu/cliang/nfpa_eda/experiments/sample_data/align_train.csv")

#### Building index for each align items

In [7]:
merge_data_align["sentence_id_idx"] = merge_data_align.apply(lambda x: [x["sentence_id"]] * x["align_len"], axis = 1)
merge_data_align.head()

Unnamed: 0,align_idx,en_tokens,es_tokens,sentence_id,align_list,align_len,sentence_id_idx
0,"[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5...","[nfpa, and, national, fire, protection, associ...","[nfpa, y, national, fire, protection, associat...",0,"[(nfpa, nfpa), (and, y), (national, national),...",19,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[(0, 0), (1, 1), (2, 2), (3, 3), (3, 4), (4, 4...","[copyright, ©, 2011, national, fire, protectio...","[copyright, ©, 2011, national, fire, protectio...",1,"[(copyright, copyright), (©, ©), (2011, 2011),...",14,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,"[(0, 1), (1, 0), (1, 2), (2, 3), (3, 4), (4, 5...","[this, edition, of, nfpa, 1,, fire, code,, was...","[la, presente, edición, de, nfpa, 1,, código, ...",2,"[(this, presente), (edition, la), (edition, ed...",47,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
3,"[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5...","[this, edition, of, nfpa, 1, was, approved, as...","[esta, edición, de, nfpa, 1, se, aprobó, como,...",3,"[(this, esta), (edition, edición), (of, de), (...",21,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,"[(0, 9), (1, 10), (2, 11), (3, 12), (4, 13), (...","[a, tentative, interim, amendment, (ti@@, a), ...","[el, 11, de, agosto, de, 2011, se, emiti@@, ó,...",4,"[(a, una), (tentative, enmienda), (interim, in...",64,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."


#### Extract every align items to re-create the align dataframe in suitable format

In [8]:
align_list = merge_data_align["align_list"].values.tolist()

In [9]:
align_df = pd.DataFrame(columns = ["en_token", "es_token"])
for sentence_id in range(len(align_list)):
    df = pd.DataFrame.from_records(align_list[sentence_id], columns = ["en_token", "es_token"])
    df["sentence_id"] = int(sentence_id)
    align_df = pd.concat([align_df, df], axis = 0)
    print("finished sentence {0} / {1} [{2:.3f}%]".format(sentence_id, align_data.shape[0], sentence_id / align_data.shape[0] * 100))
    clear_output(wait=True)

finished sentence 93858 / 93859 [99.999%]


#### Export the Alignment Dataframe

In [10]:
align_df.to_csv("/home/ubuntu/cliang/nfpa_eda/sample_data/align_df.csv")

In [16]:
align_df.reset_index().rename(columns = {"index": "in_sentence_id"}).head()

Unnamed: 0,in_sentence_id,en_token,es_token,sentence_id
0,0,nfpa,nfpa,0.0
1,1,and,y,0.0
2,2,national,national,0.0
3,3,fire,fire,0.0
4,4,protection,protection,0.0
