In [2]:
# Mount Google Drive inside the Colab runtime and switch into the project folder.
from google.colab import drive
drive.mount('/content/drive/')

# Absolute path of the project folder on the mounted drive.
dir_path = '/content/drive/MyDrive/Project/'

# IPython magic: make the project folder the working directory, so the relative
# './raw' and './data' paths used further down resolve inside it.
%cd /content/drive/MyDrive/Project/

import codecs
import copy
import json
import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm
# from utils.global_variables import Global

Mounted at /content/drive/
/content/drive/MyDrive/Project


In [None]:
class LevenReader(object):
    """Turn the raw ``{mode}.jsonl`` splits into id-encoded, pickled datasets.

    Two output formats are supported, selected by the ``crf`` argument of
    :meth:`read` / :meth:`preprocess`:

    * ``crf=0`` -> directory ``Data``: one sample per flagged token, with a
      single label per sample.
    * ``crf=1`` -> directory ``Data_Crf``: one sample per sentence, with one
      ``B-``/``I-``/``O`` label per token.

    All paths are relative to the current working directory.
    """

    def __init__(self):
        self.data = []
        self.raw_dir = "./raw"
        self.data_dir = "./data"
        # Output sub-directory of ``data_dir``, indexed by the ``crf`` argument.
        self.flag_dir = ['Data', 'Data_Crf']
        self.word2vec_source_file = 'sgns.baidubaike.bigram-char'
        self.word2vec_file = "word2vec.npy"
        self.modes = ["train", "valid"]
        self.embedding_dict = self.load_embedding_dict(os.path.join(self.raw_dir, self.word2vec_source_file))
        # Derive the sizes from the embeddings right away. preprocess() sets the
        # very same values, but it is skipped when the cached 'flag' file exists,
        # which previously left both at 0 in the runtime config written by read().
        self.vocab_size = len(self.embedding_dict)
        self.embedding_size = int(self.embedding_dict['<PAD>'].shape[0])

    def read(self, crf=0):
        """Encode every split in ``self.modes`` and pickle the results.

        Runs :meth:`preprocess` first if the output directory has no ``flag``
        file yet. For each mode the processed JSON is loaded, tokens and
        labels are mapped to ids, and the samples are written to
        ``{mode}_data.pkl``. For the ``train`` mode the vocabularies
        (``Global_dict_crf_{crf}.pkl``) and the runtime configuration
        (``config_runtime_dict_crf_{crf}.pkl``) are written as well.

        :param crf: 0 for per-token samples (``Data``), 1 for per-sentence
            sequence-labelling samples (``Data_Crf``).
        :return: None. ``self.data`` holds the samples of the last mode
            processed; per-sample layout is
            ``{"tokens": list(int), "labels": int | list(int), ...}``.
        """
        out_dir = os.path.join(self.data_dir, self.flag_dir[crf])
        if not os.path.exists(os.path.join(out_dir, 'flag')):
            # exist_ok: the directory may be left over from an interrupted run
            # that never got as far as writing the flag file.
            os.makedirs(out_dir, exist_ok=True)
            self.preprocess(crf)

        for mode in self.modes:
            self.data.clear()
            has_labels = mode != "test"

            with open(os.path.join(out_dir, "{}_processed.json".format(mode)), "r", encoding="utf-8") as f:
                data = json.load(f)

            if mode == 'train' and crf in (0, 1):
                global_dict = {key: data[key] for key in ("word2id", "id2word", "label2id", "id2label")}
                if crf == 1:
                    global_dict["type2id"] = data["type2id"]
                with open(os.path.join(out_dir, f'Global_dict_crf_{crf}.pkl'), 'wb') as file:
                    pickle.dump(global_dict, file)

            word2id = data["word2id"]
            for item in data["info"]:
                # Out-of-vocabulary tokens fall back to the <UNK> id.
                tokens = [word2id[x] if x in word2id else word2id["<UNK>"] for x in item["tokens"]]
                labels = [data["label2id"][x] for x in item["labels"]] if has_labels else None
                canids = item["canids"]
                docids = item["docids"]

                if crf == 0:
                    # One sample per flagged token; all samples of a sentence
                    # share the same token-id list.
                    for i in range(len(canids)):
                        if not item["flags"][i]:
                            continue
                        sample = {"tokens": tokens}
                        if has_labels:
                            sample["labels"] = labels[i]
                        sample["canids"] = canids[i]
                        sample["docids"] = docids
                        sample["index"] = i
                        self.data.append(sample)
                else:
                    # One sample per sentence with token-aligned lists.
                    sample = {"tokens": tokens}
                    if has_labels:
                        sample["labels"] = labels
                    sample["canids"] = canids
                    sample["docids"] = docids
                    sample["flags"] = item["flags"]
                    self.data.append(sample)

            if mode == 'train':
                # Values are stored as strings, as before.
                config_runtime_dict = {
                    "vocab_size": str(self.vocab_size),
                    "embedding_size": str(self.embedding_size),
                    "num_class": str(len(data["label2id"])),
                    "sequence_length": str(data["sequence_length"]),
                }
                with open(os.path.join(out_dir, f'config_runtime_dict_crf_{crf}.pkl'), 'wb') as file:
                    pickle.dump(config_runtime_dict, file)

            print("Mode: {} | Dataset Size = {}".format(mode, len(self.data)))
            with open(os.path.join(out_dir, f'{mode}_data.pkl'), 'wb') as file:
                # pickle serialises a snapshot, so no deep copy is needed first.
                pickle.dump(self.data, file)

    @staticmethod
    def _mark_spans(spans, doc_canids, doc_flags):
        """Write each span's id onto the tokens it covers and flag those tokens."""
        for span in spans:
            sent_id = span["sent_id"]
            for i in range(span["offset"][0], span["offset"][1]):
                doc_canids[sent_id][i] = span["id"]
                doc_flags[sent_id][i] = 1

    def preprocess(self, crf=0):
        """Build the intermediate JSON files and the word-vector matrix.

        For every mode one ``{mode}_processed.json`` is written with the layout::

            {
                "info": [{"tokens": list(str), "labels": list(str),
                          "canids": list(str), "flags": list(int),
                          "docids": ...}, ...],
                "word2id": {...}, "id2word": {...},
                "label2id": {...}, "id2label": {...},
                "sequence_length": int,
                "type2id": {...}            # only when crf == 1
            }

        Word ids follow the iteration order of ``self.embedding_dict``. Label
        id 0 is ``"None"`` (``crf=0``) or ``"O"`` (``crf=1``). The embedding
        matrix is saved to ``word2vec.npy`` unless that file already exists,
        and an empty ``flag`` file marks the output directory as complete.

        :param crf: 1 for ``B-``/``I-``/``O`` sequence labels, any other value
            for plain per-token event types.
        :return: None.
        """
        use_crf = crf == 1
        out_dir = os.path.join(self.data_dir, self.flag_dir[crf])
        # Allow preprocess() to be called directly, without read() creating the directory.
        os.makedirs(out_dir, exist_ok=True)

        processed_data = {"info_train": [],
                          "info_valid": [],
                          "info_test": [],
                          "word2id": {},
                          "id2word": {},
                          "label2id": {},
                          "id2label": {},
                          "sequence_length": 0}
        label2id = processed_data["label2id"]
        id2label = processed_data["id2label"]

        none_label = "O" if use_crf else "None"
        label2id[none_label] = 0
        id2label[0] = none_label
        if use_crf:
            processed_data["type2id"] = {"O": 0}

        for mode in self.modes:
            is_test = mode == "test"
            info = processed_data["info_{}".format(mode)]
            with codecs.open(os.path.join(self.raw_dir, "{}.jsonl".format(mode)), 'r', encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()

            for line in lines:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                doc = json.loads(line)
                docids = doc["id"]
                doc_tokens = [item["tokens"] for item in doc["content"]]

                # Token-aligned containers, one inner list per sentence.
                # Labels are only built for splits that have annotations.
                doc_labels = [] if is_test else [[none_label] * len(tokens) for tokens in doc_tokens]
                doc_canids = [[""] * len(tokens) for tokens in doc_tokens]
                doc_flags = [[0] * len(tokens) for tokens in doc_tokens]

                if not use_crf:
                    for tokens in doc_tokens:
                        processed_data["sequence_length"] = max(processed_data["sequence_length"], len(tokens))

                if is_test:
                    self._mark_spans(doc["candidates"], doc_canids, doc_flags)
                else:
                    for event in doc["events"]:
                        if use_crf:
                            tp = event["type"].replace("-", "_")
                            if tp not in processed_data["type2id"]:
                                processed_data["type2id"][tp] = event["type_id"]
                        elif event["type"] not in label2id:
                            label2id[event["type"]] = event["type_id"]
                            id2label[event["type_id"]] = event["type"]

                        for mention in event["mention"]:
                            sent_id = mention["sent_id"]
                            start, end = mention["offset"][0], mention["offset"][1]
                            for i in range(start, end):
                                if use_crf:
                                    # First token of a mention gets B-, the rest I-.
                                    doc_labels[sent_id][i] = ("B-" + tp) if i == start else ("I-" + tp)
                                else:
                                    doc_labels[sent_id][i] = event["type"]
                                doc_canids[sent_id][i] = mention["id"]
                                doc_flags[sent_id][i] = 1

                    # Negative triggers are flagged but keep the default label.
                    self._mark_spans(doc["negative_triggers"], doc_canids, doc_flags)

                if is_test:
                    for tokens, canids, flags in zip(doc_tokens, doc_canids, doc_flags):
                        info.append({"tokens": tokens,
                                     "canids": canids,
                                     "flags": flags,
                                     "docids": docids})
                else:
                    for tokens, labels, canids, flags in zip(doc_tokens, doc_labels, doc_canids, doc_flags):
                        info.append({"tokens": tokens,
                                     "labels": labels,
                                     "canids": canids,
                                     "flags": flags,
                                     "docids": docids})
                        if use_crf:
                            # B-/I- labels receive consecutive ids in order of first appearance.
                            for label in labels:
                                if label not in label2id:
                                    label_id = len(label2id)
                                    label2id[label] = label_id
                                    id2label[label_id] = label

        if use_crf:
            # The CRF format uses a fixed sequence length rather than the observed maximum.
            processed_data["sequence_length"] = 512

        word2vec_mat = []
        for (k, v) in tqdm(self.embedding_dict.items(), desc='reading pretrained word embeddings'):
            word_id = len(processed_data["word2id"])
            processed_data["word2id"][k] = word_id
            processed_data["id2word"][word_id] = k
            word2vec_mat.append(v)

        word2vec_mat = np.array(word2vec_mat, dtype=np.float32)
        self.vocab_size = word2vec_mat.shape[0]
        self.embedding_size = word2vec_mat.shape[1]
        # The matrix is written only once; an existing file is never overwritten.
        if not os.path.exists(os.path.join(self.data_dir, self.word2vec_file)):
            np.save(os.path.join(self.data_dir, self.word2vec_file), word2vec_mat)

        for mode in self.modes:
            with open(os.path.join(out_dir, "{}_processed.json".format(mode)), "w", encoding="utf-8") as f:
                temp_data = {"info": processed_data["info_{}".format(mode)],
                             "word2id": processed_data["word2id"],
                             "id2word": processed_data["id2word"],
                             "label2id": processed_data["label2id"],
                             "id2label": processed_data["id2label"],
                             "sequence_length": processed_data["sequence_length"]}
                if use_crf:
                    temp_data["type2id"] = processed_data["type2id"]
                json.dump(temp_data, f, indent=2, ensure_ascii=False)

        # Empty marker file: its presence tells read() that preprocessing is done.
        with open(os.path.join(out_dir, 'flag'), "w+") as f:
            f.write("")

    @staticmethod
    def load_embedding_dict(path):
        """Load space-separated text embeddings into ``{word: np.ndarray}``.

        The first line is skipped when it ends with a newline (presumably the
        ``<count> <dim>`` header of the word2vec text format -- verify for
        other embedding files). Two extra entries are appended: ``<UNK>``, the
        mean of all loaded vectors, and ``<PAD>``, a random normal vector that
        is not seeded and therefore differs between runs.

        :param path: path of the UTF-8 embedding text file.
        :return: dict mapping each word to its vector, in file order, followed
            by ``<UNK>`` and ``<PAD>``.
        """
        embedding_dict = {}
        with open(path, encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0 and '\n' in line:
                    continue

                # Strip the line ending and any trailing blank. The previous
                # line[:-2] cut off the last digit when no blank preceded the newline.
                line = line.rstrip()
                if not line:
                    continue

                split = line.split(" ")
                embedding_dict[split[0]] = np.array(list(map(float, split[1:])))

        unk = sum(list(embedding_dict.values())) / len(embedding_dict.keys())
        embedding_dict['<UNK>'] = unk
        embedding_dict['<PAD>'] = np.random.randn(unk.shape[0])
        return embedding_dict

In [None]:
# Instantiating the reader loads the pretrained embeddings from ./raw.
reader = LevenReader()

In [None]:
# crf=0: per-token samples, written under ./data/Data
reader.read(crf=0)

reading pretrained word embeddings: 100%|██████████| 635976/635976 [00:00<00:00, 971759.77it/s]


Mode: train | Dataset Size = 395322
Mode: valid | Dataset Size = 92451


In [None]:
# crf=1: per-sentence sequence-labelling samples, written under ./data/Data_Crf
reader.read(crf=1)

reading pretrained word embeddings: 100%|██████████| 635976/635976 [00:00<00:00, 966625.20it/s]


Mode: train | Dataset Size = 41238
Mode: valid | Dataset Size = 9788


In [None]:
# Count how often each label occurs in a pickled split and save the counts as CSV
def get_labels_dict(path):
    """Count label occurrences in a pickled split and save them as CSV.

    Each sample's ``'labels'`` entry may be a single label (per-token
    format) or a list/tuple of labels (sequence format). The layout is
    detected from the data itself rather than from the file name. The counts,
    sorted by label, are written next to the input file as
    ``<path without extension>_label_distribution.csv`` with one ``count``
    column indexed by label.

    :param path: path of a pickle holding a list of dicts with a ``'labels'`` key.
    :return: None.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # files this notebook produced itself.
    with open(path, 'rb') as file:
        loaded_data = pickle.load(file)

    frequency = Counter()
    for sample in loaded_data:
        labels = sample['labels']
        if isinstance(labels, (list, tuple)):
            frequency.update(labels)
        else:
            frequency[labels] += 1

    sorted_dict = dict(sorted(frequency.items()))
    df = pd.DataFrame.from_dict(sorted_dict, orient='index', columns=['count'])
    # splitext instead of path[:-4]: does not assume a 4-character extension.
    df.to_csv(os.path.splitext(path)[0] + '_label_distribution.csv')


# Pickled splits of both output formats.
data_path_train = './data/Data/train_data.pkl'
data_path_valid = './data/Data/valid_data.pkl'
datacrf_path_train = './data/Data_Crf/train_data.pkl'
datacrf_path_valid = './data/Data_Crf/valid_data.pkl'

# Write one label-distribution CSV per split, in the same order as before.
for split_path in (data_path_train, data_path_valid, datacrf_path_train, datacrf_path_valid):
    get_labels_dict(split_path)

In [None]:
# Export the token-list length of every training sample (Data_Crf format) to CSV
import pickle
import pandas as pd

with open('./data/Data_Crf/train_data.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

# The file only needs to stay open for the load itself.
sentence_lengths = [len(sample['tokens']) for sample in loaded_data]
df = pd.DataFrame(sentence_lengths, columns=["lengths"])
df.to_csv("./data/sentence_lengths.csv", index=False)