In [1]:
# Mount Google Drive inside the Colab VM (force_remount=True is passed so an
# existing mount is replaced rather than reused).
from google.colab import drive
drive.mount("/content/drive/", force_remount=True)
# Project folder on Drive. NOTE(review): the path is relative, so it only
# resolves while the working directory is /content - presumably the Colab
# default; confirm. It already ends with "/", so the f"{folder}/..." paths used
# later contain a harmless double slash.
folder = "drive/MyDrive/AIMS_HW3/"

Mounted at /content/drive/


In [2]:
!python --version
!pip install sklearn-crfsuite
!pip freeze > drive/MyDrive/AIMS_HW3/requirements.txt

Python 3.8.16
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 14.3 MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6


In [3]:
import os
import sys
import csv
import numpy as np
import pandas as pd
import sklearn_crfsuite
from tqdm import tqdm
from tabulate import tabulate
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [4]:
# Raw corpus file: every article is followed by a dashed separator line.
file_path = f"{folder}/sample_data.txt"

# "utf-8-sig" strips a leading byte-order mark (if any) while decoding, so no
# separate encode/decode round-trip over the whole text is needed.
with open(file_path, "r", encoding="utf-8-sig") as f:
    file_text = f.read()

# One string per article. The final chunk (whatever follows the last
# separator) is discarded.
# NOTE(review): this assumes the file ends with a separator; otherwise the last
# article is silently dropped - confirm against the data file.
datas = file_text.split("\n\n--------------------\n\n")[:-1]

# Convert every article into CoNLL-style lines "<character> <BIO label>",
# terminating each article with a blank line.
# The encoding is explicit because make_dataset() reads this file back as UTF-8.
with open(f"{folder}/processed_data.txt", "w", encoding="utf-8") as f:
    for article_id, data in enumerate(datas):
        data = data.split("\n")
        content = data[0]
        # data[1] is the tab-separated annotation header; the lines after it are
        # the annotations (article_id, start_pos, end_pos, entity_text, entity_type).
        # Blank lines are skipped so they cannot break the DataFrame construction
        # or the integer cast below.
        annot_columns = data[1].split("\t")
        annot_rows = [annot.split("\t") for annot in data[2:] if annot.strip()]

        df = pd.DataFrame(annot_rows, columns=annot_columns)
        position_cols = ["start_position", "end_position"]
        df[position_cols] = df[position_cols].astype("int")

        # One label per character: "O" unless the character is covered by an
        # annotation. end_position is exclusive (see the start+1:end slice).
        tmp_label_list = np.array(["O"] * len(content), dtype=object)
        for start, end, etype in zip(df["start_position"], df["end_position"], df["entity_type"]):
            tmp_label_list[start] = "B-" + str(etype)
            tmp_label_list[start+1:end] = "I-" + str(etype)

        for char, label in zip(content, tmp_label_list):
            f.write(f"{char} {label}\n")

        f.write("\n")

# Re-emit the "<character> <label>" lines as a two-column CSV. The blank lines
# that separate articles are dropped.
with open(f"{folder}/processed_data.txt", "r", encoding="utf-8") as in_file:
    # Only the line terminator is removed, and the split happens on the LAST
    # space: a character that is itself whitespace therefore keeps its own
    # column instead of being stripped away.
    stripped = (line.rstrip("\n") for line in in_file)
    lines = (line.rsplit(" ", 1) for line in stripped if line)
    # newline="" lets the csv module control the line endings itself.
    with open(f"{folder}/processed_data.csv", "w", encoding="utf-8", newline="") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(("Text", "Label"))
        writer.writerows(lines)

# Add a running sentence index to every CSV row. The index advances AFTER a row
# whose fields contain a sentence-final mark, so the mark itself still belongs
# to the sentence it closes.
sentence_id = float(0)  # kept as float so the column reads "0.0", "1.0", ... as before
with open(f"{folder}/processed_data.csv", "r", encoding="utf-8", newline="") as in_file:
    # csv.reader (instead of splitting on ",") parses quoted fields correctly,
    # e.g. when the token itself is a comma or a double quote.
    reader = csv.reader(in_file)
    next(reader, None)  # skip the ("Text", "Label") header row
    lines = []
    for line in reader:
        if not line:
            continue
        ends_sentence = ("。" in line) or ("？" in line) or ("！" in line)
        line.append(str(sentence_id))
        if ends_sentence:
            sentence_id += float(1)
        lines.append(line)

# write sentence_idx to csv (the input file is closed before it is overwritten)
with open(f"{folder}/processed_data.csv", "w", encoding="utf-8", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(("Text", "Label","sentence_idx"))
    writer.writerows(lines)

# file_path = f"{folder}/processed_data.csv"
# df = pd.read_csv(file_path, encoding = "utf8")
# df = df.reindex(columns=["sentence_idx","Text","Label"])
# data = df[["sentence_idx","Text","Label"]]
# data = df[df["sentence_idx"].notnull()]
# data.tail()

In [5]:
def CRF(x_train, y_train, x_test, y_test, **crf_params):
    """Train a CRF tagger and evaluate it on the given test sequences.

    Args:
        x_train: training sequences; each sequence is a list of per-token
            feature dicts (as produced by make_features).
        y_train: training labels, one list of label strings per sequence.
        x_test: test sequences, same layout as x_train.
        y_test: gold labels for x_test, same layout as y_train.
        **crf_params: optional overrides for the sklearn_crfsuite.CRF
            constructor arguments; the defaults below are used otherwise.

    Returns:
        A tuple (y_pred, y_pred_mar, f1score): the predicted label sequences,
        the per-token marginals from predict_marginals, and the weighted F1
        score computed over every label except "O".
    """
    # Doc: https://sklearn-crfsuite.readthedocs.io/en/latest/api.html#module-sklearn_crfsuite
    params = dict(algorithm="lbfgs", c1=0.01, c2=0.1, max_iterations=300, min_freq=3, all_possible_transitions=True, verbose=2)
    params.update(crf_params)
    crf = sklearn_crfsuite.CRF(**params)
    crf.fit(x_train, y_train)

    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # Score entity labels only. Filtering (rather than list.remove) also works
    # when "O" never occurs in the training labels.
    labels = [label for label in crf.classes_ if label != "O"]
    f1score = metrics.flat_f1_score(y_test, y_pred, average="weighted", labels=labels)

    return y_pred, y_pred_mar, f1score

In [6]:
# Load pretrained word vectors
# Get a dict of tokens (key) and their pretrained word vectors (value)
# Pre-trained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
# Pre-trained fastText word embeddings: https://fasttext.cc/docs/en/crawl-vectors.html

dim = 0          # vector dimensionality announced by the file header
word_vecs = {}   # token -> numpy vector
# Open pretrained word vector file
# with open(f"{folder}/cna.cbow.cwe_p.tar_g.512d.0.txt") as f:
# The encoding is explicit so decoding does not depend on the platform locale.
with open(f"{folder}/cc.zh.300.vec", "r", encoding="utf-8") as f:
    for line in f:
        tokens = line.strip().split()

        # tolerate blank lines instead of failing on tokens[0]
        if not tokens:
            continue

        # there are 2 integers in the first line: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue

        word = tokens[0]
        # `vec` is deliberately left in module scope: a later cell prints vec.shape
        vec = np.array([float(t) for t in tokens[1:]])
        word_vecs[word] = vec

In [7]:
# Sanity check on the loaded embeddings.
# NOTE: `vec` is the loop variable left over from the loading cell (the last
# vector parsed), so its shape stands in for the dimensionality of all vectors.
print(f"vocabulary_size: {len(word_vecs)}")
print(f"word_vector_dim: {vec.shape}")

vocabulary_size: 2000000
word_vector_dim: (300,)


In [8]:
from sklearn.model_selection import train_test_split


def make_dataset(data_path, test_size=0.33, random_state=42):
    """Read a "<token> <label>" file and split its articles into train/test.

    The file holds one token per line as "<token> <label>"; a blank line ends
    an article. Article ids are the 0-based position of the article in the file.

    Args:
        data_path: path of the processed data file (read as UTF-8).
        test_size: fraction of articles held out for testing.
        random_state: seed forwarded to train_test_split.

    Returns:
        (data_list, traindata_list, testdata_list,
         traindata_article_id_list, testdata_article_id_list), where every
        article is a list of (token, label) tuples.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        data = f.readlines()

    data_list, data_list_tmp = list(), list()
    article_id_list = list()
    for row in data:
        if row == "\n":
            article_id_list.append(len(data_list))
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            # Split on the LAST space only: labels never contain spaces, but the
            # token itself may be a space character.
            token, label = row.rstrip("\n").rsplit(" ", 1)
            data_list_tmp.append((token, label))
    if len(data_list_tmp) != 0:
        # The last article was not followed by a blank line. It still needs an
        # id, otherwise data_list and article_id_list differ in length and
        # train_test_split rejects them.
        article_id_list.append(len(data_list))
        data_list.append(data_list_tmp)

    # Here we random split data into training dataset and testing dataset
    # But you should take `development data` or `test data` as testing data
    # At that time, you could just delete this line,
    # and generate data_list of `train data` and data_list of `development/test data` by this function
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = train_test_split(
        data_list, article_id_list, test_size=test_size, random_state=random_state
    )

    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list

In [9]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def build_word_vectors(data_list, embedding_dict, unk_vector=None, unk_seed=0):
    """Replace every token by its pretrained vector.

    Args:
        data_list: list of articles; each article is a list of
            (token, label) tuples. Only the token is used.
        embedding_dict: mapping token -> numpy vector.
        unk_vector: vector used for tokens missing from embedding_dict. When
            None, a random vector with the same shape as the embeddings is
            generated from `unk_seed`.
        unk_seed: seed for the generated unknown-token vector. A fixed seed
            makes separate calls (e.g. for the train and the test split) use
            the SAME vector for unknown tokens and keeps runs reproducible.

    Returns:
        A list parallel to data_list: one list of vectors per article.

    Raises:
        ValueError: if unk_vector is None and embedding_dict is empty, so the
            vector shape cannot be inferred.
    """
    if unk_vector is None:
        try:
            # Peek at one value instead of copying all values into a list.
            sample_vector = next(iter(embedding_dict.values()))
        except StopIteration:
            raise ValueError("embedding_dict is empty; cannot infer the word vector shape") from None
        # No Match Word (unknown word) Vector in Embedding
        unk_vector = np.random.default_rng(unk_seed).random(np.shape(sample_vector))

    return [
        [embedding_dict.get(token_and_label[0], unk_vector) for token_and_label in article]
        for article in data_list
    ]

In [11]:
# Input features: pretrained word vectors of each token
# Return a list of feature dicts, each feature dict corresponding to each token
def make_features(embed_list):
    """Turn every token vector into a feature dict.

    Args:
        embed_list: list of articles, each a list of token vectors.

    Returns:
        A list with the same nesting in which every vector is replaced by a
        dict mapping "dim_1", "dim_2", ... (1-based) to the vector components.
    """
    return [
        [
            {"dim_" + str(position): component for position, component in enumerate(vector, start=1)}
            for vector in article
        ]
        for article in embed_list
    ]

In [12]:
# Get the labels of each tokens in train.data
# Return a list of lists of labels
def process_labels(data_list):
    """Extract the label of every token.

    Args:
        data_list: list of articles, each a list of (token, label) pairs.

    Returns:
        A list of label lists, parallel to data_list.
    """
    return [[pair[1] for pair in article] for article in data_list]

## Training

In [13]:
# Parse the processed "<token> <label>" file and split its articles (together
# with their article ids) into a training and a testing part.
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = make_dataset(f"{folder}/processed_data.txt")

In [14]:
# Load Word Embedding
trainembed_list = build_word_vectors(traindata_list, word_vecs)
testembed_list = build_word_vectors(testdata_list, word_vecs)

# CRF - Train Data (Augmentation Data)
x_train = make_features(trainembed_list)
y_train = process_labels(traindata_list)

# CRF - Test Data (Golden Standard)
x_test = make_features(testembed_list)
y_test = process_labels(testdata_list)

In [15]:
# Fit the CRF on the training split and tag the test split. Returns the
# predicted labels, the per-token marginals and the weighted F1 over all
# labels except "O".
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_test, y_test)

loading training data to CRFsuite: 100%|██████████| 17/17 [00:03<00:00,  5.22it/s]



Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 639
Seconds required: 0.691

L-BFGS optimization
c1: 0.010000
c2: 0.100000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.36  loss=36160.39 active=639   feature_norm=1.00
Iter 2   time=0.19  loss=17409.24 active=635   feature_norm=14.41
Iter 3   time=0.18  loss=16698.49 active=538   feature_norm=13.93
Iter 4   time=1.52  loss=6498.85  active=417   feature_norm=6.25
Iter 5   time=0.37  loss=5025.94  active=416   feature_norm=6.32
Iter 6   time=0.36  loss=2579.79  active=568   feature_norm=7.24
Iter 7   time=0.89  loss=2359.17  active=623   feature_norm=7.17
Iter 8   time=0.62  loss=2329.00  active=625   feature_norm=6.73
Iter 9   time=0.64  loss=1972.55  active=630   feature_norm=7.09
Iter 10  time=

In [16]:
# Display the weighted F1 score returned by CRF() for the test split.
f1score

0.6216282465488363

In [17]:
def _bio_spans(tags):
    """Decode one BIO tag sequence into (start, end_exclusive, entity_type) spans.

    An entity starts at a "B-<type>" tag and extends over the directly
    following "I-<type>" tags of the same type. It is closed by any other tag
    or by the end of the sequence, so single-token entities, back-to-back
    entities and entities that reach the last token are all reported.
    "I" tags without an open entity of the same type are ignored.
    """
    spans = []
    start, etype = None, None
    for pos, tag in enumerate(tags):
        if start is not None and tag.startswith("I") and tag[2:] == etype:
            continue  # still inside the current entity
        if start is not None:
            spans.append((start, pos, etype))
            start, etype = None, None
        if tag.startswith("B"):
            start, etype = pos, tag[2:]
    if start is not None:
        spans.append((start, len(tags), etype))
    return spans


# Collect one row per predicted entity. end_position is exclusive, matching the
# convention of the input annotations.
output = []
header = ["article_id","start_position","end_position","entity_text","entity_type"]
for test_id, tags in enumerate(y_pred):
    tokens = testdata_list[test_id]
    article_id = testdata_article_id_list[test_id]
    for start_pos, end_pos, entity_type in _bio_spans(tags):
        entity_text = "".join(token for token, _ in tokens[start_pos:end_pos])
        output.append([str(article_id), str(start_pos), str(end_pos), str(entity_text), str(entity_type)])

# The file is a .tsv, so fields are separated by tabs. newline="" lets the csv
# module control the line endings itself.
output_path = f"{folder}/output.tsv"
with open(output_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(header)
    writer.writerows(output)


print(tabulate(output, headers=header, tablefmt="psql"))

+--------------+------------------+----------------+----------------+---------------+
|   article_id |   start_position |   end_position | entity_text    | entity_type   |
|--------------+------------------+----------------+----------------+---------------|
|            8 |               10 |             12 | 38             | med_exam      |
|            8 |              189 |            193 | 二十分鐘       | time          |
|            8 |              293 |            295 | 五年           | time          |
|            8 |              519 |            521 | 吩咐           | time          |
|            8 |              540 |            544 | 兩個禮拜       | time          |
|            8 |              858 |            862 | 前天下午       | time          |
|            8 |             1354 |           1356 | 娜美           | name          |
|            8 |             1549 |           1551 | 五天           | time          |
|            8 |             1622 |           1627 | 五天禮拜三     | time     