# AI Lab 2 
# Part 0: Preparation
In this part, we preprocess the data so that it is easier to work with.
The libraries used throughout the notebook are also loaded here.

In [1]:
%load_ext autoreload
%autoreload 2

from Code.transformer_crf import TRANSFORMER_CRF
from Code.hmm import HMM
from Code.tools import better_organize
from Code.crf import word2features,sent2features,sent2labels,sent2tokens,read_data
import time,json,torch,tqdm
import pycrfsuite
import torch.optim as optim
from NER.check import check

First, we convert the provided `.txt` files into `.jsonl` files.

In [2]:
# Convert every raw split to JSONL: train files first, then validation files,
# Chinese before English within each split.
for split in ("train", "validation"):
    for language in ("Chinese", "English"):
        better_organize(
            path=f"NER/{language}/{split}.txt",
            terminal_path=f"{language}-{split}.jsonl",
        )

# Part 1: Hidden Markov Model
In this part, we implement a hidden Markov model and use it to perform the NER task. The detailed code can be found in `hmm.py`.

In [3]:
# Fit one HMM per language on its training JSONL, then run prediction on the
# matching validation JSONL. `tp` names the result file that is scored later.
for language in ("English", "Chinese"):
    h = HMM(path=f"{language}-train.jsonl")
    h.train()
    h.predict(
        path=f"{language}-validation.jsonl",
        tp=f"HMM_{language}_result.txt",
    )

### English

In [5]:
# Report per-label precision/recall/F1 of the English HMM predictions.
check(
    language="English",
    gold_path="NER/English/validation.txt",
    my_path="HMM_English_result.txt",
)

              precision    recall  f1-score   support

       B-PER     0.9601    0.6922    0.8044      1842
       I-PER     0.9389    0.7643    0.8427      1307
       B-ORG     0.7866    0.7450    0.7652      1341
       I-ORG     0.8519    0.6431    0.7329       751
       B-LOC     0.9131    0.8296    0.8694      1837
       I-LOC     0.8597    0.7393    0.7950       257
      B-MISC     0.9115    0.8048    0.8548       922
      I-MISC     0.8429    0.6358    0.7249       346

   micro avg     0.8941    0.7476    0.8143      8603
   macro avg     0.8831    0.7318    0.7987      8603
weighted avg     0.8974    0.7476    0.8137      8603



### Chinese

In [6]:
# Report per-label precision/recall/F1 of the Chinese HMM predictions.
check(
    language="Chinese",
    gold_path="NER/Chinese/validation.txt",
    my_path="HMM_Chinese_result.txt",
)

              precision    recall  f1-score   support

      B-NAME     0.8667    0.8922    0.8792       102
      M-NAME     0.8333    0.8000    0.8163        75
      E-NAME     0.7714    0.7941    0.7826       102
      S-NAME     0.5000    0.7500    0.6000         8
      B-CONT     0.9706    1.0000    0.9851        33
      M-CONT     0.9846    1.0000    0.9922        64
      E-CONT     0.9706    1.0000    0.9851        33
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9115    0.9717    0.9406       106
       M-EDU     0.9255    0.9831    0.9534       177
       E-EDU     0.9292    0.9906    0.9589       106
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.8253    0.8433    0.8342       689
     M-TITLE     0.8521    0.8492    0.8507      1479
     E-TITLE     0.9446    0.9652    0.9548       689
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.8469    0.8793    0.8628       522
       M-ORG     0.8944    

# Part 2: Conditional Random Field

In this part, we use the established Python CRF library `pycrfsuite` (python-crfsuite) for both training and inference.

### English

In [7]:
# --- Load the English splits and turn them into CRF features / labels ---
sentences1 = read_data('NER/English/train.txt')
sentences2 = read_data('NER/English/validation.txt')
X_train = list(map(sent2features, sentences1))
y_train = list(map(sent2labels, sentences1))
X_test = list(map(sent2features, sentences2))

# --- Train the CRF and persist it to disk ---
trainer = pycrfsuite.Trainer()
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

crf_params = {
    'c1': 1.0,                 # L1 regularisation coefficient
    'c2': 1e-3,                # L2 regularisation coefficient
    'max_iterations': 100000,  # upper bound on L-BFGS iterations
}
trainer.set_params(crf_params)
trainer.train('crf_eng.model')

# --- Tag the validation sentences with the saved model ---
tagger = pycrfsuite.Tagger()
tagger.open('crf_eng.model')
y_pred_eng = list(map(tagger.tag, X_test))
tagger.close()

def write_output(data_filename, output_filename, y_pred):
    """Write predicted tags next to the words of a two-column data file.

    The data file is expected to hold one ``word tag`` pair per line, with
    blank lines separating sentences. The words are re-read from it and
    paired, sentence by sentence, with the tag sequences in ``y_pred``.

    Args:
        data_filename: Path of the UTF-8 data file to read the words from.
        output_filename: Path of the UTF-8 file to (over)write. Each line is
            ``word predicted_tag`` and every sentence is followed by a blank
            line.
        y_pred: Iterable of tag sequences, one per sentence, in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            whitespace-separated fields.

    Note:
        Pairing uses ``zip``, so if the number of sentences (or of tokens in a
        sentence) differs from ``y_pred``, the surplus is silently ignored.
    """
    sentences = []
    current_words = []
    with open(data_filename, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line:
                # Only the word is needed; the gold tag is validated but discarded.
                word, _gold_tag = line.split()
                current_words.append(word)
            elif current_words:
                sentences.append(current_words)
                current_words = []
    # Flush the final sentence when the file does not end with a blank line;
    # previously that sentence was dropped from the output.
    if current_words:
        sentences.append(current_words)

    with open(output_filename, 'w', encoding='utf-8') as file:
        for words, tags in zip(sentences, y_pred):
            for word, tag in zip(words, tags):
                file.write(f"{word} {tag}\n")
            file.write("\n")

# Pair the English validation words with the CRF predictions and save them.
write_output(
    'NER/English/validation.txt',
    'CRF_English_result.txt',
    y_pred_eng,
)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 85356
Seconds required: 0.405

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 100000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 251617.664099
Feature norm: 1.000000
Error norm: 166353.327271
Active features: 45427
Line search trials: 1
Line search step: 0.000005
Seconds required for this iteration: 0.340

***** Iteration #2 *****
Loss: 188534.778942
Feature norm: 3.001277
Error norm: 65893.646964
Active features: 44674
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.349

***** Iteration #3 *****
Loss: 158570.893995
Feature norm: 2.572491
Error norm: 51461.594024
Active features: 36781
Line search trials: 1
Line search step: 1.000000
Seconds requi

In [8]:
# Report per-label precision/recall/F1 of the English CRF predictions.
check(
    language="English",
    gold_path="NER/English/validation.txt",
    my_path="CRF_English_result.txt",
)

              precision    recall  f1-score   support

       B-PER     0.8734    0.8838    0.8786      1842
       I-PER     0.9294    0.9265    0.9280      1307
       B-ORG     0.8450    0.7927    0.8180      1341
       I-ORG     0.7817    0.7963    0.7889       751
       B-LOC     0.9081    0.8498    0.8780      1837
       I-LOC     0.8810    0.7198    0.7923       257
      B-MISC     0.9216    0.8286    0.8726       922
      I-MISC     0.8703    0.7370    0.7981       346

   micro avg     0.8816    0.8445    0.8626      8603
   macro avg     0.8763    0.8168    0.8443      8603
weighted avg     0.8821    0.8445    0.8622      8603



### Chinese

In [9]:
# --- Load the Chinese splits and turn them into CRF features / labels ---
sentences1 = read_data('NER/Chinese/train.txt')
sentences2 = read_data('NER/Chinese/validation.txt')
X_train = list(map(sent2features, sentences1))
y_train = list(map(sent2labels, sentences1))
X_test = list(map(sent2features, sentences2))

# --- Train the CRF and persist it to disk ---
trainer = pycrfsuite.Trainer()
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

crf_params = {
    'c1': 1.0,                 # L1 regularisation coefficient
    'c2': 1e-3,                # L2 regularisation coefficient
    'max_iterations': 100000,  # upper bound on L-BFGS iterations
}
trainer.set_params(crf_params)
trainer.train('crf_cn.model')

# --- Tag the validation sentences with the saved model ---
tagger = pycrfsuite.Tagger()
tagger.open('crf_cn.model')
# NOTE(review): despite its name, `y_pred_eng` holds the CHINESE predictions
# from here on (it overwrites the English ones). The name is kept because a
# later cell passes `y_pred_eng` to write_output for the Chinese result file.
y_pred_eng = list(map(tagger.tag, X_test))
tagger.close()


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 24153
Seconds required: 0.247

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 100000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 338610.893942
Feature norm: 1.000000
Error norm: 72146.078988
Active features: 16153
Line search trials: 1
Line search step: 0.000013
Seconds required for this iteration: 0.861

***** Iteration #2 *****
Loss: 265719.141790
Feature norm: 6.933123
Error norm: 125458.236298
Active features: 15911
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 1.307

***** Iteration #3 *****
Loss: 241139.740122
Feature norm: 6.983456
Error norm: 77974.899985
Active features: 16629
Line search trials: 3
Line search step: 0.250000
Seconds requi

In [11]:
# NOTE(review): `y_pred_eng` contains the Chinese predictions at this point;
# the Chinese CRF cell reuses the English variable name.
write_output(
    'NER/Chinese/validation.txt',
    r'CRF_Chinese_result.txt',
    y_pred_eng,
)

In [12]:
# Report per-label precision/recall/F1 of the Chinese CRF predictions.
check(
    language="Chinese",
    gold_path="NER/Chinese/validation.txt",
    my_path="CRF_Chinese_result.txt",
)

              precision    recall  f1-score   support

      B-NAME     0.9900    0.9706    0.9802       102
      M-NAME     1.0000    0.9733    0.9865        75
      E-NAME     0.9900    0.9706    0.9802       102
      S-NAME     1.0000    1.0000    1.0000         8
      B-CONT     1.0000    0.9697    0.9846        33
      M-CONT     1.0000    0.9688    0.9841        64
      E-CONT     1.0000    0.9697    0.9846        33
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9907    1.0000    0.9953       106
       M-EDU     0.9779    1.0000    0.9888       177
       E-EDU     0.9720    0.9811    0.9765       106
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.9087    0.9100    0.9094       689
     M-TITLE     0.8819    0.9189    0.9000      1479
     E-TITLE     0.9826    0.9840    0.9833       689
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.9567    0.9310    0.9437       522
       M-ORG     0.9428    

# Part 3: Transformer + CRF
In this part, we take PyTorch's CRF source code as a reference. A Transformer produces the emission scores, while the CRF layer learns the transition matrix; both are trained with SGD. The Viterbi algorithm is used for inference. The detailed code can be found in `transformer_crf.py`.

### English 

In [13]:
# Hyper-parameters and special labels for the English Transformer-CRF run.
START_LABEL = "<START>"
STOP_LABEL = "<STOP>"
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

# Each line of the JSONL file is one JSON-encoded training sequence.
# NOTE(review): the file is opened with the platform default encoding —
# confirm this matches how better_organize wrote it.
path = "English-train.jsonl"
with open(path, "r") as src:
    sequences = [json.loads(line) for line in src]

# Build the model on the training sequences and optimise it with plain SGD.
model = TRANSFORMER_CRF(sequences, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [15]:
t = time.time()
for epoch in range(4):
    # Progress is printed when an epoch STARTS, so the elapsed time shown is
    # the time spent on the previous epochs (0 seconds for the first one).
    print('the',epoch,' epoch')
    print(f'Time Taken: {round(time.time()-t)} seconds')
    # One SGD step per training sequence (batch size 1).
    for seq in model.sequences:
        model.zero_grad()
        sentence_in = model.prepare_sequence(seq["word"])
        targets = torch.tensor(
            [model.label_index[tag] for tag in seq["label"]],
            dtype=torch.long,
        )
        loss = model.neg_log_likelihood(sentence_in, targets)
        loss.backward()
        optimizer.step()

the 0  epoch
Time Taken: 0 seconds
the 1  epoch
Time Taken: 1169 seconds
the 2  epoch
Time Taken: 2377 seconds
the 3  epoch
Time Taken: 3558 seconds


In [16]:
# Each line of the JSONL file is one JSON-encoded validation sequence.
val_path = "English-validation.jsonl"
with open(val_path, "r") as src:
    val_sequences = [json.loads(line) for line in src]

In [17]:
# Inverse of model.label_index: label id -> label string.
idx_index = {v: k for k, v in model.label_index.items()}

# torch.no_grad() only disables gradient tracking; it does not switch off
# dropout. Put the model in eval mode for inference and restore the previous
# mode afterwards so later training cells are unaffected.
# assumes TRANSFORMER_CRF is a torch.nn.Module (it exposes parameters() and
# zero_grad()) — TODO confirm
was_training = model.training
model.eval()
try:
    # Open the result file ONCE in write mode. The previous version reopened
    # it in append mode for every sentence, so re-running the cell appended a
    # second copy of the predictions and corrupted the evaluation.
    with torch.no_grad(), open(r'TRANSFORMER_CRF_English_result.txt', 'w') as file:
        for seq in val_sequences:
            sentence = seq["word"]
            # The second item of the model output is used as the predicted
            # label-id sequence (presumably the Viterbi path — confirm in
            # transformer_crf.py).
            a = model(model.prepare_sequence(sentence))
            # `label_id` instead of `id` to avoid shadowing the builtin.
            for word, label_id in zip(sentence, a[1]):
                file.write(f"{word} {idx_index[label_id]}\n")
            file.write("\n")
finally:
    model.train(was_training)

In [18]:
# Report per-label precision/recall/F1 of the English Transformer-CRF predictions.
check(
    language="English",
    gold_path="NER/English/validation.txt",
    my_path="TRANSFORMER_CRF_English_result.txt",
)

              precision    recall  f1-score   support

       B-PER     0.4817    0.6232    0.5434      1842
       I-PER     0.5319    0.7330    0.6165      1307
       B-ORG     0.4341    0.5377    0.4803      1341
       I-ORG     0.3103    0.6471    0.4195       751
       B-LOC     0.8706    0.5787    0.6952      1837
       I-LOC     0.6839    0.4630    0.5522       257
      B-MISC     0.6806    0.6171    0.6473       922
      I-MISC     0.5660    0.4711    0.5142       346

   micro avg     0.5264    0.6076    0.5641      8603
   macro avg     0.5699    0.5839    0.5586      8603
weighted avg     0.5807    0.6076    0.5765      8603



### Chinese

In [19]:
# Hyper-parameters and special labels for the Chinese Transformer-CRF run.
START_LABEL = "<START>"
STOP_LABEL = "<STOP>"
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

# Each line of the JSONL file is one JSON-encoded training sequence.
# NOTE(review): the file is opened with the platform default encoding —
# confirm this matches how better_organize wrote it (matters for Chinese text).
path = "Chinese-train.jsonl"
with open(path, "r") as src:
    cn_sequences = [json.loads(line) for line in src]

# Build the model on the training sequences and optimise it with plain SGD.
cn_model = TRANSFORMER_CRF(cn_sequences, EMBEDDING_DIM, HIDDEN_DIM)
cn_optimizer = optim.SGD(cn_model.parameters(), lr=0.01, weight_decay=1e-4)

In [20]:
t = time.time()
for epoch in range(1):
    # Progress is printed when an epoch STARTS, so with a single epoch the
    # reported elapsed time is always 0 seconds.
    print('the',epoch,' epoch')
    print(f'Time Taken: {round(time.time()-t)} seconds')
    # One SGD step per training sequence (batch size 1).
    for seq in cn_model.sequences:
        cn_model.zero_grad()
        sentence_in = cn_model.prepare_sequence(seq["word"])
        targets = torch.tensor(
            [cn_model.label_index[tag] for tag in seq["label"]],
            dtype=torch.long,
        )
        loss = cn_model.neg_log_likelihood(sentence_in, targets)
        loss.backward()
        cn_optimizer.step()

the 0  epoch
Time Taken: 0 seconds


In [24]:
# Each line of the JSONL file is one JSON-encoded validation sequence.
cn_val_path = "Chinese-validation.jsonl"
with open(cn_val_path, "r") as src:
    cn_val_sequences = [json.loads(line) for line in src]

In [27]:
# Inverse of cn_model.label_index: label id -> label string.
cn_idx_index = {v: k for k, v in cn_model.label_index.items()}

# torch.no_grad() only disables gradient tracking; it does not switch off
# dropout. Put the model in eval mode for inference and restore the previous
# mode afterwards so later training cells are unaffected.
# assumes TRANSFORMER_CRF is a torch.nn.Module (it exposes parameters() and
# zero_grad()) — TODO confirm
cn_was_training = cn_model.training
cn_model.eval()
try:
    # Open the result file ONCE in write mode. The previous version reopened
    # it in append mode for every sentence, so re-running the cell appended a
    # second copy of the predictions and corrupted the evaluation.
    # NOTE(review): written with the platform default encoding, as before —
    # confirm it matches what check() expects for Chinese text.
    with torch.no_grad(), open(r'TRANSFORMER_CRF_Chinese_result.txt', 'w') as file:
        for seq in cn_val_sequences:
            sentence = seq["word"]
            # The second item of the model output is used as the predicted
            # label-id sequence (presumably the Viterbi path — confirm in
            # transformer_crf.py).
            a = cn_model(cn_model.prepare_sequence(sentence))
            # `label_id` instead of `id` to avoid shadowing the builtin.
            for word, label_id in zip(sentence, a[1]):
                file.write(f"{word} {cn_idx_index[label_id]}\n")
            file.write("\n")
finally:
    cn_model.train(cn_was_training)

In [28]:
# Report per-label precision/recall/F1 of the Chinese Transformer-CRF predictions.
check(
    language="Chinese",
    gold_path="NER/Chinese/validation.txt",
    my_path="TRANSFORMER_CRF_Chinese_result.txt",
)

              precision    recall  f1-score   support

      B-NAME     0.9588    0.9118    0.9347       102
      M-NAME     1.0000    0.9200    0.9583        75
      E-NAME     0.9780    0.8725    0.9223       102
      S-NAME     0.0000    0.0000    0.0000         8
      B-CONT     0.9697    0.9697    0.9697        33
      M-CONT     1.0000    0.9688    0.9841        64
      E-CONT     1.0000    1.0000    1.0000        33
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9479    0.8585    0.9010       106
       M-EDU     0.9389    0.9548    0.9468       177
       E-EDU     0.9615    0.9434    0.9524       106
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.7845    0.8084    0.7963       689
     M-TITLE     0.5862    0.9723    0.7314      1479
     E-TITLE     0.9371    0.9724    0.9544       689
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.9279    0.7893    0.8530       522
       M-ORG     0.9106    