In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from model import *

# Special tag strings; each gets its own id in tag2idx below.
START_TAG = "<START>"
STOP_TAG = "<STOP>"

# Sizes handed to the model constructor.
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Two toy (tokens, tags) pairs; tokens and tags are aligned one-to-one.
training_data = [
    (
        "the wall street journal reported today that apple corporation made money".split(),
        "B I I I O O O B I O O".split(),
    ),
    (
        "georgia tech is a university in georgia".split(),
        "B I O O O O B".split(),
    ),
]

# Vocabulary: every distinct word receives the next free index, in order of first appearance.
word2idx = {}
for sentence, tags in training_data:
    for word in sentence:
        # setdefault only inserts when the word is new, so existing ids never change
        word2idx.setdefault(word, len(word2idx))

# Tag inventory, numbered in the order listed here.
tag2idx = {
    label: position
    for position, label in enumerate(("B", "I", "O", START_TAG, STOP_TAG))
}

# define the model
# Seed PyTorch's global RNG before constructing the model so that any random
# parameter initialisation (presumably what BiLSTM_CRF does - confirm in model.py)
# and therefore the scores printed in later cells are repeatable on a fresh kernel.
SEED = 42
torch.manual_seed(SEED)
model = BiLSTM_CRF(len(word2idx), EMBEDDING_DIM, HIDDEN_DIM, tag2idx, START_TAG, STOP_TAG)

In [3]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a tensor of their integer indices.

    Args:
        seq: iterable of tokens; each one is looked up with ``to_ix[token]``.
        to_ix: mapping from token to integer index.

    Returns:
        A ``torch.long`` tensor holding the looked-up index of every token, in order.

    Raises:
        KeyError: if a token is missing from a plain-dict ``to_ix``.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

# Run the first toy sentence through the model and print each intermediate result.
sentence, tag = training_data[0]
sentence_in = prepare_sequence(sentence, word2idx)
features = model.get_features(sentence_in)
# One print call with a newline separator emits the same text as three separate prints.
print(sentence_in, features, features.shape, sep="\n")

# Score returned by calc_scores for the feature matrix.
alpha = model.calc_scores(features)
print(alpha)

# decode returns a (score, tag-index path) pair.
path_score, best_path = model.decode(features)
print(path_score, best_path, sep="\n")

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
tensor([[-0.0684, -0.3112, -0.1691,  0.3262,  0.3105],
        [-0.0867, -0.3095, -0.2335,  0.3194,  0.4648],
        [-0.0951, -0.4472, -0.2582,  0.2855,  0.3742],
        [-0.1207, -0.4220, -0.2632,  0.2542,  0.4539],
        [-0.1121, -0.4278, -0.2137,  0.3642,  0.3620],
        [-0.0820, -0.3368, -0.1868,  0.5578,  0.3885],
        [-0.0802, -0.3743, -0.2326,  0.4058,  0.3991],
        [-0.1559, -0.5247, -0.2061,  0.3822,  0.3351],
        [-0.0976, -0.3575, -0.1968,  0.3736,  0.3791],
        [-0.0958, -0.3701, -0.1868,  0.3861,  0.3464],
        [-0.1208, -0.4290, -0.2268,  0.4453,  0.4182]],
       grad_fn=<AddmmBackward0>)
torch.Size([11, 5])
tensor(11.1339, grad_fn=<AddBackward0>)
tensor(6.4198, grad_fn=<SelectBackward0>)
[1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2]


In [4]:
import numpy as np
import os

# Path of the raw training file (despite the name this is a file, not a directory).
# The default is the original machine-specific location; set the
# NLP_PROJECT3_TRAIN_FILE environment variable to run the notebook elsewhere.
datadir = os.environ.get(
    'NLP_PROJECT3_TRAIN_FILE',
    '/Users/henryliu/Desktop/Henry/学习/untitled folder/大三/大三下/自然语言处理/labs/project3/train.txt',
)
# read txt file, return one list of whitespace-separated tokens per line
def read_txt(datadir, encoding='utf-8'):
    """Read a text file and split every line on whitespace.

    Args:
        datadir: path of the text file to read (despite the name, a file path).
        encoding: text encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        tokens produced by ``line.split()``. Blank lines yield an empty list.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file content is not valid in ``encoding``.
    """
    with open(datadir, 'r', encoding=encoding) as f:
        # split() without arguments already discards leading/trailing whitespace,
        # so no separate strip() pass is needed.
        return [line.split() for line in f]

data = read_txt(datadir)
# Preview the token count of the first few lines. Slicing instead of indexing
# 0..9 keeps this from raising IndexError when the file has fewer than 10 lines.
for line_tokens in data[:10]:
    print(len(line_tokens))

# get maximum length of sentences
def get_max_length(data):
    """Return the length of the longest element of ``data``.

    Args:
        data: iterable of sized items (e.g. lists of tokens).

    Returns:
        The largest ``len(item)`` found, or 0 when ``data`` is empty.
    """
    return max((len(sentence) for sentence in data), default=0)

# Longest line of the loaded file, measured in whitespace-separated tokens.
max_length = get_max_length(data)
print(max_length)


70
84
90
133
140
119
180
119
210
170
1511


In [27]:
import dataloader
import argparse
import pickle

# Directory holding the preprocessed artefacts. The default is the original
# machine-specific location; set NLP_PROJECT3_PROCESSED_DIR to point elsewhere.
datadir = os.environ.get(
    'NLP_PROJECT3_PROCESSED_DIR',
    '/Users/henryliu/Documents/GitHub/data/project3/processed_data',
)

# load data
# NOTE(review): pickle.load can execute arbitrary code from the file - only load
# vocabulary files that were produced locally by this project.
# The context manager closes the file handle, which the bare open() call leaked.
with open(os.path.join(datadir, 'word2idx.dat'), 'rb') as vocab_file:
    word2idx = pickle.load(vocab_file)
word2idx



{'<UNK>': 0,
 ',': 1,
 '的': 2,
 '。': 3,
 '一': 4,
 '、': 5,
 '1': 6,
 '人': 7,
 '0': 8,
 '在': 9,
 '是': 10,
 '“': 11,
 '”': 12,
 '了': 13,
 '有': 14,
 '不': 15,
 '中': 16,
 '年': 17,
 '2': 18,
 '大': 19,
 '国': 20,
 '上': 21,
 '为': 22,
 '和': 23,
 '日': 24,
 '时': 25,
 '到': 26,
 '发': 27,
 '会': 28,
 '生': 29,
 '出': 30,
 '这': 31,
 '3': 32,
 '个': 33,
 '行': 34,
 '对': 35,
 '来': 36,
 '家': 37,
 '公': 38,
 '地': 39,
 '要': 40,
 '以': 41,
 '市': 42,
 '后': 43,
 '作': 44,
 '工': 45,
 '新': 46,
 '他': 47,
 '成': 48,
 '多': 49,
 '月': 50,
 '方': 51,
 '现': 52,
 '部': 53,
 '我': 54,
 '4': 55,
 '5': 56,
 '者': 57,
 '过': 58,
 '民': 59,
 ':': 60,
 '业': 61,
 '前': 62,
 '车': 63,
 '能': 64,
 '子': 65,
 '全': 66,
 '开': 67,
 '于': 68,
 '也': 69,
 '进': 70,
 '下': 71,
 '经': 72,
 '就': 73,
 '自': 74,
 '法': 75,
 '用': 76,
 '学': 77,
 '动': 78,
 '天': 79,
 '实': 80,
 '小': 81,
 '分': 82,
 '高': 83,
 '将': 84,
 '说': 85,
 '事': 86,
 '区': 87,
 '关': 88,
 '们': 89,
 '等': 90,
 '可': 91,
 '记': 92,
 '6': 93,
 '员': 94,
 '机': 95,
 '报': 96,
 '9': 97,
 '定': 98,
 '理': 99,
 '政': 