In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import re

import read_data

In [2]:
# 讀取數據

sentences, label_seq = read_data.read_ner_data('ner_training_data.txt')


In [3]:
# 構建詞彙表

word_tokenizer = Tokenizer(lower=False)  # 創建一個Tokenizer對象
word_tokenizer.fit_on_texts(sentences)   # 更新內部詞彙表基於文本列表
vocab_size = len(word_tokenizer.word_index) + 1  # +1是因為word_index從1開始計數

In [4]:
# 構建標籤詞彙表

label_tokenizer = Tokenizer(lower=False)
label_tokenizer.fit_on_texts(label_seq)
label_size = len(label_tokenizer.word_index) + 1

In [5]:
# 將單詞轉換為整數索引
X_data = word_tokenizer.texts_to_sequences(sentences)

# 將標籤轉換為整數索引
y_data = label_tokenizer.texts_to_sequences(label_seq)

In [6]:
# 填充序列到相同的長度
max_len = max(len(s) for s in X_data)  # 找到最長的句子長度

X_data = pad_sequences(X_data, maxlen=max_len, padding='post')
y_data = pad_sequences(y_data, maxlen=max_len, padding='post')

In [7]:
# 將標籤轉換為獨熱編碼
y_data = [to_categorical(i, num_classes=label_size) for i in y_data]
y_data = np.array(y_data)

In [8]:
# 建立模型

embedding_dim = 64  # 嵌入维度
max_seq_length = max_len  # 最大序列长度
num_tags = label_size  # 标签数量，包括'O'和上述目标实体标签

# 输入层
input_word = Input(shape=(max_seq_length,))
# 嵌入层
model = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length)(input_word)
# LSTM层
model = LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)(model)
model = LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)(model)
# 输出层
out = TimeDistributed(Dense(num_tags, activation="softmax"))(model)

# 构建模型
model = Model(input_word, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# 模型摘要
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 embedding (Embedding)       (None, 1, 64)             5270336   
                                                                 
 lstm (LSTM)                 (None, 1, 100)            66000     
                                                                 
 lstm_1 (LSTM)               (None, 1, 100)            80400     
                                                                 
 time_distributed (TimeDistr  (None, 1, 8)             808       
 ibuted)                                                         
                                                                 
Total params: 5,417,544
Trainable params: 5,417,544
Non-trainable params: 0
___________________________________________________

In [9]:
# 切割訓練資料

from sklearn.model_selection import train_test_split

# 假設 X_data 和 y_data 是您的完整訓練資料和標籤
# 切割出一部分作為驗證資料，這裡我們切割出20%作為驗證資料
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=42)



In [10]:
# 假设X_train, y_train是你的训练数据和标签
# 假设X_val, y_val是你的验证数据和标签

# 编译模型
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 定义早停
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

# 训练模型
history = model.fit(X_train, y_train, batch_size=32, epochs=5,
                    validation_data=(X_val, y_val), callbacks=[early_stopping])

# 评估模型
# test_loss, test_accuracy = model.evaluate(X_test, y_test)
# print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


Epoch 1/5


KeyboardInterrupt: 

: 

In [None]:
import numpy as np
from sklearn.utils import shuffle

# 假設 X_data 是你的輸入數據，y_data 是對應的標籤數據
# 假設 model 是你已經訓練好的模型
# 假設 word_tokenizer 和 label_tokenizer 是你的 tokenizer

# 隨機選擇20筆資料
X_data_shuffled, y_data_shuffled = shuffle(X_data, y_data, random_state=0)
X_sample = X_data_shuffled[:20]
y_sample = y_data_shuffled[:20]

# 使用模型進行預測
predictions = model.predict(X_sample)

# 將預測結果轉換為標籤
predicted_label_indices = np.argmax(predictions, axis=-1)
# predicted_labels = [[label_tokenizer.index_word.get(index, 'PAD') for index in sequence] for sequence in predicted_label_indices]

predicted_labels = []
for sequence in predicted_label_indices:
    sequence_labels = []
    for index in sequence:
        label = label_tokenizer.index_word.get(index, 'PAD')  # 將索引轉換為標籤
        
        sequence_labels.append(label)
    predicted_labels.append(sequence_labels)

# 打印出原始資料的實體和對應的預測標籤
for i in range(5):
    print(f"Sample {i+1}:")
    original_text = [word_tokenizer.index_word.get(w, '') for w in X_sample[i] if w != 0]
    original_labels = [label_tokenizer.index_word.get(l, '') for l in np.argmax(y_sample[i], axis=-1) if l != 0]
    predicted_labels_text = predicted_labels[i]
    # print(predicted_labels_text)
    
    # 組合實體和標籤
    for word, true_label, pred_label in zip(original_text, original_labels, predicted_labels_text):
        print(f"Text:{word},\nTrue label:{true_label},\nPredicted label:{pred_label}\n")
    


Sample 1:
Text:247-858-2340,
True label:PHONE,
Predicted label:PHONE

Sample 2:
Text:Stanley LLC Hospitality,
True label:ORG,
Predicted label:ORG

Sample 3:
Text:Moyer, Mitchell and Wallace Education,
True label:ORG,
Predicted label:ORG

Sample 4:
Text:(259) 623-9665,
True label:PHONE,
Predicted label:PHONE

Sample 5:
Text:Quality Assurance Engineer,
True label:POSITION,
Predicted label:POSITION



In [None]:
# text = """JOHN SMITH
# GRAPHIC DESIGNER
# ABDUL STUDIO
# 000-123-4567
# DESIGN AGENCY
# 000-123-4567
# www.websiteurl.com       
# info@websiteurl.com      
# 255 John Street, Country,
# New york, 5255"""

text = """Thomas Deleon
(259) 623-9665
Hotel Manager"""

# Preprocess the text
# Splitting the text into lines as each line is considered an entity
entities = text.split('\n')

# Tokenize the entities using the provided tokenizer
tokenized_entities = word_tokenizer.texts_to_sequences(entities)

# Pad the sequences to the maximum sequence length used during training
padded_entities = pad_sequences(tokenized_entities, maxlen=max_len, padding='post')

# Make predictions using the model
predictions = model.predict(padded_entities)

# Convert predictions to readable labels
predicted_label_indices = np.argmax(predictions, axis=-1)
predicted_labels = [[label_tokenizer.index_word.get(index, 'PAD') for index in sequence] for sequence in predicted_label_indices]

i=0
# Displaying the predictions
for entity, label_seq in zip(entities, predicted_labels):
    # Joining the predicted labels for each entity
    # predicted_label = ' '.join([label for label in label_seq if label != 'PAD'])
    predicted_label = predicted_labels
    print(f"Entity: {entity}, Predicted Label: {predicted_label}")
    i = i + 1

Entity: Thomas Deleon, Predicted Label: [['PERSON', 'PAD'], ['PERSON', 'PAD'], ['PERSON', 'PAD']]
Entity: (259) 623-9665, Predicted Label: [['PERSON', 'PAD'], ['PERSON', 'PAD'], ['PERSON', 'PAD']]
Entity: Hotel Manager, Predicted Label: [['PERSON', 'PAD'], ['PERSON', 'PAD'], ['PERSON', 'PAD']]
