In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import pickle
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

from readata import readata

In [2]:
# Load the labelled NER training examples through the project-local `readata` helper.
# NOTE(review): later cells index the result as data["entity"] and data["label"];
# the exact container type returned by `readata` is not visible from this notebook.
data = readata("ner_training_data.txt")

In [3]:
# Convert the loaded data into a pandas DataFrame.
# NOTE(review): `df` is not referenced by any later cell shown in this notebook;
# the following cells read from `data` directly.
df = pd.DataFrame(data)

In [4]:
# Text preprocessing: turn every entity string into a fixed-length integer sequence.

# Sizes a reader may want to tune.
VOCAB_LIMIT = 5000       # forwarded to Tokenizer(num_words=...)
SEQUENCE_LENGTH = 50     # every sequence is padded / truncated to this many tokens

entity_texts = data["entity"]

# Keras' Tokenizer learns a word -> integer index from the texts it is fitted on.
# `num_words` caps which words are emitted when texts are converted to sequences
# (per the Keras docs only the most frequent `num_words - 1` words are kept).
tokenizer = Tokenizer(num_words=VOCAB_LIMIT)
tokenizer.fit_on_texts(entity_texts)

# Vocabulary learned during fitting: a dict mapping each word to a unique integer.
# NOTE(review): per the Keras docs this dict holds every word seen, not only the
# `num_words` most frequent ones.
word_index = tokenizer.word_index

# Replace each text by the list of integer ids of its words.
sequences = tokenizer.texts_to_sequences(entity_texts)

# Pad or truncate all sequences to one common length, because the model's
# Embedding layer is declared with a fixed input length.
pad_data = pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)

In [5]:
# Persist the fitted tokenizer to disk so it can be reloaded later.
with open('ner_tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [6]:
# 標籤編碼
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data["label"])
labels = data["label"]

In [7]:
# Persist the fitted label encoder so class ids can be mapped back to labels later.
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [8]:
# Build the classifier: Embedding -> LSTM -> softmax over the label classes.
EMBEDDING_DIM = 100
LSTM_UNITS = 256

# One embedding row per vocabulary entry, plus one extra row for index 0.
# NOTE(review): the tokenizer was created with num_words=5000 while this size is
# taken from the full word_index; confirm whether the larger table is intended.
vocabulary_size = len(word_index) + 1

# One output unit per distinct label value.
num_classes = len(set(labels))

model = Sequential([
    Embedding(vocabulary_size, EMBEDDING_DIM, input_length=50),
    LSTM(LSTM_UNITS),
    Dense(num_classes, activation='softmax'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           3807200   
                                                                 
 lstm (LSTM)                 (None, 256)               365568    
                                                                 
 dense (Dense)               (None, 5)                 1285      
                                                                 
Total params: 4,174,053
Trainable params: 4,174,053
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Callback that halts training once the validation loss has failed to improve
# for `patience` consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=2,
    monitor='val_loss',
)

In [10]:
# Split the dataset: 80% for training, 20% held out.
# A fixed random_state pins the shuffle so that Restart Kernel -> Run All yields
# the same partition (and therefore comparable metrics) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    pad_data,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Train the model, evaluating on the held-out split after each epoch.
# NOTE(review): with epochs=1 the EarlyStopping callback (patience=2, configured
# in an earlier cell) can never trigger; confirm the intended epoch count.
fit_options = dict(
    batch_size=32,
    epochs=1,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)



In [12]:
# NOTE: this entire cell is disabled (every line is commented out). When enabled it
# prints the actual and predicted label for 20 shuffled samples from the held-out split.

# from sklearn.utils import shuffle

# # Randomly pick 20 samples
# X_data_shuffled, y_data_shuffled = shuffle(X_test, y_test, random_state=0)
# X_sample = X_data_shuffled[:20]
# y_sample = y_data_shuffled[:20]

# # Run the model on the sample
# predictions = model.predict(X_sample)

# # Convert the predicted class ids back to label names
# predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

# # Convert the true class ids back to label names
# actual_labels = label_encoder.inverse_transform(y_sample)

# # Print the 20 selected samples
# for i in range(20):
#     # Turn the integer sequence back into text (index 0 is skipped as padding)
#     text = ' '.join([tokenizer.index_word.get(word, '?') for word in X_sample[i] if word != 0])
#     print(f"實體: {text}")
#     print(f"實際標籤: {actual_labels[i]}")
#     print(f"預測標籤: {predicted_labels[i]}")
#     print("-----")


In [13]:
# New business-card text to classify (one field per line).
texts = """JOHN SMITH
GRAPHIC DESIGNER
ABDUL STUDIO
(123)000-123-4567
DESIGN AGENCY

www.websiteurl.com   
info@websiteurl.com    
255 John Street, Country,
New york, 5255"""

# Compiled once; the same patterns decide both where a line is stored and
# whether it is withheld from the model.
EMAIL_PATTERN = re.compile(r'\S+@\S+\.\S+')
LINK_PATTERN = re.compile(r'www\.\S+\.\S+')


def extract_contact_lines(raw_text):
    """Separate e-mail and link lines from the rest of a card's text.

    Each line is stripped *before* the emptiness check, so whitespace-only
    lines are dropped instead of being kept as empty strings. Every line is
    classified in a single pass; a line that matches both patterns is stored
    as an e-mail.

    Args:
        raw_text: The card text as one newline-separated string.

    Returns:
        A ``(card_info, remaining_lines)`` tuple. ``card_info`` is a dict with
        the keys ``'Email'`` and ``'Link'``, each holding a list of stripped
        lines. ``remaining_lines`` lists, in original order, the stripped
        non-empty lines that matched neither pattern.
    """
    contact_info = {'Email': [], 'Link': []}
    remaining_lines = []
    for raw_line in raw_text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        if EMAIL_PATTERN.search(line):
            contact_info['Email'].append(line)
        elif LINK_PATTERN.search(line):
            contact_info['Link'].append(line)
        else:
            remaining_lines.append(line)
    return contact_info, remaining_lines


# `texts` stays a string; only the derived results are bound to new names.
card_info, newtexts = extract_contact_lines(texts)

In [14]:
from text_preprocessing import text_preprocessing

# Encode the remaining card lines with the project-local helper.
# NOTE(review): presumably this applies the same tokenizer and padding as the
# training cells; verify in text_preprocessing.py.
new_pad_data = text_preprocessing(newtexts)
# Earlier inline version, kept for reference:
# new_sequences = tokenizer.texts_to_sequences(newtexts)
# new_pad_data = pad_sequences(new_sequences, maxlen=50)

# Run the model on the encoded lines.
predictions = model.predict(new_pad_data)

# Pick the highest-scoring class per line and map it back to its label name.
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

# Print each prediction and file the line under its label in card_info.
# Gotcha: card_info is appended to in place, so re-running this cell adds the
# same lines a second time.
for i in range(len(newtexts)):
    text, label = newtexts[i], predicted_labels[i]
    print(f"文本: {text}")
    print(f"預測標籤: {label}")
    card_info.setdefault(str(label), []).append(text)

文本: JOHN SMITH
預測標籤: PERSON
文本: GRAPHIC DESIGNER
預測標籤: POSITION
文本: ABDUL STUDIO
預測標籤: ORG
文本: (123)000-123-4567
預測標籤: PHONE
文本: DESIGN AGENCY
預測標籤: ORG
文本: 255 John Street, Country,
預測標籤: ADDRESS
文本: New york, 5255
預測標籤: ADDRESS


In [15]:
# Display the collected card fields: the Email/Link entries plus the lines
# grouped by predicted label.
card_info

{'Email': ['info@websiteurl.com'],
 'Link': ['www.websiteurl.com'],
 'PERSON': ['JOHN SMITH'],
 'POSITION': ['GRAPHIC DESIGNER'],
 'ORG': ['ABDUL STUDIO', 'DESIGN AGENCY'],
 'PHONE': ['(123)000-123-4567'],
 'ADDRESS': ['255 John Street, Country,', 'New york, 5255']}

In [16]:
# Project-local helper that replaces the inline e-mail/link extraction kept
# (commented out) further down in this cell.
from label_processing import email_link_preprocessing

# Second sample card to classify (one field per line).
texts = """MICHAL JOHNS
Solution Manager

Real Estate

Leceria Co.
+000 12345 6789
+000 12345 6789
urname@email.com
urwebsitename.com
Street Address Here
Singapore, 2222"""

# --- Disabled: earlier inline implementation, kept for reference ---
# card_info = {'Email': [], 'Link': []}

# texts = texts.split("\n")
# newtexts = [i.strip() for i in texts if len(i) != 0]

# # Process the list of lines
# for text in newtexts:
#     # Match e-mail addresses
#     if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text):
#         card_info['Email'].append(text)
#     # Match URLs
#     elif re.search(r'(http[s]?://)?[www\.]?[A-Za-z0-9.-]+\.[A-Za-z]{2,}', text):
#         card_info['Link'].append(text)

# newtexts = [text for text in newtexts if not re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)]
# newtexts = [text for text in newtexts if not re.search(r'(http[s]?://)?[www\.]?[A-Za-z0-9.-]+\.[A-Za-z]{2,}', text)]

# The helper returns a pair that is unpacked into card_info and newtexts; both
# names are rebound here, replacing the values left by the previous card.
card_info, newtexts = email_link_preprocessing(texts)
# Show both results as the cell output.
card_info, newtexts

({'Email': ['urname@email.com'], 'Link': ['urwebsitename.com']},
 ['MICHAL JOHNS',
  'Solution Manager',
  'Real Estate',
  'Leceria Co.',
  '+000 12345 6789',
  '+000 12345 6789',
  'Street Address Here',
  'Singapore, 2222'])

In [17]:
# Earlier inline preprocessing with the notebook's own tokenizer, kept for reference:
# new_sequences = tokenizer.texts_to_sequences(newtexts)
# new_pad_data = pad_sequences(new_sequences, maxlen=50)
from text_preprocessing import text_preprocessing

# Encode the remaining card lines with the project-local helper.
new_pad_data = text_preprocessing(newtexts)

# Run the model on the encoded lines.
predictions = model.predict(new_pad_data)

# Highest-scoring class id per line, mapped back to its label name.
best_class_ids = np.argmax(predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(best_class_ids)

# Print each prediction and group the line under its label in card_info.
# Gotcha: card_info is extended in place, so re-running this cell appends the
# same lines again.
for i, text in enumerate(newtexts):
    label = predicted_labels[i]
    print(f"文本: {text}")
    print(f"預測標籤: {label}")
    key = str(label)
    if label in card_info:
        card_info[key].append(text)
    else:
        card_info[key] = [text]

文本: MICHAL JOHNS
預測標籤: PERSON
文本: Solution Manager
預測標籤: POSITION
文本: Real Estate
預測標籤: ORG
文本: Leceria Co.
預測標籤: ADDRESS
文本: +000 12345 6789
預測標籤: PHONE
文本: +000 12345 6789
預測標籤: PHONE
文本: Street Address Here
預測標籤: ADDRESS
文本: Singapore, 2222
預測標籤: ADDRESS


In [18]:
# Display the collected fields for the second card: the Email/Link entries plus
# the lines grouped by predicted label.
card_info

{'Email': ['urname@email.com'],
 'Link': ['urwebsitename.com'],
 'PERSON': ['MICHAL JOHNS'],
 'POSITION': ['Solution Manager'],
 'ORG': ['Real Estate'],
 'ADDRESS': ['Leceria Co.', 'Street Address Here', 'Singapore, 2222'],
 'PHONE': ['+000 12345 6789', '+000 12345 6789']}

In [19]:
# Save the trained model to 'ner_predict_model.h5' in the working directory.
model.save('ner_predict_model.h5')