In [1]:
import torch
import dgl
import numpy as np
import pandas as pd
import os
import pickle
from NLPUtils.preprocessUtils import removePunctuation, removeWord
from NLPUtils.DataModel import Vocabulary
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset location and preprocessing configuration.
# ROOT is built with os.path.join so it uses the host OS path separator
# instead of a hard-coded Windows backslash.
ROOT = os.path.join('.', 'R8')
TRAIN = 'train.txt'     # training split, joined with ROOT when loaded
TEST = 'test.txt'       # test split, joined with ROOT when loaded
MAXLEN = 30             # every document is truncated / padded to this many tokens
TOKEN = {'<PAD>' : 0}   # special-token mapping handed to Vocabulary

In [3]:
# Stop-word list handed to removeWord during preprocessing.
removeWordList = stopwords.words('english')

# Both splits are read as header-less, tab-separated UTF-8 files.
df_train = pd.read_csv(os.path.join(ROOT, TRAIN), sep='\t', header=None, encoding='utf-8')
df_test = pd.read_csv(os.path.join(ROOT, TEST), sep='\t', header=None, encoding='utf-8')

# Column 0 supplies the labels; ids follow the order in which labels first
# appear in the training split.
label2idx = {label: position for position, label in enumerate(df_train[0].unique())}

In [4]:
# Instantiate the vocabulary with the special-token mapping TOKEN ({'<PAD>': 0}).
# NOTE(review): `voc` is created again with the same arguments in the tf-idf cell.
voc = Vocabulary(TOKENS=TOKEN)

In [5]:
# Clean column 1 (the raw text) of each split: punctuation is stripped first,
# then the stop words in removeWordList are removed.
train_sentence = [
    removeWord(removeWordList=removeWordList, sentence=removePunctuation(sentence=raw_text))
    for raw_text in df_train[1]
]
test_sentence = [
    removeWord(removeWordList=removeWordList, sentence=removePunctuation(sentence=raw_text))
    for raw_text in df_test[1]
]

In [6]:
# Compute tf-idf weights and build the vocabulary.
# token_pattern r'\S+' makes every whitespace-delimited run of characters a token.
vector = CountVectorizer(token_pattern=r'\S+')
# Fit on train + test together; rows follow that order (train documents first).
table = vector.fit_transform(train_sentence + test_sentence)
tfidf = TfidfTransformer()
tfidfTable = tfidf.fit_transform(table)
# Start from a fresh vocabulary and add the vectorizer's feature names in
# column order.
# NOTE(review): later cells assume vocabulary index == column + DIC_OFFSET — confirm against Vocabulary.addWordList.
voc = Vocabulary(TOKENS=TOKEN)
voc.addWordList(vector.get_feature_names_out().tolist())

In [7]:
# Encode each document as exactly MAXLEN vocabulary indices: keep the first
# MAXLEN tokens and right-pad shorter documents with the <PAD> index.
pad_idx = voc.word2idx['<PAD>']

train_idxs = []
for text in train_sentence:
    tokens = text.split()[:MAXLEN]
    # Training tokens are looked up directly (a missing token raises KeyError).
    encoded = [voc.word2idx[token] for token in tokens]
    encoded.extend([pad_idx] * (MAXLEN - len(encoded)))
    train_idxs.append(encoded)

test_idxs = []
for text in test_sentence:
    tokens = text.split()[:MAXLEN]
    # Test tokens missing from the vocabulary fall back to the <PAD> index.
    encoded = [voc.word2idx[token] if voc.has(token) else pad_idx for token in tokens]
    encoded.extend([pad_idx] * (MAXLEN - len(encoded)))
    test_idxs.append(encoded)

In [8]:
# Stack the encoded documents into int64 arrays, one row per document.
x_train = np.array(train_idxs, dtype=np.int64)
x_test = np.array(test_idxs, dtype=np.int64)
# Label encoding via label2idx is currently disabled (kept commented out below).
# y_train = np.array([label2idx[label] for label in df_train[0]], dtype=np.int64)
# y_test = np.array([label2idx[label] for label in df_test[0]], dtype=np.int64)

In [9]:
# np.save('./R8/y_train.npy', y_train)
# np.save('./R8/y_test.npy', y_test)

In [10]:
# Node types of the heterogeneous graph.
DOC = 'doc'
WORD = 'word'
# Relation names.
IN  = 'in'
OCCUR = 'occur'
PRESERVE = 'preserve'
# Canonical edge types: (source node type, relation, destination node type).
DOC_PRESERVE = (DOC, PRESERVE, DOC)
DOC_CONTAIN = (WORD, IN, DOC)
WORD_OCCUR = (WORD, OCCUR, WORD)
# Subtracted from a vocabulary index to get the tf-idf column when edge weights
# are looked up; presumably accounts for <PAD> occupying index 0 — TODO confirm.
DIC_OFFSET = 1

In [11]:
# Build the initial graph description: every edge type is declared with empty
# source / destination lists.
graph_data = {
    DOC_CONTAIN : ([], []),
    DOC_PRESERVE : ([], []),
    WORD_OCCUR: ([], [])
}

In [12]:
# Instantiate the heterograph from graph_data; its edges are added by later cells.
he : dgl.DGLHeteroGraph = dgl.heterograph(graph_data)

In [13]:
# Connect each word to the documents that contain it, weighted by tf-idf.
# Train and test documents are walked in one pass: the running index is both
# the doc node id and the row of tfidfTable (train rows first, then test rows).
src_node = []
dst_node = []
feature = []
for doc_idx, doc in enumerate([*x_train, *x_test]):
    for word_idx in doc:
        src_node.append(word_idx)
        dst_node.append(doc_idx)
        if word_idx < DIC_OFFSET:
            # <PAD> has no tf-idf column: word_idx - DIC_OFFSET would be -1,
            # which the sparse matrix resolves to the LAST vocabulary column.
            # Give padding an explicit zero weight instead. The edge itself is
            # kept so the set of edges is the same as before.
            feature.append(0.0)
        else:
            feature.append(tfidfTable[doc_idx, word_idx - DIC_OFFSET])
# One add_edges call appends the same edges, in the same order, as one call
# per document would.
he.add_edges(src_node, dst_node, data={'w': torch.tensor(feature, dtype=torch.float32)}, etype=DOC_CONTAIN)
# Merge repeated (word, doc) pairs, then give every doc node a self-loop.
he = dgl.to_simple(he, copy_edata=True)
he = dgl.add_self_loop(he, etype=DOC_PRESERVE)

In [14]:
# Sanity check: print the (source ids, destination ids) tensors of the
# word->doc edges and of the doc->doc edges.
print(f'Edge {DOC_CONTAIN} : {he.edges(etype = DOC_CONTAIN)}')
print(f'Edge {DOC_PRESERVE} : {he.edges(etype = PRESERVE)}')

Edge ('word', 'in', 'doc') : (tensor([    0,     0,     0,  ..., 23439, 23439, 23440]), tensor([   9,   17,   18,  ...,  783, 5807, 2282]))
Edge ('doc', 'preserve', 'doc') : (tensor([   0,    1,    2,  ..., 7671, 7672, 7673]), tensor([   0,    1,    2,  ..., 7671, 7672, 7673]))


In [15]:
# Boolean mask over all documents: True for the leading len(x_train) entries
# (training documents), False for the test documents that follow.
n_train = len(x_train)
n_total = n_train + len(x_test)
train_mask = torch.tensor([doc_id < n_train for doc_id in range(n_total)], dtype=torch.bool)

In [16]:
# Display the mask as the cell output.
train_mask

tensor([ True,  True,  True,  ..., False, False, False])

In [17]:
# Store the mask as the 'train_mask' feature of the doc nodes.
he.nodes[DOC].data['train_mask'] = train_mask

In [18]:
# Shapes of the encoded splits (documents x tokens per document).
print(x_train.shape)
print(x_test.shape)

(5485, 30)
(2189, 30)


In [19]:
# Preview the stacked index matrix (train rows followed by test rows).
np.concatenate((x_train,x_test), axis= 0)

array([[ 3564, 16291,  3543, ..., 17069, 18914,  1027],
       [ 4340, 20936, 20603, ...,  6221,  4235, 18274],
       [ 4030, 10360,  3370, ..., 23330, 10374,  7565],
       ...,
       [10919, 19580, 21979, ..., 13503, 14107, 16725],
       [13796, 23377,  3614, ...,  9866, 15300,  7099],
       [11078, 10433,  4436, ..., 21094, 15300, 15300]], dtype=int64)

In [20]:
# Every doc self-loop gets weight 1; one value per train + test document.
n_documents = len(x_train) + len(x_test)
he.edges[PRESERVE].data['w'] = torch.ones(n_documents, dtype=torch.float32)

# PPMI

In [21]:
# PPMI
# Slide a window of WINDOW_SIZE tokens (stride 1) over every encoded document,
# train and test alike, and stack all windows into one 2-D array.
WINDOW_SIZE = 20
windows_list = np.array([
    encoded_doc[start : start + WINDOW_SIZE]
    for encoded_doc in np.concatenate((x_train, x_test), axis=0)
    for start in range(len(encoded_doc) - WINDOW_SIZE + 1)
])

In [22]:
PPMI_TABLE = {}  # (a, b) with a < b -> cached score; only used with the global windows_list
HIT_TABLE = {}   # word index -> boolean vector marking the windows of windows_list containing it
def PPMI(a, b, windows=None):
    """Positive pointwise mutual information of word indices ``a`` and ``b``.

    Probabilities are estimated over sliding windows: p(x) is the fraction of
    windows containing x and p(a, b) the fraction containing both, so

        PPMI(a, b) = max(0, log(p(a, b) / (p(a) * p(b))))

    Parameters
    ----------
    a, b : int
        Word indices. The pair is unordered: PPMI(a, b) == PPMI(b, a).
    windows : 2-D array-like of int, optional
        One window of word indices per row. Defaults to the module-level
        ``windows_list``. The caches are used only in that default case, so an
        explicit ``windows`` never reads or pollutes them.

    Returns
    -------
    float
        1.0 when ``a == b``; 0.0 when the words never share a window (or there
        are no windows); otherwise the PMI clamped below at 0.
    """
    if a > b:
        a, b = b, a
    if a == b:
        return 1.
    use_cache = windows is None
    if use_cache:
        if (a, b) in PPMI_TABLE:
            return PPMI_TABLE[(a, b)]
        windows = windows_list
    windows = np.asarray(windows)
    total = len(windows)
    if total == 0:
        return 0.0

    def _hits(word):
        # Boolean vector: which windows contain `word`.
        if use_cache and word in HIT_TABLE:
            return HIT_TABLE[word]
        hits = (windows == word).any(axis=1)
        if use_cache:
            HIT_TABLE[word] = hits
        return hits

    hit_a = _hits(a)
    hit_b = _hits(b)
    joint = int((hit_a & hit_b).sum())
    if joint == 0:
        # log(0) would be -inf; positive PMI is defined as 0 here.
        ppmi = 0.0
    else:
        pa = hit_a.sum() / total
        pb = hit_b.sum() / total
        pab = joint / total
        # Parenthesised denominator: `pab / pa * pb` would evaluate (pab / pa) * pb.
        ppmi = max(0.0, float(np.log(pab / (pa * pb))))
    if use_cache:
        PPMI_TABLE[(a, b)] = ppmi
    return ppmi

In [23]:
# Accumulators for the word-word edges: source ids, destination ids, weights.
# NOTE: the loop that fills them only appends, so re-run this cell before
# re-running that loop to avoid duplicated entries.
src_nodes = []
dst_nodes = []
edge_feats = []

In [24]:
# For every window, emit one edge per ordered pair of positions (self-pairs and
# both directions included), weighted by PPMI of the two word indices.
for window in tqdm(windows_list):
    for src_word in window:
        for dst_word in window:
            weight = PPMI(src_word, dst_word)
            src_nodes.append(src_word)
            dst_nodes.append(dst_word)
            edge_feats.append(weight)
# Add all collected word-word edges in a single call.
he.add_edges(
    src_nodes,
    dst_nodes,
    data={'w': torch.tensor(edge_feats, dtype=torch.float32)},
    etype=WORD_OCCUR,
)

100%|██████████| 84414/84414 [02:34<00:00, 545.10it/s] 


In [25]:
# Collapse parallel edges (the same word pair is emitted by many windows) into
# single edges, carrying edge features over (copy_edata=True).
he = dgl.to_simple(he, copy_edata=True)

In [26]:
# Persist the finished graph inside the dataset directory. The path is derived
# from ROOT and WINDOW_SIZE rather than repeated as literals, so the file name
# always matches the window size the graph was built with.
dgl.save_graphs(filename=os.path.join(ROOT, f'TransductiveWin{WINDOW_SIZE}.bin'), g_list=[he])