In [1]:
import torch
import dgl
import numpy as np
import pandas as pd
import os
import pickle
from NLPUtils.preprocessUtils import removePunctuation, removeWord
from NLPUtils.DataModel import Vocabulary
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset location and preprocessing settings shared by the cells below.
# The directory is built with os.path.join so it resolves on POSIX as well as on
# Windows (the literal '.\\R8' is only a directory path on Windows; on Windows the
# joined value is byte-identical to it).
ROOT = os.path.join('.', 'R8')
TRAIN = 'train.txt'  # file name of the training split inside ROOT
TEST = 'test.txt'    # file name of the test split inside ROOT
MAXLEN = 30          # number of token ids kept per document (truncate / pad target)
TOKEN = {'<PAD>' : 0, '<UNK>' : 1}  # special tokens handed to Vocabulary, with their indices

In [3]:
# English stop words that are stripped from every document.
removeWordList = stopwords.words('english')

# Each split is a headerless, tab-separated file; the cells below read column 0 as the
# label and column 1 as the document text.
df_train = pd.read_csv(os.path.join(ROOT, TRAIN), encoding='utf-8', sep='\t', header=None)
# Fix: load the test split from TEST. It was previously read from TRAIN, which made the
# "test" data an exact copy of the training data.
df_test = pd.read_csv(os.path.join(ROOT, TEST), encoding='utf-8', sep='\t', header=None)
# Integer id per label, numbered in order of first appearance in the training split.
label2idx = {label : i for i, label in enumerate(df_train[0].unique())}

In [4]:
# Create the vocabulary object, passing it the special-token table.
# NOTE(review): the same construction is repeated in a later cell right before the
# words are added, which rebinds `voc`; this cell looks redundant.
voc = Vocabulary(TOKENS=TOKEN)

In [5]:
def _clean_sentences(sentences):
    """Return the cleaned version of every sentence in ``sentences``.

    Each sentence is passed through ``removePunctuation`` and then through
    ``removeWord`` with the notebook-level ``removeWordList``. The output list
    keeps the input order and length.
    """
    cleaned = []
    for sentence in sentences:
        sentence = removePunctuation(sentence=sentence)
        cleaned.append(removeWord(removeWordList=removeWordList, sentence=sentence))
    return cleaned


# Both splits go through the same cleaning; column 1 holds the document text.
train_sentence = _clean_sentences(df_train[1])
test_sentence = _clean_sentences(df_test[1])

In [6]:
# Compute the tf-idf weights; the fitted vectorizer is also the source of the vocabulary.
# token_pattern=r'\S+' treats every run of non-whitespace characters as one token.
vector = CountVectorizer(token_pattern=r'\S+')
# Term counts per document, fitted on the training sentences only.
table = vector.fit_transform(train_sentence)
tfidf = TfidfTransformer()
# Indexed later as tfidfTable[document index, term column].
tfidfTable = tfidf.fit_transform(table)


In [7]:
# Fresh vocabulary created from the special-token table, then extended with every term
# the CountVectorizer learned from the training sentences.
voc = Vocabulary(TOKENS=TOKEN)
# NOTE(review): a later cell subtracts DIC_OFFSET from a word index to get its tf-idf
# column, so it presumably relies on addWordList numbering the words in feature order
# right after the special tokens — confirm against Vocabulary.
voc.addWordList(vector.get_feature_names_out().tolist())

In [8]:
# Encode every sentence as a fixed-length list of vocabulary indices.
def _encode_sentence(sentence):
    """Turn one cleaned sentence into exactly ``MAXLEN`` vocabulary indices.

    The sentence is split on whitespace and truncated to ``MAXLEN`` tokens.
    Tokens that ``voc`` does not know are mapped to ``<UNK>``, and sentences
    shorter than ``MAXLEN`` are right-padded with ``<PAD>``.
    """
    unk_idx = voc.word2idx['<UNK>']
    pad_idx = voc.word2idx['<PAD>']
    tokens = sentence.split()[:MAXLEN]
    ids = [voc.word2idx[token] if voc.has(token) else unk_idx for token in tokens]
    return ids + [pad_idx] * (MAXLEN - len(ids))


# The training split used to do a bare dictionary lookup, which raises KeyError as soon
# as a token differs from the form the vectorizer stored. Both splits now share the same
# encoder with the <UNK> fallback; the result is unchanged whenever every training token
# is in the vocabulary.
train_idxs = [_encode_sentence(sentence) for sentence in train_sentence]
test_idxs = [_encode_sentence(sentence) for sentence in test_sentence]

In [9]:
# Stack the encoded documents into int64 arrays, one row per document. Every row has
# MAXLEN entries because each sentence was truncated / padded to that length.
x_train = np.array(train_idxs, dtype=np.int64)
x_test = np.array(test_idxs, dtype=np.int64)

In [10]:
# Node type names of the heterograph.
DOC = 'doc'
WORD = 'word'
# Relation (edge type) names.
IN  = 'in'
OCCUR = 'occur'
PRESERVE = 'preserve'
# Canonical edge types, written as (source node type, relation, destination node type).
DOC_PRESERVE = (DOC, PRESERVE, DOC)
# Despite its name, this edge type points from a word node to a document node.
DOC_CONTAIN = (WORD, IN, DOC)
WORD_OCCUR = (WORD, OCCUR, WORD)
# Subtracted from a vocabulary index to obtain the matching tf-idf column; the value
# equals the number of special tokens in TOKEN.
DIC_OFFSET = 2

In [11]:
# Build the initial graph description: declare the three edge types, each with empty
# (source ids, destination ids) lists.
graph_data = {
    DOC_CONTAIN : ([], []),
    DOC_PRESERVE : ([], []),
    WORD_OCCUR: ([], [])
}

In [12]:
# Instantiate the heterograph from the edge-less declaration; the edges themselves are
# added by the cells that follow.
he : dgl.DGLHeteroGraph = dgl.heterograph(graph_data)

In [13]:
# Connect every token of a training document to that document, weighted by tf-idf.
for doc_idx, doc in enumerate(x_train):
    # Indices below DIC_OFFSET are the special tokens (<PAD> / <UNK>), which have no
    # tf-idf column. The earlier code still evaluated `word_idx - DIC_OFFSET` for them;
    # the negative result wrapped around and read an unrelated term's weight from the end
    # of the matrix. These edges are kept, so the graph structure is unchanged, but they
    # now carry weight 0.
    weights = [
        float(tfidfTable[doc_idx, word_idx - DIC_OFFSET]) if word_idx >= DIC_OFFSET else 0.0
        for word_idx in doc
    ]
    he.add_edges(
        doc,
        [doc_idx] * len(doc),
        data={'w': torch.tensor(weights, dtype=torch.float32)},
        etype=DOC_CONTAIN,
    )
# Collapse the parallel edges produced by repeated tokens, keeping the edge data.
he = dgl.to_simple(he, copy_edata=True)
# Add a self-loop on every document node.
he = dgl.add_self_loop(he, etype = DOC_PRESERVE)

In [14]:
# Sanity check: print the (source ids, destination ids) tensors of the word -> doc edges
# and of the document self-loops.
print(f'Edge {DOC_CONTAIN} : {he.edges(etype = DOC_CONTAIN)}')
print(f'Edge {DOC_PRESERVE} : {he.edges(etype = PRESERVE)}')

Edge ('word', 'in', 'doc') : (tensor([    0,     0,     0,  ..., 19840, 19840, 19841]), tensor([   9,   17,   18,  ...,  782,  783, 2282]))
Edge ('doc', 'preserve', 'doc') : (tensor([   0,    1,    2,  ..., 5482, 5483, 5484]), tensor([   0,    1,    2,  ..., 5482, 5483, 5484]))


In [15]:
# Give every document self-loop the weight 1.0.
# NOTE(review): sizing the tensor with len(x_train) assumes there is exactly one
# 'preserve' edge per training document — confirm if the graph construction changes.
he.edges[PRESERVE].data['w'] = torch.ones(size=(len(x_train),), dtype=torch.float32)

# PPMI

In [16]:
# PPMI: collect every sliding window of WINDOW_SIZE consecutive token ids.
WINDOW_SIZE = 3
# One row per window, taken from every start position of every encoded training document.
# NOTE(review): x_train rows are padded to MAXLEN, so windows over the padded tail are
# included here as well.
windows_list = np.array([
    row[start : start + WINDOW_SIZE]
    for row in x_train
    for start in range(len(row) - WINDOW_SIZE + 1)
])

In [17]:
# Cache of already computed scores, keyed by the ordered pair (smaller id, larger id).
PPMI_TABLE = {}
def PPMI(a, b):
    """Return the positive pointwise mutual information of word ids ``a`` and ``b``.

    Probabilities are estimated over the rows of the notebook-level ``windows_list``:
    p(x) is the fraction of windows that contain x, and p(a, b) the fraction that
    contain both. The score is ``max(0, log(p(a, b) / (p(a) * p(b))))`` and is 0.0
    when the two ids never share a window. The function is symmetric in its
    arguments and memoises results in ``PPMI_TABLE``.

    The previous formula, ``log(exp(pab / pa * pb))``, reduced to ``pab * pb / pa``,
    which is neither PMI nor clipped at zero.
    """
    # Order the pair so (a, b) and (b, a) share one cache entry.
    if a > b:
        a, b = b, a
    key = (a, b)
    if key in PPMI_TABLE:
        return PPMI_TABLE[key]
    total = len(windows_list)
    # Row-wise membership masks, computed once and reused for the joint count.
    has_a = (windows_list == a).any(axis=1)
    has_b = (windows_list == b).any(axis=1)
    n_a = int(has_a.sum())
    n_b = int(has_b.sum())
    n_ab = int((has_a & has_b).sum())
    if n_ab == 0:
        # No co-occurrence: PMI would be -inf, which clips to 0.
        ppmi = 0.0
    else:
        # p_ab / (p_a * p_b) with p_x = n_x / total simplifies to n_ab * total / (n_a * n_b).
        ppmi = max(0.0, float(np.log(n_ab * total / (n_a * n_b))))
    PPMI_TABLE[key] = ppmi
    return ppmi

In [18]:
# Add word -> word co-occurrence edges for every ordered pair of positions in each window.
for window in tqdm(windows_list):
    for src_pos, src_word in enumerate(window):
        for dst_pos, dst_word in enumerate(window):
            # Each (source, destination) pair gets at most one edge.
            if he.has_edges_between(src_word, dst_word, etype=WORD_OCCUR):
                continue
            # Same position -> self-loop with weight 1; any other pair is weighted by PPMI.
            weight = 1. if src_pos == dst_pos else PPMI(src_word, dst_word)
            he.add_edges(src_word, dst_word, data={'w':torch.tensor([weight], dtype=torch.float32)}, etype=WORD_OCCUR)

100%|██████████| 153580/153580 [36:47<00:00, 69.56it/s] 


In [19]:
# Persist the finished heterograph to disk with DGL's graph serializer.
dgl.save_graphs(filename='./R8/Transductive.bin', g_list = [he])