In [1]:
import numpy as np
import pandas as pd
from NLPUtils.preprocessUtils import removePunctuation, removeWord
from NLPUtils.DataModel import Vocabulary
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from tqdm import tqdm

In [2]:
# Stop words that are filtered out of every sentence during preprocessing.
removeWordList = stopwords.words('english')

# Every encoded sentence is cut or padded to exactly this many tokens.
MAXLEN = 30

# Special tokens handed to the Vocabulary constructor; '<PAD>' maps to 0.
TOKEN = {'<PAD>': 0}

In [3]:
# Training split: tab-separated with no header row, so columns are addressed
# by position (column 1 is what the preprocessing loop reads as the text).
df = pd.read_csv(
    './R8/train.txt',
    sep='\t',
    header=None,
    encoding='utf-8',
)

In [4]:
# --- 1. Clean every raw sentence: strip punctuation, then drop stop words.
train_sentence = [
    removeWord(
        removeWordList=removeWordList,
        sentence=removePunctuation(sentence=raw_sentence),
    )
    for raw_sentence in df[1]
]

# --- 2. Bag-of-words counts and their TF-IDF re-weighting.
# token_pattern=r'\S+' treats every whitespace-delimited chunk as a token,
# which matches the str.split() tokenisation used for encoding below.
vector = CountVectorizer(token_pattern=r'\S+')
table = vector.fit_transform(train_sentence)
tfidf = TfidfTransformer()
tfidfTable = tfidf.fit_transform(table)

# --- 3. Vocabulary: the special tokens plus every feature the vectorizer learnt.
voc = Vocabulary(TOKENS=TOKEN)
voc.addWordList(vector.get_feature_names_out().tolist())

# --- 4. Encode each sentence as exactly MAXLEN vocabulary indices.
pad_idx = voc.word2idx['<PAD>']
train_idxs = []
for cleaned in train_sentence:
    # CountVectorizer lowercases documents by default, so the feature names
    # added to the vocabulary are lowercase. Lowercase the lookup key as well,
    # otherwise any capitalised token would raise KeyError here. This is a
    # no-op when the preprocessing helpers already lowercase the text.
    encoded = [voc.word2idx[word.lower()] for word in cleaned.split()[:MAXLEN]]
    # Right-pad short sentences; the slice above already capped long ones.
    encoded += [pad_idx] * (MAXLEN - len(encoded))
    train_idxs.append(encoded)

x_train = np.array(train_idxs, dtype=np.int64)

# --- 5. Sliding co-occurrence windows over the encoded sentences.
# NOTE(review): windows are cut from the *padded* rows, so the '<PAD>' index
# takes part in co-occurrence statistics for short sentences - confirm that
# this is intended before using the resulting graph.
WINDOW_SIZE = 3
windows_list = np.array([
    row[start:start + WINDOW_SIZE]
    for row in x_train
    for start in range(len(row) - WINDOW_SIZE + 1)
])

In [5]:
# Memo of already-computed scores, keyed by the ordered pair (smaller, larger).
PPMI_TABLE = {}
# Per-word boolean mask over windows_list: True where the word occurs in a window.
HIT_TABLE = {}


def _window_hits(word):
    """Return (and cache in HIT_TABLE) the per-window occurrence mask of ``word``."""
    if word not in HIT_TABLE:
        HIT_TABLE[word] = (windows_list == word).any(axis=1)
    return HIT_TABLE[word]


def PPMI(a, b):
    """Positive pointwise mutual information of two word ids over ``windows_list``.

    With p(x) the fraction of windows containing x and p(a, b) the fraction
    containing both, the score is ``max(0, log(p(a, b) / (p(a) * p(b))))``.

    Parameters
    ----------
    a, b : int
        Word ids as stored in ``windows_list``. The score is symmetric, so the
        pair is normalised to (smaller, larger) before caching.

    Returns
    -------
    float
        ``1.0`` when ``a == b`` (self-loop weight, not cached), ``0.0`` when
        the two ids never share a window or their PMI is negative, otherwise
        the PMI value. Results are memoised in ``PPMI_TABLE``.
    """
    if a == b:
        return 1.
    if a > b:
        a, b = b, a
    key = (a, b)
    # Explicit membership test instead of a bare ``except`` so that genuine
    # errors (e.g. an unhashable id) are not silently swallowed.
    if key in PPMI_TABLE:
        return PPMI_TABLE[key]

    hit_a = _window_hits(a)
    hit_b = _window_hits(b)
    n_windows = len(windows_list)
    count_a = int(hit_a.sum())
    count_b = int(hit_b.sum())
    count_ab = int((hit_a & hit_b).sum())

    if count_ab == 0:
        # No co-occurrence: PMI would be log(0) = -inf, which clamps to 0.
        score = 0.0
    else:
        # p(a,b) / (p(a) * p(b)) == count_ab * N / (count_a * count_b).
        # The denominator must be the *product* of the marginals; dividing by
        # p(a) and then multiplying by p(b) is a different quantity.
        pmi = float(np.log(count_ab * n_windows / (count_a * count_b)))
        score = max(pmi, 0.0)

    PPMI_TABLE[key] = score
    return score

In [6]:
# Map every ordered pair of word ids that share a window (self-pairs included)
# to its PPMI score; each pair is scored only the first time it is seen.
wordNodes = {}
for window in tqdm(windows_list):
    for left in window:
        for right in window:
            pair = (left, right)
            if pair in wordNodes:
                continue
            wordNodes[pair] = PPMI(left, right)

# Unzip the mapping into parallel source / destination / weight lists.
src_nodes, dst_nodes, edge_feats = [], [], []
for (src, dst), weight in tqdm(wordNodes.items()):
    src_nodes.append(src)
    dst_nodes.append(dst)
    edge_feats.append(weight)

100%|██████████| 153580/153580 [00:55<00:00, 2757.34it/s]
100%|██████████| 244110/244110 [00:00<00:00, 3089282.25it/s]


In [7]:
# Collect (source, destination, weight) triples for every ordered pair of word
# ids sharing a window; the set collapses pairs seen in more than one window.
wordNodes = {
    (left, right, PPMI(left, right))
    for window in tqdm(windows_list)
    for left in window
    for right in window
}

# Unzip the triples into parallel source / destination / weight lists.
src_nodes, dst_nodes, edge_feats = [], [], []
for edge in tqdm(wordNodes):
    src, dst, weight = edge
    src_nodes.append(src)
    dst_nodes.append(dst)
    edge_feats.append(weight)

100%|██████████| 153580/153580 [00:01<00:00, 128168.02it/s]
100%|██████████| 244110/244110 [00:00<00:00, 2068264.42it/s]
