In [1]:
import torch
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import dgl.sparse as dglsp
from nltk.corpus import stopwords
from utils import clean_str, remove_stopwords, nomalize_Adj
from model import Vocaburary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Corpus to process; it is joined onto ROOT to locate the input CSV files.
dataset = 'mr'

# Input layout: ROOT/<dataset>/TRAIN_DATA and ROOT/<dataset>/TEST_DATA.
ROOT = './ProcessedData'
TRAIN_DATA, TEST_DATA = 'train_data.csv', 'test_data.csv'

# NOTE(review): TARGET_DATA and WINDOW_SIZE are not used by the cells visible
# here; presumably the output graph file name and a co-occurrence window
# length consumed further down -- confirm.
TARGET_DATA = 'WholeGraphDict.gh'
WINDOW_SIZE = 20

In [3]:
# Report which corpus is being processed, then read its two splits from
# ROOT/<dataset>/.
print(f'Current Dataset = {dataset}')
file_path = os.path.join(ROOT, dataset)
# index_col=False tells pandas not to use the first CSV column as the index.
train_df, test_df = (
    pd.read_csv(os.path.join(file_path, csv_name), index_col=False)
    for csv_name in (TRAIN_DATA, TEST_DATA)
)

Current Dataset = mr


In [4]:
# Clean the raw text of both splits with the project's clean_str helper;
# every dataset except 'mr' additionally has its stopwords removed.
for split_df in (train_df, test_df):
    split_df['text'] = split_df['text'].map(clean_str)
if dataset != 'mr':
    for split_df in (train_df, test_df):
        split_df['text'] = split_df['text'].map(remove_stopwords)

# Count whitespace-separated tokens over train first, then test, so the
# dictionary's insertion order follows first appearance in that sequence.
word_count = {}
for split_df in (train_df, test_df):
    for sentence in tqdm(split_df['text'], desc='Counting Word...'):
        for token in sentence.split():
            word_count[token] = word_count.get(token, 0) + 1

# 'mr' builds the vocabulary with Vocaburary's default settings; all other
# datasets pass min_time=5.
# NOTE(review): min_time is presumably a minimum-frequency cut-off -- confirm
# against model.Vocaburary.
voc = (
    Vocaburary(word_count=word_count)
    if dataset == 'mr'
    else Vocaburary(word_count=word_count, min_time=5)
)

# Encode every document's token list into the 'ids' column via voc.encode.
for split_df in (train_df, test_df):
    split_df['ids'] = split_df['text'].map(lambda sentence: voc.encode(sentence.split()))

Counting Word...: 100%|██████████| 7108/7108 [00:00<00:00, 253441.75it/s]
Counting Word...: 100%|██████████| 3554/3554 [00:00<00:00, 273309.19it/s]


In [5]:
# Collect the distinct vocabulary ids that occur in each split.
# set.update() already de-duplicates, so no per-document set is built first
# (the original created an `id_set` on every iteration and never read it).
train_word_set = set()
for ids in tqdm(train_df['ids'], desc='constrcuting train_word_set...'):
    train_word_set.update(ids)

test_word_set = set()
for ids in tqdm(test_df['ids'], desc='constructing test_word_set...'):
    test_word_set.update(ids)

constrcuting train_word_set...: 100%|██████████| 7108/7108 [00:00<00:00, 789688.58it/s]
constructing test_word_set...: 100%|██████████| 3554/3554 [00:00<00:00, 645221.68it/s]


In [6]:
# Number of label rows for the label-by-word matrix: largest training label
# plus one.
# NOTE(review): assumes `target` holds integer class ids starting at 0 -- confirm.
# Series.max() needs no prior unique() pass, and int() yields a plain Python
# int for use as a tensor dimension.
label_num = int(train_df['target'].max()) + 1

In [7]:
# Emit one (label, word-id) pair per token occurrence in the training split.
train_word_label = {'label_node': [], 'word_node': []}
for doc_label, doc_ids in tqdm(zip(train_df['target'], train_df['ids'])):
    train_word_label['label_node'].extend([doc_label] * len(doc_ids))
    train_word_label['word_node'].extend(doc_ids)

# Every pair carries weight 1; coalesce() sums duplicate coordinates, so the
# dense matrix holds how often each word id occurs under each label.
pair_total = len(train_word_label['word_node'])
label_word_mat = torch.sparse_coo_tensor(
    indices=[train_word_label['label_node'], train_word_label['word_node']],
    values=[1.] * pair_total,
    size=(label_num, len(voc)),
)
countMat = label_word_mat.coalesce().to_dense()

# Divide each column by that word's total count over all labels, giving the
# share of the word's occurrences that fall under each label. Words with no
# training occurrences produce 0/0 = NaN, which nan_to_num() maps to 0.
word_totals = countMat.sum(dim=0)
train_word = (countMat / word_totals).nan_to_num()

7108it [00:00, 710699.01it/s]


In [8]:
# Display the (label_num x vocabulary) matrix computed above: entry [l, w] is
# the fraction of word w's training occurrences that belong to label l.
train_word

tensor([[0.3333, 0.2500, 1.0000,  ..., 0.4545, 1.0000, 0.4815],
        [0.6667, 0.7500, 0.0000,  ..., 0.5455, 0.0000, 0.5185]])