In [1]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from model import Vocaburary
from utils import clean_str, remove_stopwords, nomalize_Adj
import dgl.sparse as dglsp
from IPython.display import clear_output

In [2]:
# Width (in tokens) of the sliding window used when counting word co-occurrences.
window_size = 20
# Class counts to build a dataset for: 2, 3, ..., 52 (upper bound is exclusive).
label_range = range(2, 52 + 1)

In [3]:
# Directory holding the preprocessed R52 CSV files; the generated graph files
# are written here as well.
# Built with os.path.join so the separator matches the host OS. The trailing ''
# keeps a trailing separator, so on Windows the value is still
# '.\\R52PreprocessData\\' exactly as before, while on POSIX it becomes
# './R52PreprocessData/' instead of a single oddly named path component.
root = os.path.join('.', 'R52PreprocessData', '')
train_path = os.path.join(root, "train_data.csv")
test_path = os.path.join(root, "test_data.csv")

In [4]:
# Load the pre-split training and test tables from their CSV files.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [5]:
# Normalise every document: first clean_str, then remove_stopwords, applied
# element-wise to the 'text' column of both splits.
train_df['text'] = train_df['text'].map(clean_str).map(remove_stopwords)
test_df['text'] = test_df['text'].map(clean_str).map(remove_stopwords)

In [6]:
# Accumulator keyed by label count; written to by the dataset-building loop.
dic_datas = dict()

In [7]:
# For every class count in `label_range`, build one heterogeneous
# document / word / label graph and save it to `<root>/R<label_num>.gh`.
#
# Each saved dict holds:
#   voc         - the Vocaburary built from the selected train + test documents
#   train_word  - set of token ids seen in the training documents
#   test_word   - set of token ids seen in the test documents
#   whole_graph - sparse (D+W+L) x (D+W+L) adjacency, node order: docs, words, labels
#   doc_Y       - remapped class id of every document (train rows first, then test)
#   word_Y      - per-word count distribution over labels (L x W)
#   label_Y     - 0..L-1, the class id of every label node
#   train_mask  - True for the training documents
#   D, W, L     - number of document / word / label nodes
for label_num in label_range:
    clear_output(wait = True)
    print(f"Label_Num = {label_num}")
    save_path = os.path.join(root, f'R{label_num}.gh')

    # ---- Select the documents of the first `label_num` classes ----
    # random_select_target = np.random.choice(52, size = label_num, replace = False)
    random_select_target = np.arange(label_num)
    sm_train_df = train_df[train_df['target'].isin(random_select_target)].copy()
    sm_test_df = test_df[test_df['target'].isin(random_select_target)].copy()

    # ---- Vocabulary over train + test text ----
    word_count = {}
    for text in tqdm(sm_train_df['text'], desc='Counting Training Word...'):
        for word in text.split():
            word_count[word] = word_count.get(word, 0) + 1
    for text in tqdm(sm_test_df['text'], desc='Counting Testing Word...'):
        for word in text.split():
            word_count[word] = word_count.get(word, 0) + 1
    voc = Vocaburary(word_count=word_count)

    sm_train_df['ids'] = sm_train_df['text'].map(lambda x : voc.encode(x.split()))
    sm_test_df['ids'] = sm_test_df['text'].map(lambda x : voc.encode(x.split()))

    # Token ids occurring in each split.
    train_word_set = set()
    test_word_set = set()
    for ids in tqdm(sm_train_df['ids'], desc='constrcuting train_word_set...'):
        train_word_set.update(ids)
    for ids in tqdm(sm_test_df['ids'], desc='constructing test_word_set...'):
        test_word_set.update(ids)

    # ---- Merge splits (train rows first) and remap targets to 0..k-1 ----
    ALL_df = pd.concat([sm_train_df, sm_test_df], axis=0, ignore_index=True)
    # Labels are assigned in order of first appearance in ALL_df.
    map_to_label = {target : label for label, target in enumerate(ALL_df['target'].unique())}
    ALL_df['target'] = ALL_df['target'].map(lambda x : map_to_label[x])
    # Kept in its own name so the loop variable `label_num` is not overwritten
    # (it is used below as the key into `dic_datas`).
    num_labels = int(ALL_df['target'].unique().max()) + 1

    # ---- Doc-word and label-word edge lists (one entry per token occurrence) ----
    doc_word_graph = {"doc_node": [],
                "word_node" : []}
    label_word_graph = {"label_node" : [],
                        "word_node" : []}
    for doc_id, (label_id, ids) in enumerate(tqdm(zip(ALL_df['target'], ALL_df['ids']))):
        data = ids
        doc_word_graph['doc_node'] += [doc_id] * len(data)
        doc_word_graph['word_node'] += data

        label_word_graph['label_node'] += [label_id] * len(data)
        label_word_graph['word_node'] += data
    doc_num = len(ALL_df)

    doc_word_mat = dglsp.spmatrix(
        indices = torch.tensor([doc_word_graph['doc_node'], doc_word_graph['word_node']]),
        shape = (doc_num, len(voc))
        )

    label_word_mat = dglsp.spmatrix(
        indices = torch.tensor([label_word_graph['label_node'], label_word_graph['word_node']]),
        shape = (num_labels, len(voc))
        )

    # ---- TF-IDF weights ----
    # NOTE(review): this relies on to_dense() of the un-coalesced matrix giving a
    # 0/1 "contains" matrix while coalesce().to_dense() gives occurrence counts
    # -- confirm against the installed DGL version.
    containMat = doc_word_mat.to_dense()
    countMat = doc_word_mat.coalesce().to_dense()
    # The 1e-9 terms guard against division by zero for empty rows / unused words.
    tf = (countMat.T / (countMat.sum(dim=1) + 1e-9)).T
    idf = torch.log10(containMat.shape[0] / (containMat.sum(dim=0) + 1e-9))
    doc_word_tfidf = (tf * idf).to_sparse()

    containMat = label_word_mat.to_dense()
    countMat = label_word_mat.coalesce().to_dense()
    tf = (countMat.T / (countMat.sum(dim=1) + 1e-9)).T
    idf = torch.log10(containMat.shape[0] / (containMat.sum(dim=0) + 1e-9))
    label_word_tfidf = (tf * idf).to_sparse()

    # Each word's label counts divided by that word's total count.
    # NOTE(review): no epsilon here, so a vocabulary id that never occurs
    # yields 0/0 = NaN in its column -- confirm downstream code tolerates that.
    word_Y = countMat / countMat.T.sum(dim=1)

    # ---- Sliding-window co-occurrence counts ----
    # Every document contributes at least one window, even if shorter than window_size.
    src_dst_nodes = {}
    total_window = 0.
    for ids in tqdm(ALL_df['ids'], desc="Constructing Graph..."):
        for w in range(max(len(ids) - window_size + 1, 1)):
            window = set(ids[w : w + window_size])
            for i in window:
                for j in window:
                    src_dst_nodes[(i, j)] = src_dst_nodes.get((i, j), 0) + 1
            total_window += 1.

    src_nodes = []
    dst_nodes = []
    values = []
    for (i, j), pair_count in tqdm(src_dst_nodes.items(), desc='Building Graph...'):
        src_nodes.append(i)
        dst_nodes.append(j)
        values.append(pair_count)

    co_occurMat = dglsp.spmatrix(
        torch.tensor([src_nodes, dst_nodes]),
        val= torch.tensor(values, dtype=torch.float32),
        shape=(len(voc), len(voc))
        )

    # ---- Word-word weights: log10(p(i,j) / (p(i) * p(j)) + 1) ----
    p_mat = (co_occurMat.to_dense() / total_window)
    # The diagonal holds p(i): (i, i) is counted once per window containing i.
    p_diag = torch.diag(p_mat)
    p_diag = p_diag.unsqueeze(1) @ p_diag.unsqueeze(0)
    word_word_PMI = (torch.log10(p_mat/(p_diag + 1e-9) + 1)).to_sparse()
    word_word_PMI = nomalize_Adj(word_word_PMI)

    # ---- Assemble the full adjacency; node order is [docs | words | labels] ----
    D = doc_word_tfidf.shape[0]
    W = word_word_PMI.shape[0]
    L = label_word_tfidf.shape[0]
    A = D + W + L
    whole_graph = torch.zeros(size = (A, A))
    whole_graph[:D, :D] = torch.eye(D) # doc-doc identity
    whole_graph[:D,D:A-L] = doc_word_tfidf.to_dense() # tf-idf doc
    whole_graph[D:A-L, :D] = doc_word_tfidf.T.to_dense() # tf-idf doc T
    whole_graph[D:A-L, D:A-L] = word_word_PMI.to_dense() # word-word PMI
    whole_graph[D:A-L, D+W:] = label_word_tfidf.T.to_dense() # label-word T
    whole_graph[D+W:, D:A-L] = label_word_tfidf.to_dense() #label-word
    whole_graph[D+W:, D+W:] = torch.eye(L) # label-label identity
    whole_graph = whole_graph.to_sparse()

    # ---- Targets and train mask ----
    doc_Y = torch.tensor(ALL_df['target'].to_list(), dtype=torch.int64)
    label_Y = torch.arange(0, L, 1, dtype = torch.int64)
    # Training rows come first in ALL_df; the mask starts all-False.
    train_mask = torch.zeros(size=(len(ALL_df), ), dtype=torch.bool)
    train_mask[:len(sm_train_df)] = True

    dic_data = {
        "voc":voc,
        'train_word' : train_word_set,
        'test_word': test_word_set,
        "whole_graph": whole_graph,
        "doc_Y":doc_Y,
        'word_Y': word_Y,
        "label_Y":label_Y,
        "train_mask": train_mask,
        "D":D,
        "W":W,
        "L":L
    }
    torch.save(dic_data, save_path)
    # Store this iteration's result. The original stored `dic_datas` into
    # itself here, leaving a self-referential dict with no results in it.
    dic_datas[label_num] = dic_data
print('**************')
print('Done!')

Label_Num = 52


Counting Training Word...: 100%|██████████| 6532/6532 [00:00<00:00, 72561.70it/s]
Counting Testing Word...: 100%|██████████| 2568/2568 [00:00<00:00, 77803.02it/s]
constrcuting train_word_set...: 100%|██████████| 6532/6532 [00:00<00:00, 343723.81it/s]
constructing test_word_set...: 100%|██████████| 2568/2568 [00:00<00:00, 427911.99it/s]
9100it [00:00, 151632.47it/s]
Constructing Graph...: 100%|██████████| 9100/9100 [00:33<00:00, 273.75it/s]
Building Graph...: 100%|██████████| 5738145/5738145 [00:02<00:00, 2172106.91it/s]


**************
Done!
