<a href="https://colab.research.google.com/github/armancohan/cpsc477-internal/blob/main/hw3/part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CPSC 477/577 Project Spring 2024

## Name and NetID


Group Member: Shurui Wang; Lang Ding; Weiyi You

In [16]:
# do not run if you are using school cluster
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# do not run if you are using school cluster
!pip install klepto



In [2]:
# util.py
from collections import OrderedDict
import datetime
import klepto
from os.path import dirname, abspath, join, expanduser, isfile, exists
from os import environ, makedirs, getcwd
import pytz
import re
from socket import gethostname

def ensure_prefix(filepath):
    # Define the base path prefix
    prefix = '/gpfs/gibbs/project/cpsc477/cpsc477_sw2349/content/MyDrive/'

    # Check if the prefix is already in the filepath
    if not filepath.startswith(prefix):
        # If not, add the prefix
        filepath = join(prefix, filepath.lstrip('/'))  # Ensure no leading '/' to avoid treating it as absolute

    return filepath


def get_root_path():
    return getcwd()  # 获取并返回当前工作目录

def get_data_path():
    return join(get_root_path(), '/CPSC_577_FP')


def get_corpus_path():
    return join(get_data_path())


def get_save_path():
    return join(get_root_path(), '/CPSC_577_FP/save')


def load(filepath, print_msg=True):
    fp = proc_filepath(filepath)
    if isfile(fp):
        return load_klepto(fp, print_msg)
    elif print_msg:
        print('Trying to load but no file {}'.format(fp))


def load_klepto(filepath, print_msg):
    rtn = klepto.archives.file_archive(filepath)
    rtn.load()
    global logs
    if logs:
        if print_msg:
            print('Loaded from {}'.format(filepath))
    return rtn


def save(obj, filepath, print_msg=True):
    filepath = ensure_prefix(filepath)
    fp = proc_filepath(filepath, ext='.klepto')
    create_dir_if_not_exists(dirname(filepath))
    save_klepto(obj, fp, print_msg)


def create_dir_if_not_exists(dir):
    if not exists(dir):
        makedirs(dir)


def save_klepto(dic, filepath, print_msg):
    global logs
    if logs:
        print('filepath to save:', filepath)
        if print_msg:
            print('Saving to {}'.format(filepath))
    klepto.archives.file_archive(filepath, dict=dic).dump()


def proc_filepath(filepath, ext='.klepto'):
    global logs
    if logs:
        print('filepath:', filepath)
    filepath = ensure_prefix(filepath)
    if logs:
        print('filepath:', filepath)
    if type(filepath) is not str:
        raise RuntimeError('Did you pass a file path to this function?')
    return append_ext_to_filepath(ext, filepath)


def append_ext_to_filepath(ext, fp):
    if not fp.endswith(ext):
        fp += ext
    return fp


def parse_as_int_list(il):
    rtn = []
    for x in il.split('_'):
        x = int(x)
        rtn.append(x)
    return rtn


def get_user():
    try:
        home_user = expanduser("~").split('/')[-1]
    except:
        home_user = 'user'
    return home_user


def get_host():
    host = environ.get('HOSTNAME')
    if host is not None:
        return host
    return gethostname()

tstamp = None


def get_ts():
    global tstamp
    if not tstamp:
        tstamp = get_current_ts()
    return tstamp


def get_current_ts(zone='US/Pacific'):
    return datetime.datetime.now(pytz.timezone(zone)).strftime(
        '%Y-%m-%dT%H-%M-%S.%f')


def sorted_nicely(l, reverse=False):
    def tryint(s):
        try:
            return int(s)
        except:
            return s

    def alphanum_key(s):
        if type(s) is not str:
            raise ValueError('{} must be a string in l: {}'.format(s, l))
        return [tryint(c) for c in re.split('([0-9]+)', s)]

    rtn = sorted(l, key=alphanum_key)
    if reverse:
        rtn = reversed(rtn)
    return rtn

In [3]:
!pip install torch-scatter torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu113.html

Looking in links: https://data.pyg.org/whl/torch-1.12.0+cu113.html


In [4]:
# dataset.py
import numpy as np
import random
import torch
from torch_geometric.data import Data as PyGSingleGraphData
import scipy.sparse as sp

def to_device(data, device):
    if isinstance(data, torch.Tensor):
        return data.to(device)
    elif isinstance(data, dict):
        return {key: to_device(value, device) for key, value in data.items()}
    elif isinstance(data, list):
        return [to_device(x, device) for x in data]
    else:
        return data


class TextDataset(object):
    def __init__(self, name, sparse_graph, labels, vocab, word_id_map, docs_dict, loaded_dict, tvt='all',
                 train_test_split=None):
        if loaded_dict is not None:  # restore from content loaded from disk
            self.__dict__ = loaded_dict
            return
        self.name = name
        self.graph = sparse_graph
        self.labels = labels
        if 'twitter_asian_prejudice' in name:
            if 'sentiment' not in name:
                self.labels = ['discussion_of_eastasian_prejudice' if label =='counter_speech' else label for label in self.labels]
            else:
                if 'neutral' not in labels:
                    sentiment_labels = []
                    neutral_pos_labels = ["none_of_the_above", "counter_speech", "discussion_of_eastasian_prejudice"]
                    for label in labels:
                        if label in neutral_pos_labels:
                            sentiment_labels.append("neutral")
                        else:
                            sentiment_labels.append("negative")
                    self.labels = sentiment_labels
        self.label_dict = {label: i for i, label in enumerate(list(set(self.labels)))}
        self.label_inds = np.asarray([self.label_dict[label] for label in self.labels])
        self.vocab = vocab
        self.word_id_map = word_id_map
        self.docs = docs_dict
        self.node_ids = list(self.docs.keys())
        self.tvt = tvt
        self.train_test_split = train_test_split

    def tvt_split(self, split_points, tvt_list, seed):
        if self.train_test_split is None:
            doc_id_chunks = self._chunk_doc_ids(split_points, seed)
        else:
            train_ids = []
            test_ids = []
            for k, v in self.train_test_split.items():
                if v == 'test':
                    test_ids.append(k)
                elif v == 'train':
                    train_ids.append(k)
                else:
                    raise ValueError
            num_val = int(len(train_ids) * 0.1)
            random.Random(seed).shuffle(train_ids)
            val_ids = train_ids[:num_val]
            train_ids = train_ids[num_val:]
            doc_id_chunks = [train_ids, val_ids, test_ids]
        sub_dataset = []
        for i, chunk in enumerate(doc_id_chunks):
            docs = {doc_id: self.docs[doc_id] for doc_id in chunk}
            sub_dataset.append(TextDataset(self.name, self.graph, self.labels, self.vocab,
                                           self.word_id_map, docs, None, tvt_list[i]))
        return sub_dataset

    def _chunk_doc_ids(self, split_points, seed):
        ids = sorted(self.docs.keys())
        id_chunks = self._chunk_list(ids, split_points, seed)
        return id_chunks

    def _chunk_list(self, li, split_points, seed):
        rtn = []
        random.Random(seed).shuffle(li)
        left = 0
        split_indices = [int(len(li) * sp) for sp in split_points]
        for si in split_indices:
            right = left + si
            if type(right) is not int or right <= 0 or right >= len(li):
                raise ValueError('Wrong split_points {}'.format(split_points))
            take = li[left:right]
            rtn.append(take)
            left = right
        # The last chunk is inferred.
        rtn.append(li[left:])
        return rtn

    def init_node_feats(self, type, device):
        if type == 'one_hot_init':
            num_nodes = self.graph.shape[0]
            identity = sp.identity(num_nodes)
            ind0, ind1, values = sp.find(identity)
            inds = np.stack((ind0, ind1), axis=0)
            self.node_feats = torch.sparse_coo_tensor(inds, values, device=device,
                                                      dtype=torch.float)
        else:
            raise NotImplementedError


    def get_pyg_graph(self, device):
        if not hasattr(self, "pyg_graph"):
            adj = self.graph  # Your scipy.sparse.csr_matrix
            adj_coo = adj.tocoo()  # Convert to COO format, which is simple to work with

            # Create edge index and edge weight tensors directly from the COO format
            edge_index = torch.tensor([adj_coo.row, adj_coo.col], dtype=torch.long).to(device)
            edge_weight = torch.tensor(adj_coo.data, dtype=torch.float).to(device)

            # Ensure node features are in a suitable format and on the correct device
            if hasattr(self, 'node_feats'):
                if isinstance(self.node_feats, np.ndarray):
                    node_feats = torch.from_numpy(self.node_feats).float().to(device)
                else:
                    node_feats = self.node_feats.to(device)
            else:
                node_feats = torch.ones((adj.shape[0], 1), dtype=torch.float).to(device)

            # Create PyG graph data object
            self.pyg_graph = PyGSingleGraphData(x=node_feats, edge_index=edge_index, edge_attr=edge_weight, y=None)
        return self.pyg_graph

In [5]:
# build_graph.py
import networkx as nx
import scipy.sparse as sp
from math import log
from collections import defaultdict
import pandas as pd
from os.path import join, exists
from tqdm import tqdm


def build_text_graph_dataset(dataset, window_size):
    if "small" in dataset or "presplit" in dataset or 'sentiment' in dataset:
        dataset_name = "_".join(dataset.split("_")[:-1])
    else:
        dataset_name = dataset
    clean_text_path = join('/gpfs/gibbs/project/cpsc477/cpsc477_sw2349/content/MyDrive/CPSC_577_FP', dataset_name + '_sentences_clean.txt')
    labels_path = join('/gpfs/gibbs/project/cpsc477/cpsc477_sw2349/content/MyDrive/CPSC_577_FP', dataset_name + '_labels.txt')
    labels = pd.read_csv(labels_path, header=None, sep='\t')
    doc_list = []
    f = open(clean_text_path, 'rb')
    for line in f.readlines():
        doc_list.append(line.strip().decode())
    f.close()
    assert len(labels) == len(doc_list)
    if 'presplit' not in dataset:
        labels_list = labels.iloc[0:, 0].tolist()
        split_dict = None
    else:
        labels_list = labels.iloc[0:, 2].tolist()
        split = labels.iloc[0:, 1].tolist()
        split_dict = {}
        for i, v in enumerate(split):
            split_dict[i] = v
    if "small" in dataset:
        doc_list = doc_list[:200]
        labels_list = labels_list[:200]

    word_freq = get_vocab(doc_list)
    vocab = list(word_freq.keys())
    if not exists(join('/gpfs/gibbs/project/cpsc477/cpsc477_sw2349/content/MyDrive/CPSC_577_FP', dataset + '_vocab.txt')):
        vocab_str = '\n'.join(vocab)
        f = open(join('/gpfs/gibbs/project/cpsc477/cpsc477_sw2349/content/MyDrive/CPSC_577_FP', dataset + '_vocab.txt'), 'w')
        f.write(vocab_str)
        f.close()
    words_in_docs, word_doc_freq = build_word_doc_edges(doc_list)
    word_id_map = {word: i for i, word in enumerate(vocab)}

    sparse_graph = build_edges(doc_list, word_id_map, vocab, word_doc_freq, window_size)
    docs_dict = {i: doc for i, doc in enumerate(doc_list)}
    return TextDataset(dataset, sparse_graph, labels_list, vocab, word_id_map, docs_dict, None,
                       train_test_split=split_dict)


def build_edges(doc_list, word_id_map, vocab, word_doc_freq, window_size=20):
    # constructing all windows
    windows = []
    for doc_words in doc_list:
        words = doc_words.split()
        doc_length = len(words)
        if doc_length <= window_size:
            windows.append(words)
        else:
            for i in range(doc_length - window_size + 1):
                window = words[i: i + window_size]
                windows.append(window)
    # constructing all single word frequency
    word_window_freq = defaultdict(int)
    for window in windows:
        appeared = set()
        for word in window:
            if word not in appeared:
                word_window_freq[word] += 1
                appeared.add(word)
    # constructing word pair count frequency
    word_pair_count = defaultdict(int)
    for window in tqdm(windows):
        for i in range(1, len(window)):
            for j in range(i):
                word_i = window[i]
                word_j = window[j]
                word_i_id = word_id_map[word_i]
                word_j_id = word_id_map[word_j]
                if word_i_id == word_j_id:
                    continue
                word_pair_count[(word_i_id, word_j_id)] += 1
                word_pair_count[(word_j_id, word_i_id)] += 1
    row = []
    col = []
    weight = []

    # pmi as weights
    num_docs = len(doc_list)
    num_window = len(windows)
    for word_id_pair, count in tqdm(word_pair_count.items()):
        i, j = word_id_pair[0], word_id_pair[1]
        word_freq_i = word_window_freq[vocab[i]]
        word_freq_j = word_window_freq[vocab[j]]
        pmi = log((1.0 * count / num_window) /
                  (1.0 * word_freq_i * word_freq_j / (num_window * num_window)))
        if pmi <= 0:
            continue
        row.append(num_docs + i)
        col.append(num_docs + j)
        weight.append(pmi)

    # frequency of document word pair
    doc_word_freq = defaultdict(int)
    for i, doc_words in enumerate(doc_list):
        words = doc_words.split()
        for word in words:
            word_id = word_id_map[word]
            doc_word_str = (i, word_id)
            doc_word_freq[doc_word_str] += 1

    for i, doc_words in enumerate(doc_list):
        words = doc_words.split()
        doc_word_set = set()
        for word in words:
            if word in doc_word_set:
                continue
            word_id = word_id_map[word]
            freq = doc_word_freq[(i, word_id)]
            row.append(i)
            col.append(num_docs + word_id)
            idf = log(1.0 * num_docs /
                      word_doc_freq[vocab[word_id]])
            weight.append(freq * idf)
            doc_word_set.add(word)

    # Adding self-loops to the graph
    for i in range(num_docs + len(vocab)):
        row.append(i)
        col.append(i)
        weight.append(1)  # Assigning a weight of 1 to self-loops

    number_nodes = num_docs + len(vocab)
    adj_mat = sp.csr_matrix((weight, (row, col)), shape=(number_nodes, number_nodes))
    adj = adj_mat + adj_mat.T.multiply(adj_mat.T > adj_mat) - adj_mat.multiply(adj_mat.T > adj_mat)
    return adj


def get_vocab(text_list):
    word_freq = defaultdict(int)
    for doc_words in text_list:
        words = doc_words.split()
        for word in words:
            word_freq[word] += 1
    return word_freq


def build_word_doc_edges(doc_list):
    # build all docs that a word is contained in
    words_in_docs = defaultdict(set)
    for i, doc_words in enumerate(doc_list):
        words = doc_words.split()
        for word in words:
            words_in_docs[word].add(i)

    word_doc_freq = {}
    for word, doc_list in words_in_docs.items():
        word_doc_freq[word] = len(doc_list)

    return words_in_docs, word_doc_freq


In [6]:
# prep_data.py
from collections import defaultdict
from nltk.corpus import stopwords
import nltk
from os.path import join, exists
import re


def clean_data(dataset):
    clean_text_path = join('/gpfs/gibbs/project/cpsc477/cpsc477_sw2349/content/MyDrive/CPSC_577_FP', 'corpus', dataset + '_sentences_clean.txt')
    if not exists(clean_text_path):
        docs_list = []
        old_name = dataset
        if "no_hashtag" in dataset:
            dataset = '_'.join(dataset.split('_')[:-2])
        with open(join('/gpfs/gibbs/project/cpsc477/cpsc477_sw2349/content/MyDrive/CPSC_577_FP', 'corpus', dataset + '_sentences.txt')) as f:
            for line in f.readlines():
                docs_list.append(line.strip())
        dataset = old_name
        word_counts = defaultdict(int)
        for doc in docs_list:
            temp = clean_doc(doc, dataset)
            words = temp.split()
            for word in words:
                word_counts[word] += 1
        clean_docs = clean_documents(docs_list, word_counts, dataset)
        corpus_str = '\n'.join(clean_docs)
        f = open(clean_text_path, 'w')
        f.write(corpus_str)
        f.close()
    f = open(clean_text_path, 'r')
    lines = f.readlines()
    min_len = 10000
    aver_len = 0
    max_len = 0
    for line in lines:
        line = line.strip()
        temp = line.split()
        aver_len = aver_len + len(temp)
        if len(temp) < min_len:
            min_len = len(temp)
        if len(temp) > max_len:
            max_len = len(temp)
    f.close()
    aver_len = 1.0 * aver_len / len(lines)
    print('min_len : ' + str(min_len))
    print('max_len : ' + str(max_len))
    print('average_len : ' + str(aver_len))


def clean_documents(docs, word_counts, dataset):
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    ret = []
    for doc in docs:
        doc = clean_doc(doc, dataset)
        words = doc.split()
        words = [word for word in words if word not in stop_words and word_counts[word] >= 5]
        doc = ' '.join(words).strip()
        if doc != '':
            ret.append(' '.join(words).strip())
        else:
            ret.append(' ')
    return ret


def clean_doc_ap(string):
    string = re.sub(r"http[s]?\:\/\/.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+", " ", string)
    string = re.sub(r"[^A-Za-z0-9()_+,!?:\'\`]", " ", string)  # replace all non alpha numeric characters
    string = re.sub(r"(?<!HASHTAG)_", " ", string)
    string = re.sub(r"(?<!EASTASIA)\+ | (?<!VIRUS)\+", " ", string)
    string = re.sub(r"\+", "_", string)
    string = re.sub(r"HASHTAG_EASTASIA_VIRUS(?!(\s))", "HASHTAG_EASTASIA_VIRUS ", string)
    string = re.sub(r"HASHTAG_EASTASIA(?!(\s|_))", "HASHTAG_EASTASIA ", string)
    string = re.sub(r"HASHTAG_VIRUS(?!(\s|_))", "HASHTAG_VIRUS ", string)
    string = re.sub(r"HASHTAG_VIRUS_OTHERCOUNTRY(?!(\s))", "HASHTAG_VIRUS_OTHERCOUNTRY ", string)
    string = re.sub(r"HASHTAG(?!([\s|_]))", "HASHTAG ", string)
    if "no_hashtag" in dataset:
        string = re.sub(r"HASHTAG_EASTASIA_VIRUS", " ", string)
        string = re.sub(r"HASHTAG_EASTASIA", " ", string)
        string = re.sub(r"HASHTAG_VIRUS", " ", string)
        string = re.sub(r"HASHTAG_VIRUS_OTHERCOUNTRY", " ", string)
        string = re.sub(r"HASHTAG", " ", string)
    return string


def clean_doc(string, dataset):
    if 'twitter_asian_prejudice' in dataset:
        string = clean_doc_ap(string)
    else:
        pass
    string = re.sub(r"^\"", "", string)
    string = re.sub(r"\"$", "", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"\.", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [7]:
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.nn import Parameter
from torch_scatter import scatter_add
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops, softmax
from torch_geometric.nn.inits import glorot, zeros
#from torch_geometric.nn import SAGEConv

class CustomSAGEConv(MessagePassing):
    def __init__(self, in_channels, out_channels, normalize=True, bias=True):
        super(CustomSAGEConv, self).__init__(aggr=None)  # Initialize without predefined aggregation
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.normalize = normalize
        
        self.lin = nn.Linear(in_channels, out_channels, bias=False)
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
            torch.nn.init.uniform_(self.bias, -0.01, 0.01)
        else:
            self.register_parameter('bias', None)

    def forward(self, x, edge_index, edge_weight=None):
        # Add self-loops to the adjacency matrix
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

        # Create the adjacency matrix from edge_index for dense operations
        num_nodes = x.size(0)
        adj = torch.zeros((num_nodes, num_nodes), device=x.device)
        adj[edge_index[0], edge_index[1]] = 1  # Assuming unweighted for simplicity, use edge_weight if available

        # Normalize adjacency matrix
        if self.normalize:
            row_sum = adj.sum(dim=1, keepdim=True)
            d_inv_sqrt = row_sum.pow(-0.5)
            d_inv_sqrt[torch.isinf(d_inv_sqrt)] = 0
            adj = d_inv_sqrt * adj * d_inv_sqrt.transpose(0, 1)

        # Dense matrix multiplication for aggregation
        aggregated_features = torch.matmul(adj, x)

        # Apply linear transformation
        transformed_features = self.lin(aggregated_features)

        if self.bias is not None:
            transformed_features += self.bias

        return transformed_features


class TextGNN(nn.Module):
    def __init__(self, pred_type, node_embd_type, num_layers, layer_dim_list, act, bn, num_labels, class_weights, dropout):
        super(TextGNN, self).__init__()
        self.node_embd_type = node_embd_type
        self.layer_dim_list = layer_dim_list
        self.num_layers = num_layers
        self.dropout = dropout
        if pred_type == 'softmax':
            assert layer_dim_list[-1] == num_labels
        elif pred_type == 'mlp':
            dims = self._calc_mlp_dims(layer_dim_list[-1], num_labels)
            self.mlp = MLP(layer_dim_list[-1], num_labels, num_hidden_lyr=len(dims), hidden_channels=dims, bn=False)
        self.pred_type = pred_type
        assert len(layer_dim_list) == (num_layers + 1)
        self.act = act
        self.bn = bn
        self.layers = self._create_node_embd_layers()
        self.loss = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, pyg_graph, dataset):
        acts = [pyg_graph.x]
        for i, layer in enumerate(self.layers):
            ins = acts[-1]
            outs = layer(ins, pyg_graph)
            acts.append(outs)

        return self._loss(acts[-1], dataset)

    def _loss(self, ins, dataset):
        pred_inds = dataset.node_ids
        if self.pred_type == 'softmax':
            y_preds = ins[pred_inds]
        elif self.pred_type == 'mlp':
            y_preds = self.mlp(ins[pred_inds])
        else:
            raise NotImplementedError
        y_true = torch.tensor(dataset.label_inds[pred_inds], dtype=torch.long, device=device)
        loss = self.loss(y_preds, y_true)
        return loss, y_preds.cpu().detach().numpy()

    def _create_node_embd_layers(self):
        layers = nn.ModuleList()
        for i in range(self.num_layers):
            act = self.act if i < self.num_layers - 1 else 'identity'
            layers.append(NodeEmbedding(
                type=self.node_embd_type,
                in_dim=self.layer_dim_list[i],
                out_dim=self.layer_dim_list[i + 1],
                act=act,
                bn=self.bn,
                dropout=self.dropout if i != 0 else False
            ))
        return layers

    def _calc_mlp_dims(self, mlp_dim, output_dim=1):
        dim = mlp_dim
        dims = []
        while dim > output_dim:
            dim = dim // 2
            dims.append(dim)
        dims = dims[:-1]
        return dims


class NodeEmbedding(nn.Module):
    def __init__(self, type, in_dim, out_dim, act, bn, dropout):
        super(NodeEmbedding, self).__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.type = type
        if type == 'gcn':
            self.conv = GCNConv(in_dim, out_dim)
            self.act = create_act(act, out_dim)
        elif type == 'gat':
            self.conv = GATConv(in_dim, out_dim)
            self.act = create_act(act, out_dim)
        elif type == "graphsage" :
            self.conv = CustomSAGEConv(in_dim, out_dim)
            self.act = create_act(act, out_dim)
        else:
            raise ValueError(
                'Unknown node embedding layer type {}'.format(type))
        self.bn = bn
        if self.bn:
            self.bn = torch.nn.BatchNorm1d(out_dim)
        self.dropout = dropout
        if dropout:
            self.dropout = torch.nn.Dropout()

    def forward(self, ins, pyg_graph):
        if self.dropout:
            ins = self.dropout(ins)

        if self.type == 'gcn':
            if use_edge_weights:
                x = self.conv(ins, pyg_graph.edge_index, edge_weight=pyg_graph.edge_attr)
            else:
                if ins.is_sparse:
                  ins = ins.to_dense()
                if pyg_graph.edge_index.is_sparse:
                  ins = ins.to_dense()
                  # If your edge indices are in a tensor called 'edge_index'
                pyg_graph.edge_index = pyg_graph.edge_index.to(device)

                # Similarly for other tensors involved in computation like feature matrices
                ins = ins.to(device)
                x = self.conv(ins, pyg_graph.edge_index)
        else:
            x = self.conv(ins, pyg_graph.edge_index)
        x = self.act(x)
        return x


class MLP(nn.Module):
    '''mlp can specify number of hidden layers and hidden layer channels'''

    def __init__(self, input_dim, output_dim, activation_type='relu', num_hidden_lyr=2,
                 hidden_channels=None, bn=False):
        super().__init__()
        self.out_dim = output_dim
        if not hidden_channels:
            hidden_channels = [input_dim for _ in range(num_hidden_lyr)]
        elif len(hidden_channels) != num_hidden_lyr:
            raise ValueError(
                "number of hidden layers should be the same as the lengh of hidden_channels")
        self.layer_channels = [input_dim] + hidden_channels + [output_dim]
        self.activation = create_act(activation_type)
        self.layers = nn.ModuleList(list(
            map(self.weight_init, [nn.Linear(self.layer_channels[i], self.layer_channels[i + 1])
                                   for i in range(len(self.layer_channels) - 1)])))
        self.bn = bn
        if self.bn:
            self.bn = torch.nn.BatchNorm1d(output_dim)

    def weight_init(self, m):
        torch.nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))
        return m

    def forward(self, x):
        layer_inputs = [x]
        for layer in self.layers:
            input = layer_inputs[-1]
            if layer == self.layers[-1]:
                layer_inputs.append(layer(input))
            else:
                layer_inputs.append(self.activation(layer(input)))
        # model.store_layer_output(self, layer_inputs[-1])
        if self.bn:
            layer_inputs[-1] = self.bn(layer_inputs[-1])
        return layer_inputs[-1]


def create_act(act, num_parameters=None):
    if act == 'relu':
        return nn.ReLU()
    elif act == 'prelu':
        return nn.PReLU(num_parameters)
    elif act == 'sigmoid':
        return nn.Sigmoid()
    elif act == 'tanh':
        return nn.Tanh()
    elif act == 'identity':
        class Identity(nn.Module):
            def forward(self, x):
                return x

        return Identity()
    else:
        raise ValueError('Unknown activation function {}'.format(act))


class GCNConv(MessagePassing):
    r"""The graph convolutional operator from the `"Semi-supervised
    Classfication with Graph Convolutional Networks"
    <https://arxiv.org/abs/1609.02907>`_ paper

    .. math::
        \mathbf{X}^{\prime} = \mathbf{\hat{D}}^{-1/2} \mathbf{\hat{A}}
        \mathbf{\hat{D}}^{-1/2} \mathbf{X} \mathbf{\Theta},

    where :math:`\mathbf{\hat{A}} = \mathbf{A} + \mathbf{I}` denotes the
    adjacency matrix with inserted self-loops and
    :math:`\hat{D}_{ii} = \sum_{j=0} \hat{A}_{ij}` its diagonal degree matrix.

    Args:
        in_channels (int): Size of each input sample.
        out_channels (int): Size of each output sample.
        improved (bool, optional): If set to :obj:`True`, the layer computes
            :math:`\mathbf{\hat{A}}` as :math:`\mathbf{A} + 2\mathbf{I}`.
            (default: :obj:`False`)
        cached (bool, optional): If set to :obj:`True`, the layer will cache
            the computation of :math:`{\left(\mathbf{\hat{D}}^{-1/2}
            \mathbf{\hat{A}} \mathbf{\hat{D}}^{-1/2} \right)}`.
            (default: :obj:`False`)
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 improved=False,
                 cached=False,
                 bias=True):
        super(GCNConv, self).__init__('add')

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.improved = improved
        self.cached = cached
        self.cached_result = None

        self.weight = Parameter(torch.Tensor(in_channels, out_channels))

        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        glorot(self.weight)
        zeros(self.bias)
        self.cached_result = None

    @staticmethod
    def norm(edge_index, num_nodes, edge_weight=None, improved=False, dtype=None):
        if edge_weight is None:
            edge_weight = torch.ones((edge_index.size(1), ),
                                     dtype=dtype,
                                     device=edge_index.device)
        edge_weight = edge_weight.view(-1)
        assert edge_weight.size(0) == edge_index.size(1)

        edge_index, edge_weight = remove_self_loops(edge_index, edge_weight)
        edge_index = add_self_loops(edge_index, num_nodes)
        # Calling add_self_loops with optional edge_attr:
        # edge_attr = torch.ones([edge_index.shape[1]], dtype=torch.float32)  # Assuming one attribute per edge

        # edge_index, edge_attr = add_self_loops(edge_index, edge_attr=edge_attr if edge_attr is not None else None)

        loop_weight = torch.full((num_nodes, ),
                                 1 if not improved else 2,
                                 dtype=edge_weight.dtype,
                                 device=edge_weight.device)
        edge_weight = torch.cat([edge_weight, loop_weight], dim=0)

        row, col = edge_index
        deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0

        return edge_index, deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]

    def forward(self, x, edge_index, edge_weight=None):
        """"""
        if x.is_sparse:
            x = torch.sparse.mm(x, self.weight)
        else:
            x = torch.matmul(x, self.weight)

        if not self.cached or self.cached_result is None:
            edge_index, norm = GCNConv.norm(edge_index, x.size(0), edge_weight,
                                            self.improved, x.dtype)
            self.cached_result = edge_index, norm

        edge_index, norm = self.cached_result
        return self.propagate(edge_index, x=x, norm=norm)

    def message(self, x_j, norm):
        return norm.view(-1, 1) * x_j

    def update(self, aggr_out):
        if self.bias is not None:
            aggr_out = aggr_out + self.bias
        return aggr_out

    def __repr__(self):
        return '{}({}, {})'.format(self.__class__.__name__, self.in_channels,
                                   self.out_channels)


class GATConv(MessagePassing):
    r"""The graph attentional operator from the `"Graph Attention Networks"
    <https://arxiv.org/abs/1710.10903>`_ paper

    .. math::
        \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{j} +
        \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j},

    where the attention coefficients :math:`\alpha_{i,j}` are computed as

    .. math::
        \alpha_{i,j} =
        \frac{
        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j]
        \right)\right)}
        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k]
        \right)\right)}.

    Args:
        in_channels (int): Size of each input sample.
        out_channels (int): Size of each output sample.
        heads (int, optional): Number of multi-head-attentions. (default:
            :obj:`1`)
        concat (bool, optional): If set to :obj:`False`, the multi-head
        attentions are averaged instead of concatenated. (default: :obj:`True`)
        negative_slope (float, optional): LeakyReLU angle of the negative
            slope. (default: :obj:`0.2`)
        dropout (float, optional): Dropout probability of the normalized
            attention coefficients which exposes each node to a stochastically
            sampled neighborhood during training. (default: :obj:`0`)
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 heads=1,
                 concat=True,
                 negative_slope=0.2,
                 dropout=0,
                 bias=True):
        super(GATConv, self).__init__('add')

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.heads = heads
        self.concat = concat
        self.negative_slope = negative_slope
        self.dropout = dropout

        self.weight = Parameter(
            torch.Tensor(in_channels, heads * out_channels))
        self.att = Parameter(torch.Tensor(1, heads, 2 * out_channels))

        if bias and concat:
            self.bias = Parameter(torch.Tensor(heads * out_channels))
        elif bias and not concat:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        glorot(self.weight)
        glorot(self.att)
        zeros(self.bias)

    def forward(self, x, edge_index):
        """"""
        edge_index, _ = remove_self_loops(edge_index)
        edge_index = add_self_loops(edge_index, num_nodes=x.size(0))
        if x.is_sparse:
            x = torch.sparse.mm(x, self.weight).view(-1, self.heads, self.out_channels)
        else:
            x = torch.matmul(x, self.weight).view(-1, self.heads, self.out_channels)
        return self.propagate(edge_index, x=x, num_nodes=x.size(0))

    def message(self, x_i, x_j, edge_index, num_nodes):
        # Compute attention coefficients.
        alpha = (torch.cat([x_i, x_j], dim=-1) * self.att).sum(dim=-1)
        alpha = F.leaky_relu(alpha, self.negative_slope)
        alpha = softmax(alpha, edge_index[0], num_nodes)

        # Sample attention coefficients stochastically.
        if self.training and self.dropout > 0:
            alpha = F.dropout(alpha, p=self.dropout, training=True)

        return x_j * alpha.view(-1, self.heads, 1)

    def update(self, aggr_out):
        if self.concat is True:
            aggr_out = aggr_out.view(-1, self.heads * self.out_channels)
        else:
            aggr_out = aggr_out.mean(dim=1)

        if self.bias is not None:
            aggr_out = aggr_out + self.bias
        return aggr_out

    def __repr__(self):
        return '{}({}, {}, heads={})'.format(self.__class__.__name__,
                                             self.in_channels,
                                             self.out_channels, self.heads)

In [8]:
from collections import Counter

def create_model(dataset):
    name = model  # Directly use the global variable 'model'
    layer_info = model_params  # Directly use the global dictionary 'model_params'
    if name in model_ctors:
        return model_ctors[name](layer_info, dataset)
    else:
        raise ValueError("Model not implemented {}".format(name))



def create_text_gnn(layer_info, dataset):

    lyr_dims = layer_info["layer_dims"]
    lyr_dims = [dataset.node_feats.shape[1]] + lyr_dims
    weights = None
    if layer_info["class_weights"] == True:
        counts = Counter(dataset.label_inds[dataset.node_ids])
        weights = len(counts) * [0]
        min_weight = min(counts.values())
        for k, v in counts.items():
            weights[k] = min_weight / float(v)
        weights = torch.tensor(weights, device=device)

    return TextGNN(
        pred_type=layer_info["pred_type"],
        node_embd_type=layer_info["node_embd"],
        num_layers=int(layer_info["num_layers"]),
        layer_dim_list=lyr_dims,
        act=layer_info["act"],
        bn=False,
        num_labels=len(dataset.label_dict),
        class_weights=weights,
        dropout=layer_info["dropout"]
    )


model_ctors = {
    'TextGNN': create_text_gnn,
}

In [9]:
import numpy as np
from sklearn import metrics


def eval(preds, dataset, test=False):
    y_true = dataset.label_inds[dataset.node_ids]
    y_pred_label = np.asarray([np.argmax(pred) for pred in preds])
    accuracy = metrics.accuracy_score(y_true, y_pred_label)
    f1_weighted = metrics.f1_score(y_true, y_pred_label, average='weighted')
    f1_macro = metrics.f1_score(y_true, y_pred_label, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred_label, average='micro')
    precision_weighted = metrics.precision_score(y_true, y_pred_label, average='weighted')
    precision_macro = metrics.precision_score(y_true, y_pred_label, average='macro')
    precision_micro = metrics.precision_score(y_true, y_pred_label, average='micro')
    recall_weighted = metrics.recall_score(y_true, y_pred_label, average='weighted')
    recall_macro = metrics.recall_score(y_true, y_pred_label, average='macro')
    recall_micro = metrics.recall_score(y_true, y_pred_label, average='micro')
    results = {"accuracy": accuracy,
               "f1_weighted": f1_weighted,
               "f1_macro": f1_macro,
               "f1_micro": f1_micro,
               "precision_weighted": precision_weighted,
               "precision_macro": precision_macro,
               "precision_micro": precision_micro,
               "recall_weighted": recall_weighted,
               "recall_macro": recall_macro,
               "recall_micro": recall_micro
               }
    if test:
        one_hot_true = np.zeros((y_true.size, len(dataset.label_dict)))
        one_hot_true[np.arange(y_true.size), y_true] = 1
        results["y_true"] = one_hot_true
        one_hot_pred = np.zeros((y_true.size, len(dataset.label_dict)))
        one_hot_pred[np.arange(y_pred_label.size),y_pred_label] = 1
        results["y_pred"] = one_hot_pred
    return results


class MovingAverage(object):
    def __init__(self, window, want_increase=True):
        self.moving_avg = [float('-inf')] if want_increase else [float('inf')]
        self.want_increase = want_increase
        self.results = []
        self.window = window

    def add_to_moving_avg(self, x):
        self.results.append(x)
        if len(self.results) >= self.window:
            next_val = sum(self.results[-self.window:]) / self.window
            self.moving_avg.append(next_val)

    def best_result(self, x):
        if self.want_increase:
            return (x - 1e-7) > max(self.results)
        else:
            return (x + 1e-7) < min(self.results)

    def stop(self):
        if len(self.moving_avg) < 2:
            return False
        if self.want_increase:
            return (self.moving_avg[-1] + 1e-7) < self.moving_avg[-2]
        else:
            return (self.moving_avg[-2] + 1e-7) < self.moving_avg[-1]

In [10]:
import glob
from os.path import join, getctime
import torch

class Saver(object):
    def __init__(self):
        global logs
        model_str = self.get_model_str()
        self.logdir = join(
            '/home/cpsc477_sw2349/project/content/MyDrive/CPSC_577_FP/logs',  # 使用 Colab 的默认工作目录下的 logs 目录
            '{}_{}'.format(model_str, get_ts()))
        create_dir_if_not_exists(self.logdir)
        self.model_info_f = self._open('model_info.txt')
        if logs:
            print('Logging to {}'.format(self.logdir))

    def save_trained_model(self, trained_model, epoch=None):
        epoch = "_epoch_{}".format(epoch) if epoch is not None else ""
        p = join(self.logdir, 'trained_model{}.pt'.format(epoch))
        torch.save(trained_model.state_dict(), p)
        if logs:
            print('Trained model saved to {}'.format(p))

    def load_trained_model(self, train_data):
        p = join(self.logdir, 'trained_model*')
        files = glob.glob(p)
        best_trained_model_path = max(files, key=getctime)
        trained_model = create_model(train_data)
        trained_model.load_state_dict(
            torch.load(best_trained_model_path, map_location=device))
        trained_model.to(device)
        return trained_model

    def get_model_str(self):
        li = []
        key_flags = [model, dataset, "_".join([str(i) for i in tvt_ratio])]
        for f in key_flags:
            li.append(str(f))
        return '_'.join(li)

    def _open(self, f):
        return open(join(self.logdir, f), 'w')


In [11]:
import gc


def load_data():
    global dataset
    dir = join(get_save_path(), 'split')
    dataset_name = dataset
    train_ratio = int(tvt_ratio[0] * 100)
    val_ratio = int(tvt_ratio[1] * 100)
    test_ratio = 100 - train_ratio - val_ratio
    if 'presplit' not in dataset_name:
        save_fn = '{}_train_{}_val_{}_test_{}_seed_{}_window_size_{}_SAGE'.format(dataset_name, train_ratio,
                                                              val_ratio, test_ratio,
                                                              random_seed, word_window_size)
    else:
        save_fn = '{}_train_val_test_{}_window_size_{}_SAGE'.format(dataset_name, random_seed, word_window_size)
    path = join(dir, save_fn)
    rtn = load(path)
    if rtn:
        train_data, val_data, test_data = rtn['train_data'], rtn['val_data'], rtn['test_data']
    else:
        train_data, val_data, test_data = _load_tvt_data_helper()
        save({'train_data': train_data, 'val_data': val_data, 'test_data': test_data}, path)
    dataset = dataset
    if "small" in dataset or "presplit" in dataset or 'sentiment' in dataset:
        dataset_name = "_".join(dataset.split("_")[:-1])
    else:
        dataset_name = dataset

    orig_text_path = join(get_corpus_path(), dataset_name + "_sentences.txt")
    orig_text_path = ensure_prefix(orig_text_path)
    raw_doc_list = []
    f = open(orig_text_path, 'rb')
    for line in f.readlines():
        raw_doc_list.append(line.strip().decode())
    f.close()

    return train_data, val_data, test_data, raw_doc_list


def _load_tvt_data_helper():
    global dataset
    dir = join(get_save_path(), 'all')
    path = join(dir, dataset + '_all_window_' + str(word_window_size))
    rtn = load(path)
    if rtn:
        dataset = TextDataset(None, None, None, None, None, None, rtn)
    else:
        dataset = build_text_graph_dataset(dataset, word_window_size)
        gc.collect()
        save(dataset.__dict__, path)

    train_dataset, val_dataset, test_dataset = dataset.tvt_split(tvt_ratio[:2], tvt_list, random_seed)
    return train_dataset, val_dataset, test_dataset

In [12]:
from pprint import pprint
import time


def train(train_data, val_data, saver, logs = True):
    train_data.init_node_feats(init_type, device)
    val_data.init_node_feats(init_type, device)
    model = create_model(train_data)
    model = model.to(device)
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("Number params: ", pytorch_total_params)
    moving_avg = MovingAverage(validation_window_size, validation_metric != 'loss')
    pyg_graph = train_data.get_pyg_graph(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, )

    for epoch in range(num_epochs):
        t = time.time()
        model.train()
        model.zero_grad()
        loss, preds_train = model(pyg_graph, train_data)
        loss.backward()
        optimizer.step()
        loss = loss.item()
        with torch.no_grad():
            val_loss, preds_val = model(pyg_graph, val_data)
            val_loss = val_loss.item()
            eval_res_val = eval(preds_val, val_data)
            if logs:
                print("Epoch: {:04d}, Train Loss: {:.5f}, Time: {:.5f}".format(epoch, loss, time.time() - t))
                print("Val Loss: {:.5f}".format(val_loss))
                print("Val Results: ...")
                pprint(eval_res_val)
            eval_res_val["loss"] = val_loss

            if len(moving_avg.results) == 0 or moving_avg.best_result(eval_res_val[validation_metric]):
                saver.save_trained_model(model, epoch + 1)
            moving_avg.add_to_moving_avg(eval_res_val[validation_metric])
            if moving_avg.stop():
                break
    best_model = saver.load_trained_model(train_data)
    return best_model, model

## Hyperparameter Tuning R8 Dataset

We perform hyperparameter tuning by grid search. 
Possible hyperparameters to tune are :
- pred_type
- Learning rate
- activation function

In [13]:
import itertools
import torch
from collections import defaultdict
import warnings

debug = False
gpu = -1
use_comet_ml = False

random_seed = 123
dataset = 'r8_presplit'

if 'twitter_asian_prejudice' in dataset:
    if 'sentiment' in dataset:
        num_labels = 2
    else:
        num_labels = 4
elif 'r8' in dataset:
    num_labels = 8

# Define the parameter grid
param_grid = {
    'lr': [1e-2, 1e-3, 1e-4],
    'pred_type': ['softmax', 'mlp'],
    'dropout': [True, False],
    'act' : ['relu', 'prelu', 'sigmoid', 'tanh']
}


"""
Sampling
"""
word_window_size = 10
validation_window_size = 10

"""
Validation
"""
validation_metric = "accuracy"  # Alternatively, "f1_weighted" or "loss"

use_best_val_model_for_inference = True

"""
Evaluation.
"""
tvt_ratio = [0.8, 0.1, 0.1]
tvt_list = ["train", "test", "val"]
model = "TextGNN"
device = 'cuda'#.format(gpu) if torch.cuda.is_available() and gpu != -1 else 'cpu'
    
num_epochs = 2 if debug else 400
lr = 2e-2
use_edge_weights = False
init_type = 'one_hot_init'
logs = False
model_params = {}
# Function to create a model with specified hyperparameters
def run_experiment(Learning_Rate, Pred_Type, Dropout, Act):
    warnings.filterwarnings('ignore')
    global lr, model, num_labels, dataset, use_edge_weights, init_type, model_params
    lr = Learning_Rate

    if model == 'TextGNN':
        pred_type = Pred_Type
        node_embd_type = 'graphsage'
        layer_dim_list = [200, num_labels]
        num_layers = len(layer_dim_list)
        class_weights = True
        dropout = Dropout
        s = 'TextGraphSAGE:pred_type={},node_embd_type={},num_layers={},layer_dim_list={},act={},' \
            'dropout={},class_weights={}'.format(
            pred_type, node_embd_type, num_layers, "_".join([str(i) for i in layer_dim_list]), Act, dropout, class_weights
        )
        model_params = {
            'pred_type': pred_type,
            'node_embd':  node_embd_type,
            'num_layers': num_layers,
            'layer_dims': layer_dim_list,
            'act': Act,
            'class_weights': class_weights,
            'dropout': dropout
        }
    else:
        raise NotImplementedError
    
    print("{}: {}\n".format(model, model_params))
    saver = Saver()
    train_data, val_data, test_data, raw_doc_list = load_data()
    
    saved_model, model = train(train_data, val_data, saver, False)
    with torch.no_grad():
        test_loss_model, preds_model = model(train_data.get_pyg_graph(device=device), test_data)

    eval_res = eval(preds_model, test_data, True)
    y_true = eval_res.pop('y_true')
    y_pred = eval_res.pop('y_pred')
    acc = eval_res['accuracy']
    model = "TextGNN"
    return acc

# Perform grid search
best_acc = 0
best_params = None
results = defaultdict(list)

for params in itertools.product(*param_grid.values()):
    Learning_Rate, Pred_Type, Dropout, Act = params
    print(f"Testing with lr={Learning_Rate}, pred_type={Pred_Type}, act={Act}, dropout={Dropout}")
    
    acc = run_experiment(Learning_Rate, Pred_Type, Dropout, Act)
    results[(Learning_Rate, Pred_Type, Dropout, Act)].append(acc)
    
    # Track the best parameters
    if acc > best_acc:
        best_acc = acc
        best_params = (Learning_Rate, Pred_Type, Dropout, Act)
        print(f"New best model with accuracy {best_acc} and params {best_params}")

# Print best found parameters
print("Best Parameters:", best_params)


Testing with lr=0.01, pred_type=softmax, act=relu, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 8], 'act': 'relu', 'class_weights': True, 'dropout': True}

Number params:  3074208
New best model with accuracy 0.944266788487894 and params (0.01, 'softmax', True, 'relu')
Testing with lr=0.01, pred_type=softmax, act=prelu, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 8], 'act': 'prelu', 'class_weights': True, 'dropout': True}

Number params:  3074408
Testing with lr=0.01, pred_type=softmax, act=sigmoid, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 8], 'act': 'sigmoid', 'class_weights': True, 'dropout': True}

Number params:  3074208
Testing with lr=0.01, pred_type=softmax, act=tanh, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 8], 'a

## Actual training and testing with best parameters

In [36]:
# config
"""
Most Relevant
"""

debug = False
gpu = -1
use_comet_ml = False
logs = False

"""
dataset:
 sentiment suffix for twitter means the negative classes of the original dataset are combined and the other classes are combined for sentiment analysis
 presplit suffix means training and test are predetermined in [dataset]_labels.txt
 small suffix means a very small dataset used for debugging
"""
random_seed = 123
dataset = 'r8_presplit'
# dataset = 'ag_presplit'

if 'ag' in dataset:
    num_labels = 4
elif 'r8' in dataset:
    num_labels = 8

"""
Model. Pt1
"""

model = "TextGNN"

model_params = {}
use_edge_weights = False
init_type = 'one_hot_init'
if model == 'TextGNN':
    pred_type = 'softmax'
    node_embd_type = 'graphsage'
    layer_dim_list = [200, num_labels]
    num_layers = len(layer_dim_list)
    class_weights = True
    dropout = False
    s = 'TextGraphSAGE:pred_type={},node_embd_type={},num_layers={},layer_dim_list={},act={},' \
        'dropout={},class_weights={}'.format(
        pred_type, node_embd_type, num_layers, "_".join([str(i) for i in layer_dim_list]), 'relu', dropout, class_weights
    )
    model_params = {
        'pred_type': pred_type,
        'node_embd':  node_embd_type,
        'num_layers': num_layers,
        'layer_dims': layer_dim_list,
        'act': 'prelu',
        'class_weights': class_weights,
        'dropout': dropout
    }
else:
    raise NotImplementedError

print("{}: {}\n".format(model, model_params))

"""
Sampling
"""
word_window_size = 10
validation_window_size = 10

"""
Validation
"""
validation_metric = "accuracy"  # Alternatively, "f1_weighted" or "loss"

use_best_val_model_for_inference = True

"""
Evaluation.
"""
tvt_ratio = [0.8, 0.1, 0.1]
tvt_list = ["train", "test", "val"]

"""
Optimization.
"""

lr = 0.01

device = 'cuda'#.format(gpu) if torch.cuda.is_available() and gpu != -1 else 'cpu'

num_epochs = 2 if debug else 400

"""
Other info.
"""
# Assuming get_user() and get_host() are function calls that need to be defined or imported
user = get_user()
hostname = get_host()

TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 8], 'act': 'prelu', 'class_weights': True, 'dropout': False}



In [37]:
dataset = 'r8_presplit'
num_experiments = 5
random_seeds = [33, 15, 86, 109, 78]
model = "TextGNN"
all_experiment_results = []
for exp in range(num_experiments):
    random_seed = random_seeds[exp]
    saver = Saver()
    train_data, val_data, test_data, raw_doc_list = load_data()
    
    saved_model, model = train(train_data, val_data, saver, False)
    with torch.no_grad():
        test_loss_model, preds_model = model(train_data.get_pyg_graph(device=device), test_data)
    eval_res = eval(preds_model, test_data, True)
    y_true = eval_res.pop('y_true')
    y_pred = eval_res.pop('y_pred')
    print("Test...experiment ", exp+1)
    pprint(eval_res)
    all_experiment_results.append(eval_res)
    model = "TextGNN"
    
# Calculate mean and standard deviation across experiments
final_metrics = {key: [] for key in all_experiment_results[0]}
for results in all_experiment_results:
    for key in results:
        final_metrics[key].append(results[key])

for metric in final_metrics:
    values = np.array(final_metrics[metric])
    mean = values.mean()
    std = values.std()
    print(f'{metric}: Mean={mean:.6f}, Std={std:.6f}')

Number params:  3074408
Test...experiment  1
{'accuracy': 0.9643672910004568,
 'f1_macro': 0.9038464848725574,
 'f1_micro': 0.9643672910004568,
 'f1_weighted': 0.9645816313775437,
 'precision_macro': 0.8904182462664343,
 'precision_micro': 0.9643672910004568,
 'precision_weighted': 0.9654787119766619,
 'recall_macro': 0.9232344870902105,
 'recall_micro': 0.9643672910004568,
 'recall_weighted': 0.9643672910004568}
Number params:  3074408
Test...experiment  2
{'accuracy': 0.9597989949748744,
 'f1_macro': 0.8904713473916541,
 'f1_micro': 0.9597989949748744,
 'f1_weighted': 0.9602068516576928,
 'precision_macro': 0.8693514559060807,
 'precision_micro': 0.9597989949748744,
 'precision_weighted': 0.9612201274827927,
 'recall_macro': 0.9185654086724129,
 'recall_micro': 0.9597989949748744,
 'recall_weighted': 0.9597989949748744}
Number params:  3074408
Test...experiment  3
{'accuracy': 0.9625399725902238,
 'f1_macro': 0.9013959170621434,
 'f1_micro': 0.9625399725902238,
 'f1_weighted': 0.9628

## Hyperparameter Tuning Twitter Dataset

We perform hyperparameter tuning by grid search. 
Possible hyperparameters to tune are :
- pred_type
- Learning rate
- activation function

In [30]:
import itertools
import torch
from collections import defaultdict
import warnings

debug = False
gpu = -1
use_comet_ml = False

random_seed = 123
dataset = 'twitter_asian_prejudice'

if 'twitter_asian_prejudice' in dataset:
    if 'sentiment' in dataset:
        num_labels = 2
    else:
        num_labels = 4
elif 'r8' in dataset:
    num_labels = 8

# Define the parameter grid
param_grid = {
    'lr': [1e-2, 1e-3, 1e-4],
    'pred_type': ['softmax', 'mlp'],
    'dropout': [True, False],
    'act' : ['relu', 'prelu', 'sigmoid', 'tanh']
}


"""
Sampling
"""
word_window_size = 10
validation_window_size = 10

"""
Validation
"""
validation_metric = "accuracy"  # Alternatively, "f1_weighted" or "loss"

use_best_val_model_for_inference = True

"""
Evaluation.
"""
tvt_ratio = [0.8, 0.1, 0.1]
tvt_list = ["train", "test", "val"]
model = "TextGNN"
device = 'cuda'#.format(gpu) if torch.cuda.is_available() and gpu != -1 else 'cpu'
    
num_epochs = 2 if debug else 400
lr = 2e-2
use_edge_weights = False
init_type = 'one_hot_init'
logs = False
model_params = {}
# Function to create a model with specified hyperparameters
def run_experiment(Learning_Rate, Pred_Type, Dropout, Act):
    warnings.filterwarnings('ignore')
    global lr, model, num_labels, dataset, use_edge_weights, init_type, model_params
    lr = Learning_Rate

    if model == 'TextGNN':
        pred_type = Pred_Type
        node_embd_type = 'graphsage'
        layer_dim_list = [200, num_labels]
        num_layers = len(layer_dim_list)
        class_weights = True
        dropout = Dropout
        s = 'TextGraphSAGE:pred_type={},node_embd_type={},num_layers={},layer_dim_list={},act={},' \
            'dropout={},class_weights={}'.format(
            pred_type, node_embd_type, num_layers, "_".join([str(i) for i in layer_dim_list]), Act, dropout, class_weights
        )
        model_params = {
            'pred_type': pred_type,
            'node_embd':  node_embd_type,
            'num_layers': num_layers,
            'layer_dims': layer_dim_list,
            'act': Act,
            'class_weights': class_weights,
            'dropout': dropout
        }
    else:
        raise NotImplementedError
    
    print("{}: {}\n".format(model, model_params))
    saver = Saver()
    train_data, val_data, test_data, raw_doc_list = load_data()
    
    saved_model, model = train(train_data, val_data, saver, False)
    with torch.no_grad():
        test_loss_model, preds_model = model(train_data.get_pyg_graph(device=device), test_data)

    eval_res = eval(preds_model, test_data, True)
    y_true = eval_res.pop('y_true')
    y_pred = eval_res.pop('y_pred')
    acc = eval_res['accuracy']
    model = "TextGNN"
    return acc

# Perform grid search
best_acc = 0
best_params = None
results = defaultdict(list)

for params in itertools.product(*param_grid.values()):
    Learning_Rate, Pred_Type, Dropout, Act = params
    print(f"Testing with lr={Learning_Rate}, pred_type={Pred_Type}, act={Act}, dropout={Dropout}")
    
    acc = run_experiment(Learning_Rate, Pred_Type, Dropout, Act)
    results[(Learning_Rate, Pred_Type, Dropout, Act)].append(acc)
    
    # Track the best parameters
    if acc > best_acc:
        best_acc = acc
        best_params = (Learning_Rate, Pred_Type, Dropout, Act)
        print(f"New best model with accuracy {best_acc} and params {best_params}")

# Print best found parameters
print("Best Parameters:", best_params)


Testing with lr=0.01, pred_type=softmax, act=relu, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 4], 'act': 'relu', 'class_weights': True, 'dropout': True}

Number params:  5212404
New best model with accuracy 0.7325 and params (0.01, 'softmax', True, 'relu')
Testing with lr=0.01, pred_type=softmax, act=prelu, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 4], 'act': 'prelu', 'class_weights': True, 'dropout': True}

Number params:  5212604
Testing with lr=0.01, pred_type=softmax, act=sigmoid, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 4], 'act': 'sigmoid', 'class_weights': True, 'dropout': True}

Number params:  5212404
Testing with lr=0.01, pred_type=softmax, act=tanh, dropout=True
TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 4], 'act': 'tanh'

In [13]:
# config
"""
Most Relevant
"""

debug = False
gpu = -1
use_comet_ml = False

"""
dataset:
 sentiment suffix for twitter means the negative classes of the original dataset are combined and the other classes are combined for sentiment analysis
 presplit suffix means training and test are predetermined in [dataset]_labels.txt
 small suffix means a very small dataset used for debugging
"""
random_seed = 123
# dataset = 'r8_presplit'
# dataset = 'ag_presplit'
dataset = 'twitter_asian_prejudice'
if 'twitter_asian_prejudice' in dataset:
    if 'sentiment' in dataset:
        num_labels = 2
    else:
        num_labels = 4
elif 'r8' in dataset:
    num_labels = 8

"""
Model. Pt1
"""

model = "TextGNN"

model_params = {}
use_edge_weights = False
init_type = 'one_hot_init'
if model == 'TextGNN':
    pred_type = 'softmax'
    node_embd_type = 'graphsage'
    layer_dim_list = [200, num_labels]
    num_layers = len(layer_dim_list)
    class_weights = True
    dropout = False
    s = 'TextGraphSAGE:pred_type={},node_embd_type={},num_layers={},layer_dim_list={},act={},' \
        'dropout={},class_weights={}'.format(
        pred_type, node_embd_type, num_layers, "_".join([str(i) for i in layer_dim_list]), 'tanh', dropout, class_weights
    )
    model_params = {
        'pred_type': pred_type,
        'node_embd':  node_embd_type,
        'num_layers': num_layers,
        'layer_dims': layer_dim_list,
        'act': 'tanh',
        'class_weights': class_weights,
        'dropout': dropout
    }
else:
    raise NotImplementedError

print("{}: {}\n".format(model, model_params))

"""
Sampling
"""
word_window_size = 10
validation_window_size = 10

"""
Validation
"""
validation_metric = "accuracy"  # Alternatively, "f1_weighted" or "loss"

use_best_val_model_for_inference = True

"""
Evaluation.
"""
tvt_ratio = [0.8, 0.1, 0.1]
tvt_list = ["train", "test", "val"]

"""
Optimization.
"""
logs = False
lr = 0.001

device = 'cuda:{}'.format(gpu) if torch.cuda.is_available() and gpu != -1 else 'cpu'

num_epochs = 2 if debug else 400

"""
Other info.
"""
# Assuming get_user() and get_host() are function calls that need to be defined or imported
user = get_user()
hostname = get_host()

TextGNN: {'pred_type': 'softmax', 'node_embd': 'graphsage', 'num_layers': 2, 'layer_dims': [200, 4], 'act': 'tanh', 'class_weights': True, 'dropout': False}



In [17]:
import warnings

warnings.filterwarnings('ignore')
dataset = 'twitter_asian_prejudice'
num_experiments = 5
random_seeds = [33, 15, 86, 109, 78]
model = "TextGNN"
all_experiment_results = []
for exp in range(num_experiments):
    random_seed = random_seeds[exp]
    saver = Saver()
    train_data, val_data, test_data, raw_doc_list = load_data()
    
    saved_model, model = train(train_data, val_data, saver, False)
    with torch.no_grad():
        test_loss_model, preds_model = model(train_data.get_pyg_graph(device=device), test_data)
    eval_res = eval(preds_model, test_data, True)
    y_true = eval_res.pop('y_true')
    y_pred = eval_res.pop('y_pred')
    print("Test...experiment ", exp+1)
    pprint(eval_res)
    all_experiment_results.append(eval_res)
    model = "TextGNN"
    
# Calculate mean and standard deviation across experiments
final_metrics = {key: [] for key in all_experiment_results[0]}
for results in all_experiment_results:
    for key in results:
        final_metrics[key].append(results[key])

for metric in final_metrics:
    values = np.array(final_metrics[metric])
    mean = values.mean()
    std = values.std()
    print(f'{metric}: Mean={mean:.6f}, Std={std:.6f}')

Number params:  5212404
Test...experiment  1
{'accuracy': 0.7335,
 'f1_macro': 0.5417735238436896,
 'f1_micro': 0.7335,
 'f1_weighted': 0.7302325299057705,
 'precision_macro': 0.5471763390681791,
 'precision_micro': 0.7335,
 'precision_weighted': 0.7389061471583507,
 'recall_macro': 0.5672478591335415,
 'recall_micro': 0.7335,
 'recall_weighted': 0.7335}
Number params:  5212404
Test...experiment  2
{'accuracy': 0.7365,
 'f1_macro': 0.47897767657709145,
 'f1_micro': 0.7365,
 'f1_weighted': 0.6887778509339222,
 'precision_macro': 0.600049244491693,
 'precision_micro': 0.7365,
 'precision_weighted': 0.7063421654396634,
 'recall_macro': 0.44994194208189886,
 'recall_micro': 0.7365,
 'recall_weighted': 0.7365}
Number params:  5212404
Test...experiment  3
{'accuracy': 0.7235,
 'f1_macro': 0.4726740510532783,
 'f1_micro': 0.7235,
 'f1_weighted': 0.6706321914311322,
 'precision_macro': 0.6029520359105459,
 'precision_micro': 0.7235,
 'precision_weighted': 0.7133096712524563,
 'recall_macro': 0