In [1]:
import numpy as np
import torch
from scipy.sparse import load_npz
import json

adjacency_matrix = load_npz('data_2024/adj.npz')
features = np.load('data_2024/features.npy')
labels = np.load('data_2024/labels.npy')

with open('data_2024/splits.json', 'r') as f:
    splits = json.load(f)

train_indices = splits['idx_train']
test_indices = splits['idx_test']

features_tensor = torch.FloatTensor(features)
labels_tensor = torch.LongTensor(labels)
features_tensor, labels_tensor
print("features_tensor: ", features_tensor.shape)
print("labels_tensor: ",labels_tensor.shape)
print("train_indices: ",len(train_indices))
print("test_indices: ",len(test_indices))

features_tensor:  torch.Size([2480, 1390])
labels_tensor:  torch.Size([496])
train_indices:  496
test_indices:  1984


In [2]:
def adjacency_to_edge_index(adjacency_matrix):
    adjacency_coo = adjacency_matrix.tocoo()
    # Create the edge index from COO format
    row = torch.from_numpy(adjacency_coo.row.astype(np.int64))
    col = torch.from_numpy(adjacency_coo.col.astype(np.int64))
    #  expect a 2 x Num_edges matrix for edge_index
    edge_index = torch.stack([row, col], dim=0)
    return edge_index


edge_index = adjacency_to_edge_index(adjacency_matrix)
print("edge_index: ", edge_index.shape)


edge_index:  torch.Size([2, 10100])


In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch.optim as optim

num_nodes = 2480
# Assuming features_tensor, edge_index, labels_tensor, train_indices are already defined

# Initialize the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Number of node features and classes
num_node_features = features_tensor.shape[1]
num_classes = labels_tensor.max().item() + 1

# Initialize the full-sized labels tensor with placeholders for test nodes
padded_labels_tensor = torch.full((num_nodes,), -1, dtype=torch.long)
padded_labels_tensor[train_indices] = labels_tensor

# Create the train mask
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_indices] = True

# Create the Data object with the full-sized labels tensor
data = Data(x=features_tensor, edge_index=edge_index, y=padded_labels_tensor)
data = data.to(device)
data


Data(x=[2480, 1390], edge_index=[2, 10100], y=[2480])

In [5]:
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_hidden, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, num_hidden)
        self.conv2 = GCNConv(num_hidden, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

In [6]:
data.y[train_mask]

tensor([5, 0, 2, 0, 6, 6, 1, 6, 1, 2, 0, 6, 2, 3, 0, 2, 2, 6, 4, 2, 2, 3, 4, 5,
        3, 3, 2, 0, 5, 4, 6, 0, 1, 2, 2, 0, 2, 1, 3, 2, 4, 0, 2, 2, 2, 1, 2, 4,
        1, 0, 1, 6, 3, 2, 2, 4, 6, 0, 2, 0, 5, 5, 3, 2, 5, 1, 2, 1, 2, 2, 6, 2,
        2, 5, 4, 0, 2, 0, 1, 0, 0, 2, 3, 5, 1, 6, 1, 0, 2, 2, 2, 1, 1, 1, 4, 0,
        2, 3, 1, 1, 0, 2, 2, 0, 2, 2, 2, 2, 3, 3, 1, 3, 3, 3, 6, 2, 2, 2, 6, 2,
        0, 1, 2, 2, 3, 1, 6, 2, 1, 2, 2, 6, 2, 2, 6, 2, 2, 0, 0, 1, 6, 2, 2, 3,
        2, 5, 0, 3, 6, 6, 1, 1, 2, 3, 3, 5, 3, 4, 4, 2, 2, 2, 2, 6, 6, 6, 2, 4,
        3, 3, 1, 0, 3, 2, 2, 2, 4, 4, 2, 5, 6, 1, 3, 3, 5, 3, 6, 6, 2, 2, 3, 1,
        2, 6, 6, 3, 5, 6, 3, 4, 1, 3, 6, 6, 0, 2, 3, 6, 2, 6, 1, 4, 4, 3, 4, 1,
        4, 3, 0, 2, 3, 3, 0, 1, 2, 2, 0, 2, 2, 6, 0, 1, 2, 2, 5, 6, 0, 2, 0, 0,
        1, 0, 6, 3, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 2, 1, 2, 2, 1, 2, 0, 6, 1,
        6, 2, 3, 6, 6, 6, 2, 4, 4, 0, 0, 4, 0, 4, 4, 4, 4, 6, 6, 3, 1, 1, 2, 2,
        2, 5, 6, 2, 3, 3, 3, 3, 3, 2, 2,

In [7]:
from sklearn.model_selection import KFold
import numpy as np

# Assuming the initial setup as before
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Convert train_indices to a numpy array for easy indexing
train_indices_np = np.array(train_indices)

# Cross-validation
accuracies = []
for fold, (train_idx, val_idx) in enumerate(kf.split(train_indices_np)):
    print(f"Fold {fold+1}")

    # Create masks for the current fold
    fold_train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    fold_val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    fold_train_mask[train_indices_np[train_idx]] = True
    fold_val_mask[train_indices_np[val_idx]] = True

    # Initialize the model and optimizer
    model = GCN(num_node_features, 16, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    # Train
    for epoch in range(200):
        model.train()
        optimizer.zero_grad()
        out = model(data.x.to(device), data.edge_index.to(device))
        loss = F.nll_loss(F.log_softmax(out[fold_train_mask], dim=1), data.y[fold_train_mask])
        loss.backward()
        optimizer.step()

    # Validate
    model.eval()
    with torch.no_grad():
        out = model(data.x.to(device), data.edge_index.to(device))
        preds = out[fold_val_mask].argmax(dim=1)
        correct = preds.eq(data.y[fold_val_mask]).sum().item()
        total = fold_val_mask.sum().item()
        accuracy = correct / total
        accuracies.append(accuracy)
        print(f"Validation accuracy for fold {fold+1}: {accuracy:.4f}")

# Train on full training set again
model = GCN(num_node_features, 16, num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_indices] = True
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data.x.to(device), data.edge_index.to(device))
    loss = F.nll_loss(F.log_softmax(out[train_mask], dim=1), data.y[train_mask])
    loss.backward()
    optimizer.step()



Fold 1
Validation accuracy for fold 1: 0.8300
Fold 2
Validation accuracy for fold 2: 0.7879
Fold 3
Validation accuracy for fold 3: 0.8182
Fold 4
Validation accuracy for fold 4: 0.8889
Fold 5
Validation accuracy for fold 5: 0.8485


In [8]:
# Predict on the full dataset
with torch.no_grad():
    full_predictions = model(features_tensor.to(device), edge_index.to(device))

# Get the predicted classes for the entire dataset
full_preds_classes = full_predictions.argmax(dim=1)

train_preds_classes = full_preds_classes[train_indices]

train_correct = train_preds_classes.eq(labels_tensor.to(device)).sum().item()
train_total = len(train_indices)  # or labels_tensor.size(0)
train_accuracy = train_correct / train_total

print(f"Training accuracy: {train_accuracy:.4f}")


Training accuracy: 0.9919


In [9]:
model.eval()

with torch.no_grad():
    full_predictions = model(features_tensor.to(device), edge_index.to(device))

# Get the predicted classes for the entire dataset
full_preds_classes = full_predictions.argmax(dim=1)

# Extract the predictions for the test nodes
test_preds_classes = full_preds_classes[test_indices]

# The test_preds_classes now contains the predicted classes for the test nodes
print(test_preds_classes)


tensor([1, 3, 2,  ..., 4, 1, 3])


In [12]:
np.savetxt('submission_zhu.txt', test_preds_classes, fmt='%d')