In [None]:
import torch
import timm
import torchvision.transforms as transforms
from PIL import Image, ImageOps
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import os
import h5py
from torch_geometric.data import Data, DataLoader

# Load model pre-trained dari TIMM dengan memastikan kita mengambil fitur sebelum FC
model = timm.create_model('inception_resnet_v2', pretrained=True, features_only=True)
model.eval()

# Batch 1: Load Image & Preprocessing
def load_image(image_path, target_size=(299, 299)):
    """Memuat gambar dengan mempertahankan aspek rasio dan menambahkan padding jika perlu."""
    image = Image.open(image_path).convert("RGB")
    image.thumbnail(target_size, Image.LANCZOS)
    delta_w = target_size[0] - image.size[0]
    delta_h = target_size[1] - image.size[1]
    padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
    image = ImageOps.expand(image, padding, fill=(0, 0, 0))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    return transform(image).unsqueeze(0)

# Batch 2: Extract Features
def extract_features(image_tensor, model):
    """Mengekstrak fitur dari gambar sesuai ukuran hasil CNN tanpa memaksa ukuran tetap."""
    model.eval()
    with torch.no_grad():
        feature_maps = model(image_tensor)

    features = torch.cat(feature_maps, dim=1)
    W, H = features.shape[2], features.shape[3]  # Menggunakan ukuran asli dari CNN
    print(f"Feature shape: {features.shape}, W={W}, H={H}")
    return features.squeeze(0), W, H

# Batch 3: Create Graph
def create_sparse_feature_graph(features, W, H):
    """Membuat graph berbasis grid dengan mempertahankan struktur spasial asli sesuai hasil CNN."""
    D = features.shape[0]
    feature_map = features.view(D, W * H).T  # Sesuai dengan aspek spasial dari CNN
    G = nx.Graph()

    for i in range(W * H):
        G.add_node(i, feature=feature_map[i].tolist())

    for x in range(W):
        for y in range(H):
            node_idx = x * H + y
            neighbors = [(x+dx, y+dy) for dx, dy in [(-1,0), (1,0), (0,-1), (0,1)]]
            for nx_, ny_ in neighbors:
                if 0 <= nx_ < W and 0 <= ny_ < H:
                    neighbor_idx = nx_ * H + ny_
                    G.add_edge(node_idx, neighbor_idx)

    return G

# Batch 4: Save Features & Graphs to HDF5
def save_graphs_to_hdf5(dataset_path, hdf5_filename="graph_dataset.hdf5"):
    """Menyimpan beberapa graph dari banyak gambar ke dalam HDF5 dengan label untuk klasifikasi GAT."""
    with h5py.File(hdf5_filename, 'w') as hf:
        for label, category in enumerate(["Tidak Estetik", "Estetik"]):  # 0: Tidak Estetik, 1: Estetik
            category_path = os.path.join(dataset_path, category)
            for idx, filename in enumerate(os.listdir(category_path)):
                if filename.endswith(('.jpg', '.png', '.jpeg')):
                    image_path = os.path.join(category_path, filename)
                    print(f"Processing: {image_path}")

                    image_tensor = load_image(image_path)
                    features, W, H = extract_features(image_tensor, model)
                    feature_graph = create_sparse_feature_graph(features, W, H)

                    print(f"[DEBUG] Image {idx} ({category}): W={W}, H={H}, Nodes={W * H}")

                    # Konversi graph ke format PyTorch Geometric
                    node_features = torch.tensor([feature_graph.nodes[n]['feature'] for n in feature_graph.nodes], dtype=torch.float)
                    edge_index = torch.tensor(list(feature_graph.edges)).t().contiguous()
                    graph_data = Data(x=node_features, edge_index=edge_index, y=torch.tensor([label], dtype=torch.long))

                    # Simpan setiap graph dalam grup HDF5
                    grp_name = f"graph_{category}_{idx}_size_{W}x{H}"
                    grp = hf.create_group(grp_name)
                    grp.create_dataset("node_features", data=np.array(graph_data.x, dtype=np.float32))
                    grp.create_dataset("edge_index", data=np.array(graph_data.edge_index, dtype=np.int64))
                    grp.attrs["label"] = label
                    grp.attrs["W"] = W
                    grp.attrs["H"] = H

    print(f"Graph dataset disimpan dalam {hdf5_filename}")

# Jalankan pipeline
dataset_path = 'samples/'  # Path ke dataset yang berisi folder 'Estetik' dan 'Tidak Estetik'
save_graphs_to_hdf5(dataset_path)

Processing: samples/Tidak Estetik\000_00198.jpg


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 147 but got size 71 for tensor number 1 in the list.

# New Section

In [None]:
import torch
import timm
import torchvision.transforms as transforms
from PIL import Image, ImageOps
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import os
import h5py
import pandas as pd
from torch_geometric.data import Data, DataLoader

# Load model pre-trained dari TIMM untuk ekstraksi fitur
model = timm.create_model('inception_resnet_v2', pretrained=True, features_only=True)
model.eval()

# Fungsi memuat gambar dengan mempertahankan rasio aspek menggunakan padding
def load_image(image_path, target_size=(299, 299)):
    image = Image.open(image_path).convert("RGB")
    original_size = image.size  # Simpan ukuran asli gambar

    # Menjaga rasio aspek dengan menambahkan padding hitam
    image.thumbnail(target_size, Image.LANCZOS)
    delta_w = target_size[0] - image.size[0]
    delta_h = target_size[1] - image.size[1]
    padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
    image = ImageOps.expand(image, padding, fill=(0, 0, 0))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    return transform(image).unsqueeze(0), original_size

# Fungsi ekstraksi fitur dari Inception-ResNet-v2 untuk digunakan dalam Graph Attention Network (GAT)
def extract_features(image_tensor, model):
    model.eval()
    with torch.no_grad():
        feature_maps = model(image_tensor)  # Mengambil semua layer fitur

    # Gunakan resolusi spasial dari layer terakhir sebagai ukuran target
    target_W, target_H = feature_maps[-1].shape[2], feature_maps[-1].shape[3]

    # Interpolasi semua feature maps ke ukuran dari layer terakhir
    resized_features = [torch.nn.functional.interpolate(f, size=(target_W, target_H), mode='bilinear', align_corners=False) for f in feature_maps]

    # Gabungkan semua feature maps dalam dimensi kedalaman
    features = torch.cat(resized_features, dim=1).squeeze(0)  # Bentuk akhir: (D, W, H)

    print(f"Extracted Features Shape: {features.shape}, W={target_W}, H={target_H}")
    return features, target_W, target_H

# Fungsi membuat graph berbasis fitur dengan koneksi tetangga terdekat
def create_feature_graph(features, W, H):
    D = features.shape[0]
    feature_map = features.view(D, W * H).T.numpy()
    G = nx.Graph()

    for i in range(W * H):
        G.add_node(i, feature=feature_map[i])

    for x in range(W):
        for y in range(H):
            node_idx = x * H + y
            neighbors = [(x+dx, y+dy) for dx, dy in [(-1,0), (1,0), (0,-1), (0,1)]]
            for nx_, ny_ in neighbors:
                if 0 <= nx_ < W and 0 <= ny_ < H:
                    neighbor_idx = nx_ * H + ny_
                    G.add_edge(node_idx, neighbor_idx)

    return G

# Fungsi menyimpan dataset ke HDF5 dan CSV untuk digunakan dalam GNN (Graph Neural Networks)
def save_graphs_to_hdf5_and_csv(dataset_path, hdf5_filename="graph_dataset.hdf5", csv_filename="graph_dataset.csv"):
    csv_data = []

    with h5py.File(hdf5_filename, 'w') as hf:
        for label, category in enumerate(["Tidak Estetik", "Estetik"]):
            category_path = os.path.join(dataset_path, category)
            for idx, filename in enumerate(os.listdir(category_path)):
                if filename.endswith(('.jpg', '.png', '.jpeg')):
                    image_path = os.path.join(category_path, filename)
                    image_tensor, original_size = load_image(image_path)
                    features, W, H = extract_features(image_tensor, model)

                    print(f"Processing Image: {filename}, Original Size: {original_size}, Features Shape: {features.shape}, W={W}, H={H}")

                    try:
                        feature_graph = create_feature_graph(features, W, H)

                        # Konversi graph ke format PyTorch Geometric
                        node_features = torch.tensor([feature_graph.nodes[n]['feature'] for n in feature_graph.nodes], dtype=torch.float)
                        edge_index = torch.tensor(list(feature_graph.edges)).t().contiguous()
                        graph_data = Data(x=node_features, edge_index=edge_index, y=torch.tensor([label], dtype=torch.long))

                        # Simpan ke HDF5
                        grp_name = f"graph_{category}_{idx}_size_{W}x{H}"
                        grp = hf.create_group(grp_name)
                        grp.create_dataset("node_features", data=np.array(graph_data.x, dtype=np.float32))
                        grp.create_dataset("edge_index", data=np.array(graph_data.edge_index, dtype=np.int64))
                        grp.attrs["label"] = label
                        grp.attrs["W"] = W
                        grp.attrs["H"] = H
                        grp.attrs["Original_Size"] = original_size
                        print(f"Saved: {grp_name}")

                        # Simpan metadata dan struktur graph ke CSV
                        csv_data.append([
                            filename, category, W, H, original_size[0], original_size[1],
                            graph_data.x.flatten().tolist(),  # Flattened node features
                            graph_data.edge_index.numpy().tolist()  # Edge index
                        ])

                    except Exception as e:
                        print(f"Error processing {filename}: {e}")
                        continue  # Lewati gambar yang menyebabkan error

    # Simpan data ke CSV dengan kolom tambahan untuk graph
    df = pd.DataFrame(csv_data, columns=[
        "Filename", "Category", "W", "H", "Original_Width", "Original_Height", "Node_Features", "Edge_Index"
    ])
    df.to_csv(csv_filename, index=False)
    print(f"CSV dataset disimpan dalam {csv_filename}")
    print(f"Graph dataset disimpan dalam {hdf5_filename}")


# Jalankan pipeline
dataset_path = 'samples/'
save_graphs_to_hdf5_and_csv(dataset_path)

Extracted Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Processing Image: 000_00198.jpg, Original Size: (255, 255), Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Saved: graph_Tidak Estetik_0_size_8x8


  grp.create_dataset("node_features", data=np.array(graph_data.x, dtype=np.float32))
  grp.create_dataset("edge_index", data=np.array(graph_data.edge_index, dtype=np.int64))


Extracted Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Processing Image: 000_00398.jpg, Original Size: (255, 339), Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Saved: graph_Tidak Estetik_1_size_8x8
Extracted Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Processing Image: 000_00413.jpg, Original Size: (255, 344), Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Saved: graph_Tidak Estetik_2_size_8x8
Extracted Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Processing Image: 001_10033.jpg, Original Size: (382, 255), Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Saved: graph_Tidak Estetik_3_size_8x8
Extracted Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Processing Image: 002_00054.jpg, Original Size: (255, 255), Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Saved: graph_Tidak Estetik_4_size_8x8
Extracted Features Shape: torch.Size([3200, 8, 8]), W=8, H=8
Processing Image: 002_00301.jpg, Original Size: (255, 339), Features Shape: torch.Size([3200, 8, 8]

test

In [None]:
import pandas as pd

# Load CSV
df = pd.read_csv("graph_dataset.csv")

# Cek apakah semua kolom ada
print(df.columns)

# Cek 5 data pertama
print(df.head())


Index(['Filename', 'Category', 'W', 'H', 'Original_Width', 'Original_Height',
       'Node_Features', 'Edge_Index'],
      dtype='object')
        Filename       Category  W  H  Original_Width  Original_Height  \
0  000_00198.jpg  Tidak Estetik  8  8             255              255   
1  000_00398.jpg  Tidak Estetik  8  8             255              339   
2  000_00413.jpg  Tidak Estetik  8  8             255              344   
3  001_10033.jpg  Tidak Estetik  8  8             382              255   
4  002_00054.jpg  Tidak Estetik  8  8             255              255   

                                       Node_Features  \
0  [0.7795566916465759, 0.7630141377449036, 0.272...   
1  [0.10722190886735916, 1.1118348836898804, 0.08...   
2  [0.10722190886735916, 1.1118348836898804, 0.08...   
3  [0.10722190886735916, 1.1118348836898804, 0.08...   
4  [0.8973767161369324, 0.49998754262924194, 0.17...   

                                          Edge_Index  
0  [[0, 0, 1, 1, 2, 2, 3

In [None]:
import ast
from torch_geometric.utils import from_networkx

# Fungsi memuat dataset dari CSV ke dalam PyTorch Geometric Dataset
class GraphDataset(Dataset):
    def __init__(self, csv_file):
        super(GraphDataset, self).__init__()
        self.data = pd.read_csv(csv_file)

    def len(self):
        return len(self.data)

    def get(self, idx):
        row = self.data.iloc[idx]

        # Konversi fitur node dari string ke tensor
        node_features = torch.tensor(ast.literal_eval(row["Node_Features"]), dtype=torch.float)

        # Konversi edge index dari string ke tensor
        edge_index = torch.tensor(ast.literal_eval(row["Edge_Index"]), dtype=torch.long).t().contiguous()

        # Konversi kategori ke label 0/1
        label = torch.tensor([1 if row["Category"] == "Estetik" else 0], dtype=torch.long)

        # Membuat batch index agar bisa di-batch dalam GAT
        batch = torch.zeros(node_features.shape[0], dtype=torch.long)

        return Data(x=node_features, edge_index=edge_index, y=label, batch=batch)

# Load dataset
csv_filename = "graph_dataset.csv"
dataset = GraphDataset(csv_filename)

# Debugging
print(f"Total Graphs: {len(dataset)}")
sample = dataset.get(0)
print(f"Sample Node Features Shape: {sample.x.shape}")
print(f"Sample Edge Index Shape: {sample.edge_index.shape}")
print(f"Sample Label: {sample.y}")


Total Graphs: 20
Sample Node Features Shape: torch.Size([204800])
Sample Edge Index Shape: torch.Size([112, 2])
Sample Label: tensor([0])


In [None]:
from torch_geometric.nn import GATConv, global_mean_pool
import torch.nn as nn

class GATClassifier(torch.nn.Module):
    def __init__(self, in_channels=3200, hidden_channels=512, out_channels=2, heads=8):
        super(GATClassifier, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, dropout=0.6)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.elu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)

        # Gunakan mean pooling untuk mendapatkan satu output per graph
        x = global_mean_pool(x, batch)

        return F.log_softmax(x, dim=1)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training Loop
def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)  # Sudah dikonversi dengan Mean Pooling agar ukuran tetap
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Testing Loop
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            out = model(data)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item()
            total += data.y.size(0)
    return correct / total

# Jalankan Training
epochs = 50
for epoch in range(epochs):
    loss = train()
    acc = test()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}, Accuracy: {acc:.4f}")


IndexError: tuple index out of range

In [None]:
def save_features_to_hdf5(folder_path, hdf5_filename="dataset_features.hdf5"):
    """Menyimpan fitur dari gambar ke dalam HDF5 tanpa reduksi fitur."""
    with h5py.File(hdf5_filename, 'w') as hf:
        for idx, filename in enumerate(os.listdir(folder_path)):
            if filename.endswith(('.jpg', '.png', '.jpeg')):
                image_path = os.path.join(folder_path, filename)
                print(f"Processing: {image_path}")

                image_tensor = load_image(image_path)
                features, W, H = extract_features(image_tensor, model)

                grp = hf.create_group(f"image_{idx}")
                grp.create_dataset("features", data=features.numpy())
                grp.attrs["W"] = W
                grp.attrs["H"] = H

    print(f"Dataset fitur disimpan dalam {hdf5_filename}")

In [None]:

# Jalankan pipeline
folder_path = 'samples/'
save_features_to_hdf5(folder_path)

Processing: samples/000_00198.jpg


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 147 but got size 71 for tensor number 1 in the list.

In [None]:
import torch
import timm
import torchvision.transforms as transforms
from PIL import Image, ImageOps
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import os
import h5py

# Load model pre-trained dari TIMM dengan memastikan kita mengambil fitur sebelum FC
model = timm.create_model('inception_resnet_v2', pretrained=True, features_only=True)
model.eval()

# Batch 1: Load Image & Preprocessing
def load_image(image_path, target_size=(299, 299)):
    """Memuat gambar dengan mempertahankan aspek rasio dan menambahkan padding jika perlu."""
    image = Image.open(image_path).convert("RGB")
    image.thumbnail(target_size, Image.LANCZOS)
    delta_w = target_size[0] - image.size[0]
    delta_h = target_size[1] - image.size[1]
    padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
    image = ImageOps.expand(image, padding, fill=(0, 0, 0))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    return transform(image).unsqueeze(0)

def extract_features(image_tensor, model, fixed_size=(8, 8)):
    """Mengekstrak fitur dari gambar dan memastikan ukuran fitur tetap."""
    model.eval()
    with torch.no_grad():
        feature_maps = model(image_tensor)

    # **Interpolasi agar semua feature maps memiliki ukuran tetap**
    features = torch.cat([
        torch.nn.functional.interpolate(f, size=fixed_size, mode='bilinear')
        for f in feature_maps
    ], dim=1)

    print(f"[DEBUG] Feature shape after resizing: {features.shape}")  # Debugging ukuran fitur
    return features.squeeze(0), fixed_size[0], fixed_size[1]


# Batch 3: Create Graph
def create_sparse_feature_graph(features, W, H):
    """Membuat graph berbasis grid dengan mempertahankan struktur spasial asli."""
    D = features.shape[0]
    feature_map = features.view(D, -1).T
    G = nx.Graph()

    for i in range(W * H):
        G.add_node(i, feature=feature_map[i])

    for x in range(W):
        for y in range(H):
            node_idx = x * H + y
            neighbors = [(x+dx, y+dy) for dx, dy in [(-1,0), (1,0), (0,-1), (0,1)]]
            for nx_, ny_ in neighbors:
                if 0 <= nx_ < W and 0 <= ny_ < H:
                    neighbor_idx = nx_ * H + ny_
                    G.add_edge(node_idx, neighbor_idx)

    return G

# Batch 4: Save Features & Graphs to HDF5
def save_features_and_graphs_to_hdf5(folder_path, hdf5_filename="dataset_features.hdf5"):
    """Menyimpan fitur dan graph ke dalam HDF5 tanpa mereduksi fitur."""
    with h5py.File(hdf5_filename, 'w') as hf:
        for idx, filename in enumerate(os.listdir(folder_path)):
            if filename.endswith(('.jpg', '.png', '.jpeg')):
                image_path = os.path.join(folder_path, filename)
                print(f"Processing: {image_path}")

                image_tensor = load_image(image_path)
                features, W, H = extract_features(image_tensor, model)
                feature_graph = create_sparse_feature_graph(features, W, H)

                grp = hf.create_group(f"image_{idx}")
                grp.create_dataset("features", data=features.numpy())
                grp.attrs["W"] = W
                grp.attrs["H"] = H

                # Simpan adjacency matrix dari graph
                adj_matrix = nx.to_numpy_array(feature_graph)
                grp.create_dataset("adjacency_matrix", data=adj_matrix)

    print(f"Dataset fitur dan graph disimpan dalam {hdf5_filename}")

# Jalankan pipeline
folder_path = 'samples/'
save_features_and_graphs_to_hdf5(folder_path)

Processing: samples/000_00198.jpg
[DEBUG] Feature shape after resizing: torch.Size([1, 3200, 8, 8])
Processing: samples/000_00644.jpg
[DEBUG] Feature shape after resizing: torch.Size([1, 3200, 8, 8])
Processing: samples/000_10075.jpg
[DEBUG] Feature shape after resizing: torch.Size([1, 3200, 8, 8])
Processing: samples/001_00632.jpg
[DEBUG] Feature shape after resizing: torch.Size([1, 3200, 8, 8])
Processing: samples/004_00165.jpg
[DEBUG] Feature shape after resizing: torch.Size([1, 3200, 8, 8])
Dataset fitur dan graph disimpan dalam dataset_features.hdf5


## TESTING CODE GNN


In [None]:
edges_path = 'datasets-master/git_web_ml/git_edges.csv'
targets_path = 'datasets-master/git_web_ml/git_target.csv'
features_path = 'datasets-master/git_web_ml/git_features.json'

# Read in edges
edges = pd.read_csv(edges_path)
edges.columns = ['source', 'target'] # renaming for StellarGraph compatibility

# Read in features
with open(features_path) as json_data:
    features = json.load(json_data)

max_feature = np.max([v for v_list in features.values() for v in v_list])
features_matrix = np.zeros(shape = (len(list(features.keys())), max_feature+1))

i = 0
for k, vs in tqdm(features.items()):
    for v in vs:
        features_matrix[i, v] = 1
    i+=1

node_features = pd.DataFrame(features_matrix, index = features.keys()) # into dataframe for StellarGraph

# Read in targets
targets = pd.read_csv(targets_path)
targets.index = targets.id.astype(str)
targets = targets.loc[features.keys(), :]

NameError: name 'pd' is not defined

In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import torch
from torch_geometric.data import Data
from sklearn import preprocessing

edges_path = 'git_web_ml/musae_git_edges.csv'
targets_path = 'git_web_ml/musae_git_target.csv'
features_path = 'git_web_ml/musae_git_features.json'

# Read in edges
edges = pd.read_csv(edges_path)
edges.columns = ['source', 'target'] # renaming for StellarGraph compatibility

# Read in features
with open(features_path) as json_data:
    features = json.load(json_data)

max_feature = np.max([v for v_list in features.values() for v in v_list])
features_matrix = np.zeros(shape = (len(list(features.keys())), max_feature+1))

i = 0
for k, vs in tqdm(features.items()):
    for v in vs:
        features_matrix[i, v] = 1
    i+=1

node_features = pd.DataFrame(features_matrix, index = features.keys()) # into dataframe for StellarGraph

# Read in targets
targets = pd.read_csv(targets_path)
targets.index = targets.id.astype(str)
targets = targets.loc[features.keys(), :]

100%|██████████| 37700/37700 [00:00<00:00, 66851.11it/s]


In [None]:
# Konversi ke format PyG
import torch
from torch_geometric.data import Data
from sklearn import preprocessing

# Konversi edge list menjadi tensor [2, num_edges]
edge_index = torch.tensor(edges.values.T, dtype=torch.long)

# Konversi fitur ke tensor [num_nodes, num_features]
x = torch.tensor(node_features.values, dtype=torch.float)

# Encode label ke angka (misalnya kolom "ml_target")
label_encoder = preprocessing.LabelEncoder()
y = torch.tensor(label_encoder.fit_transform(targets["ml_target"]), dtype=torch.long)

# Buat objek Data dari PyTorch Geometric
data = Data(x=x, edge_index=edge_index, y=y)

# (Opsional) Cetak informasi graph
print(data)
print(f"Jumlah node: {data.num_nodes}")
print(f"Jumlah edge: {data.num_edges}")
print(f"Dimensi fitur: {data.num_node_features}")
print(f"Jumlah kelas: {len(label_encoder.classes_)}")


Data(x=[37700, 4005], edge_index=[2, 289003], y=[37700])
Jumlah node: 37700
Jumlah edge: 289003
Dimensi fitur: 4005
Jumlah kelas: 2


In [None]:
from sklearn.model_selection import train_test_split

train_pages, test_pages = train_test_split(targets, train_size=200)
val_pages, test_pages = train_test_split(test_pages, train_size=200)
print(train_pages.shape, val_pages.shape, test_pages.shape)


(200, 3) (200, 3) (37300, 3)


In [None]:
from sklearn.preprocessing import LabelEncoder

target_encoding = LabelEncoder()

train_targets = torch.tensor(target_encoding.fit_transform(train_pages['ml_target']), dtype=torch.long)
val_targets = torch.tensor(target_encoding.transform(val_pages['ml_target']), dtype=torch.long)
test_targets = torch.tensor(target_encoding.transform(test_pages['ml_target']), dtype=torch.long)


In [None]:
from torch_geometric.utils import add_self_loops, degree

# 1. Tambahkan self-loop
edge_index, _ = add_self_loops(data.edge_index, num_nodes=data.num_nodes)

# 2. Hitung derajat tiap node
row, col = edge_index
deg = degree(col, data.num_nodes, dtype=torch.float)

# 3. Hitung D^(-1/2)
deg_inv_sqrt = deg.pow(-0.5)
deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0

# 4. Hitung edge_weight ternormalisasi
edge_weight = deg_inv_sqrt[row] * deg_inv_sqrt[col]

# edge_index + edge_weight = adjacency ternormalisasi versi GCN
# Bisa dipakai langsung di GCNConv:
from torch_geometric.nn import GCNConv

conv1 = GCNConv(in_channels=data.num_node_features, out_channels=16)

x = conv1(data.x, edge_index, edge_weight=edge_weight)


In [None]:
import torch

# Buat boolean mask (default: semua False)
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

# Isi dengan True untuk node yang digunakan
train_mask[train_pages.index.astype(int)] = True
val_mask[val_pages.index.astype(int)] = True
test_mask[test_pages.index.astype(int)] = True

# Tambahkan ke objek data
data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask
