In [21]:
import pandas

# import the data
df = pandas.read_csv('../data/tcga_brca_all_clean.csv')
df = df.iloc[:, 9:]

df_cnv = df.filter(regex='_cnv', axis=1)
df_rnaseq = df.filter(regex='_rnaseq', axis=1)
df_cnv

Unnamed: 0,NOTCH2_cnv,ACP6_cnv,ANKRD20A12P_cnv,ANKRD34A_cnv,ANKRD35_cnv,BCL9_cnv,BOLA1_cnv,CD160_cnv,CHD1L_cnv,FAM72B_cnv,...,SLC2A4RG_cnv,SOX18_cnv,STMN3_cnv,TCEA2_cnv,TNFRSF6B_cnv,TPD52L2_cnv,UCKL1_cnv,ZBTB46_cnv,ZGPAT_cnv,ZNF512B_cnv
0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,2,2,2,2,2,2,2,2,2,2
1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,2,2,2,2,2,2,2,2,2,2
2,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
3,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,2,2,2,2,2,2,2,2,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1021,2,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Transpose: rows=genes, cols=samples
gene_features = df_cnv.T
genes = gene_features.index.tolist()

print(gene_features.shape)  # Should be (2523, 1023)

(2523, 1023)


In [23]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between genes
sim_matrix = cosine_similarity(gene_features)

# Threshold to form edges
threshold = 0.9  # adjust based on sparsity you desire
edge_index = np.argwhere(sim_matrix > threshold)

# Remove self-loops and duplicates
edge_index = edge_index[edge_index[:,0] != edge_index[:,1]]
edge_index = edge_index.T

print(f"Number of edges: {edge_index.shape[1]}")


Number of edges: 426962


In [24]:
import torch
from torch_geometric.data import Data

# Node features: CNV profiles (2523 genes x 1023 samples)
x = torch.tensor(gene_features.values, dtype=torch.float)

# Edge index (2, num_edges)
edge_index = torch.tensor(edge_index, dtype=torch.long)

data = Data(x=x, edge_index=edge_index)

print(data)


Data(x=[2523, 1023], edge_index=[2, 426962])


In [29]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, BatchNorm

class ComplexCNVGAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=256, embed_dim=64, heads=4, dropout=0.3):
        super(ComplexCNVGAT, self).__init__()
        
        self.gat1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.norm1 = BatchNorm(hidden_dim * heads)
        
        self.gat2 = GATConv(hidden_dim * heads, hidden_dim, heads=heads, dropout=dropout)
        self.norm2 = BatchNorm(hidden_dim * heads)

        # Final embedding layer (single head)
        self.gat3 = GATConv(hidden_dim * heads, embed_dim, heads=1, concat=True, dropout=dropout)
        self.norm3 = BatchNorm(embed_dim)
        
        self.dropout = dropout

    def forward(self, x, edge_index):
        # Layer 1
        x = self.gat1(x, edge_index)
        x = self.norm1(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        # Layer 2
        x = self.gat2(x, edge_index)
        x = self.norm2(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        # Final Embedding Layer
        x = self.gat3(x, edge_index)
        x = self.norm3(x)
        
        return x


In [30]:
# Initialize model
input_dim = 1023  # Number of patients/samples
hidden_dim = 256
embed_dim = 64
heads = 4

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ComplexCNVGAT(input_dim, hidden_dim, embed_dim, heads=heads).to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
model.train()

data = data.to(device)
model.train()

epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    embeddings = model(data.x, data.edge_index)

    # Unsupervised scenario: Graph reconstruction or embedding smoothness
    # For demonstration, using smoothness loss (nodes close in graph should have similar embeddings)
    loss = torch.norm(embeddings[data.edge_index[0]] - embeddings[data.edge_index[1]], p=2).mean()

    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")


Epoch 0, Loss: 950.5826
Epoch 10, Loss: 460.2672
Epoch 20, Loss: 434.7167
Epoch 30, Loss: 383.9095
Epoch 40, Loss: 339.6143
Epoch 50, Loss: 313.7188
Epoch 60, Loss: 288.6734
Epoch 70, Loss: 269.1189
Epoch 80, Loss: 247.0448
Epoch 90, Loss: 238.1196
