In [None]:
import pandas

# import the data
df = pandas.read_csv('../data/tcga_brca_all_clean.csv')
df = df.iloc[:, 9:]

df_cnv = df.filter(regex='_cnv', axis=1)
df_rnaseq = df.filter(regex='_rnaseq', axis=1)
df_mut = df.filter(regex='_mut', axis=1)

Unnamed: 0,UBE2Q2P2_rnaseq,SSX9_rnaseq,CXORF67_rnaseq,EFCAB8_rnaseq,SDR16C6P_rnaseq,EFCAB12_rnaseq,A1BG_rnaseq,A1CF_rnaseq,RBFOX1_rnaseq,GGACT_rnaseq,...,ZWINT_rnaseq,ZXDA_rnaseq,ZXDB_rnaseq,ZXDC_rnaseq,ZYG11A_rnaseq,ZYG11B_rnaseq,ZYX_rnaseq,ZZEF1_rnaseq,ZZZ3_rnaseq,TPTEP1_rnaseq
0,1.7786,0.3153,6.6560,2.0252,-0.116,-0.4304,0.2155,-0.0589,-0.2273,-0.6056,...,0.3196,-0.1816,-0.4114,1.0797,1.0743,-1.0477,0.9689,-0.3256,-1.1244,2.4038
1,1.7786,0.3153,6.6560,2.0252,-0.116,-0.4304,0.2155,-0.0589,-0.2273,-0.6056,...,0.3196,-0.1816,-0.4114,1.0797,1.0743,-1.0477,0.9689,-0.3256,-1.1244,2.4038
2,0.2949,-0.0752,-0.1184,0.7840,-0.116,-0.5511,1.1618,0.1349,-0.2273,1.2238,...,3.2911,-1.1222,-0.1302,-1.4793,2.5267,-1.3026,0.9433,-1.1210,-0.9310,-0.6472
3,0.2949,-0.0752,-0.1184,0.7840,-0.116,-0.5511,1.1618,0.1349,-0.2273,1.2238,...,3.2911,-1.1222,-0.1302,-1.4793,2.5267,-1.3026,0.9433,-1.1210,-0.9310,-0.6472
4,0.6318,-0.0752,-0.1184,0.4122,-0.116,-0.0254,-0.0206,-0.0589,-0.2273,-0.7747,...,-0.2431,-0.3657,-0.4472,-1.0557,0.8232,-1.0342,1.0729,-1.4634,-0.5710,-0.5402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,-0.7923,-0.0752,-0.0801,1.0912,-0.116,-0.4945,1.4065,-0.0589,-0.2273,-0.7904,...,-1.1686,-1.5424,-1.0868,-1.4608,-0.0141,-2.0773,3.9959,-1.4221,-1.0560,1.7134
1019,2.5181,-0.0752,0.0112,0.1302,-0.116,-0.5155,0.1434,-0.0589,-0.1814,0.6160,...,-0.6444,0.2435,0.2311,0.7384,-0.8560,-0.7456,0.7535,0.9765,-0.6290,-0.4313
1020,3.5801,-0.0752,0.7033,0.0402,-0.116,-0.0863,0.3059,0.4459,-0.0621,-0.4222,...,-0.1942,-0.6109,-0.7641,1.2241,0.7029,-0.5363,0.9538,-0.0743,-0.0954,0.0371
1021,-0.0576,-0.0752,-0.1184,-0.4279,-0.116,-0.4510,1.2449,-0.0589,-0.1577,-0.3944,...,-1.0210,-1.2416,-1.0705,-1.3571,-0.5215,-1.3964,1.6214,-1.5199,-0.4771,0.0497


In [22]:
# Transpose: rows=genes, cols=samples
gene_features = df_cnv.T
genes = gene_features.index.tolist()

print(gene_features.shape)  # Should be (2523, 1023)

(2523, 1023)


In [23]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between genes
sim_matrix = cosine_similarity(gene_features)

# Threshold to form edges
threshold = 0.9  # adjust based on sparsity you desire
edge_index = np.argwhere(sim_matrix > threshold)

# Remove self-loops and duplicates
edge_index = edge_index[edge_index[:,0] != edge_index[:,1]]
edge_index = edge_index.T

print(f"Number of edges: {edge_index.shape[1]}")


Number of edges: 426962


In [24]:
import torch
from torch_geometric.data import Data

# Node features: CNV profiles (2523 genes x 1023 samples)
x = torch.tensor(gene_features.values, dtype=torch.float)

# Edge index (2, num_edges)
edge_index = torch.tensor(edge_index, dtype=torch.long)

data = Data(x=x, edge_index=edge_index)

print(data)


Data(x=[2523, 1023], edge_index=[2, 426962])


In [29]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, BatchNorm

class ComplexCNVGAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=256, embed_dim=64, heads=4, dropout=0.3):
        super(ComplexCNVGAT, self).__init__()
        
        self.gat1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.norm1 = BatchNorm(hidden_dim * heads)
        
        self.gat2 = GATConv(hidden_dim * heads, hidden_dim, heads=heads, dropout=dropout)
        self.norm2 = BatchNorm(hidden_dim * heads)

        # Final embedding layer (single head)
        self.gat3 = GATConv(hidden_dim * heads, embed_dim, heads=1, concat=True, dropout=dropout)
        self.norm3 = BatchNorm(embed_dim)
        
        self.dropout = dropout

    def forward(self, x, edge_index):
        # Layer 1
        x = self.gat1(x, edge_index)
        x = self.norm1(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        # Layer 2
        x = self.gat2(x, edge_index)
        x = self.norm2(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        # Final Embedding Layer
        x = self.gat3(x, edge_index)
        x = self.norm3(x)
        
        return x


In [30]:
# Initialize model
input_dim = 1023  # Number of patients/samples
hidden_dim = 256
embed_dim = 64
heads = 4

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ComplexCNVGAT(input_dim, hidden_dim, embed_dim, heads=heads).to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
model.train()

data = data.to(device)
model.train()

epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    embeddings = model(data.x, data.edge_index)

    # Unsupervised scenario: Graph reconstruction or embedding smoothness
    # For demonstration, using smoothness loss (nodes close in graph should have similar embeddings)
    loss = torch.norm(embeddings[data.edge_index[0]] - embeddings[data.edge_index[1]], p=2).mean()

    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")


Epoch 0, Loss: 950.5826
Epoch 10, Loss: 460.2672
Epoch 20, Loss: 434.7167
Epoch 30, Loss: 383.9095
Epoch 40, Loss: 339.6143
Epoch 50, Loss: 313.7188
Epoch 60, Loss: 288.6734
Epoch 70, Loss: 269.1189
Epoch 80, Loss: 247.0448
Epoch 90, Loss: 238.1196
