In [1]:
import pandas as pd

# Load your raw dataset (assuming it's a CSV)
df = pd.read_csv('../Datasets/processed_data.csv')
print(df.head())


   campaign_item_id  no_of_days  ext_service_id  creative_id  template_id  \
0              2733    0.059322             128         1000         90.0   
1              2733    0.067797              16         1000         90.0   
2              2733    0.076271             128         1000         90.0   
3              2733    0.084746             128         1000         90.0   
4              2733    0.093220               4         1000         90.0   

   advertiser_id  network_id  channel_id  campaign_budget_usd  clicks  ...  \
0           4756         190          32             0.201396       8  ...   
1           4756         190           8             0.201396      44  ...   
2           4756         190           8             0.201396      32  ...   
3           4756         190          64             0.201396      48  ...   
4           4756         190          32             0.201396      20  ...   

   advertiser_emb_40  advertiser_emb_41  advertiser_emb_42  advertis

### Getting all the relevent columns to the relevent nodes

In [2]:
# Campaign Node - Collect features for the campaign node
campaign_features = df[['campaign_item_id'] + [col for col in df.columns if col.startswith('zone_')] + [col for col in df.columns if col.startswith('search_tag_emb_')]]

# Platform Node - Collect features for the platform node 
platform_features = df[['ext_service_id'] + [col for col in df.columns if col.startswith('plateform_')]]

# Advertiser Node - Collect features for the advertiser node 
advertiser_features = df[['advertiser_id'] + [col for col in df.columns if col.startswith('advertiser_emb_')]]

# Creative Node - Collect features for the creative node 
creative_features = df[['creative_id','creative_dimension']]

# Template Node - Collect features for the template node
template_features = df[['template_id']]

# Channel Node - Collect features for the channel node
channel_features = df[[col for col in df.columns if col.startswith('channel_')]]

# Network Node - Collect features for the network node
network_features = df[['network_id']]

# Landing Page Node - Collect features for the landing page node 
landing_page_features = df[[col for col in df.columns if col.startswith('landing_page_emb_')]]

# Time Node - Collect features for the time node (day_of_week, month, day_of_year, etc.)
time_features = df[['day_of_week', 'month', 'day_of_year']]

# Keyword Node - Collect features for the keyword node (keyword embeddings)
keyword_features = df[[col for col in df.columns if col.startswith('keyword_')]]

print(template_features)

       template_id
0             90.0
1             90.0
2             90.0
3             90.0
4             90.0
...            ...
69195         90.0
69196         90.0
69197         90.0
69198         90.0
69199         90.0

[69200 rows x 1 columns]


#### Creating id's for nodes which dont have id

In [3]:
# To uniquely identify each keyword embedding, we'll hash the rows
df['keyword_id'] = keyword_features.apply(lambda row: hash(tuple(row)), axis=1)

In [4]:
# To uniquely identify each keyword embedding, we'll hash the rows
df['landing_page_id'] = landing_page_features.apply(lambda row: hash(tuple(row)), axis=1)

In [5]:
# Create a unique identifier for each time combination
df['time_id'] = df['day_of_week'].astype(str) + '_' + df['month'].astype(str) + '_' + df['day_of_year'].astype(str)

In [6]:
print(type(channel_features['channel_id']))


<class 'pandas.core.series.Series'>


In [7]:
print(channel_features.columns)


Index(['channel_id', 'channel_Display', 'channel_Mobile', 'channel_Search',
       'channel_Social', 'channel_Video'],
      dtype='object')


#### Building the graph structure

In [8]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

# Creating the Nodes    
campaign = torch.tensor(campaign_features.values, dtype=torch.float)  # node features for Campaign
platform = torch.tensor(platform_features.values, dtype=torch.float)  # node features for Platform
advertiser = torch.tensor(advertiser_features.values, dtype=torch.float)  # node features for Advertiser
creative = torch.tensor(creative_features.values, dtype=torch.float)  # node features for Creative
template = torch.tensor(template_features.values, dtype=torch.float) # node features for  template
channel = torch.tensor(channel_features.values, dtype=torch.float)  # node features for channel
time = torch.tensor(time_features.values, dtype=torch.float)   # node features for Time
keyword = torch.tensor(keyword_features.values, dtype=torch.float)
network = torch.tensor(network_features.values, dtype=torch.float)  # node features for Network
landing_page = torch.tensor(landing_page_features.values, dtype=torch.float)

# MAKING CONNECTION BETWEEN NODES
# Mapping IDs to indices for each node type
campaign_id_to_index = {cid: idx for idx, cid in enumerate(campaign_features['campaign_item_id'].unique())}
platform_id_to_index = {pid: idx for idx, pid in enumerate(platform_features['ext_service_id'].unique())}
advertiser_id_to_index = {aid: idx for idx, aid in enumerate(advertiser_features['advertiser_id'].unique())}
network_id_to_index = {nid: idx for idx, nid in enumerate(network_features['network_id'].unique())}
creative_id_to_index = {crid: idx for idx, crid in enumerate(creative_features['creative_id'].unique())}
template_id_to_index = {tid: idx for idx, tid in enumerate(template_features['template_id'].unique())}
channel_id_to_index = {chid: idx for idx, chid in enumerate(channel_features['channel_id'].unique())}
keyword_id_to_index = {kid: idx for idx, kid in enumerate(df['keyword_id'].unique())}
landing_page_id_to_index = {lid: idx for idx, lid in enumerate(df['landing_page_id'].unique())}
time_id_to_index = {tid: idx for idx, tid in enumerate(df['time_id'].unique())}


# CREATING EDGES

# Campaign -> Platform
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['ext_service_id'].map(platform_id_to_index).values
edge_index_campaign_platform = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Advertiser
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['advertiser_id'].map(advertiser_id_to_index).values
edge_index_campaign_advertiser = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Network
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['network_id'].map(network_id_to_index).values
edge_index_campaign_network = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Channel
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['channel_id'].map(channel_id_to_index).values
edge_index_campaign_channel = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Keyword
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['keyword_id'].map(keyword_id_to_index).values
edge_index_campaign_keyword = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Time
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['time_id'].map(time_id_to_index).values
edge_index_campaign_time = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Platform -> Channel
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['channel_id'].map(channel_id_to_index).values
edge_index_platform_channel = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Platform -> Time
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['time_id'].map(time_id_to_index).values
edge_index_platform_time = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Plateform -> Keyword
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['keyword_id'].map(keyword_id_to_index).values
edge_index_campaign_keyword = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Creative -> Campaign
source_nodes = df['creative_id'].map(creative_id_to_index).values
target_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
edge_index_creative_campaign = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Creative -> Template
source_nodes = df['creative_id'].map(creative_id_to_index).values
target_nodes = df['template_id'].map(template_id_to_index).values
edge_index_creative_template = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> LandingPage
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['landing_page_id'].map(landing_page_id_to_index).values
edge_index_campaign_landingpage = torch.tensor([source_nodes, target_nodes], dtype=torch.long)



  edge_index_campaign_platform = torch.tensor([source_nodes, target_nodes], dtype=torch.long)


In [9]:
edge_features_campaign_platform = df[['no_of_days', 'media_cost_usd'] + [col for col in df.columns if col.startswith('weekday_week_')]]
edge_features_campaign_channel = df[['media_cost_usd','normalized_impressions']]
edge_features_campaign_clicks = df[['clicks','campaign_budget_usd']]


In [10]:
print(campaign.shape)
print(platform.shape)
print(advertiser.shape)
print(creative.shape)
print(time.shape)
print(channel.shape)
print(network.shape)
print(landing_page.shape)
print(keyword.shape)



torch.Size([69200, 58])
torch.Size([69200, 4])
torch.Size([69200, 51])
torch.Size([69200, 2])
torch.Size([69200, 3])
torch.Size([69200, 6])
torch.Size([69200, 1])
torch.Size([69200, 50])
torch.Size([69200, 50])


In [11]:
import torch
import torch.nn.functional as F

# Define maximum feature dimension (based on the largest tensor)
max_dim = max(campaign.shape[1], platform.shape[1], advertiser.shape[1], creative.shape[1], 
              time.shape[1], channel.shape[1], network.shape[1], landing_page.shape[1], keyword.shape[1])

def pad_tensor(tensor, max_dim):
    """Pads tensor to match max_dim."""
    current_dim = tensor.shape[1]
    if current_dim < max_dim:
        padding_size = max_dim - current_dim
        # Pad the tensor with zeros
        tensor = F.pad(tensor, (0, padding_size), value=0)
    return tensor

# Padding the feature tensors
campaign = pad_tensor(campaign, max_dim)
platform = pad_tensor(platform, max_dim)
advertiser = pad_tensor(advertiser, max_dim)
creative = pad_tensor(creative, max_dim)
time = pad_tensor(time, max_dim)
channel = pad_tensor(channel, max_dim)
network = pad_tensor(network, max_dim)
landing_page = pad_tensor(landing_page, max_dim)
keyword = pad_tensor(keyword, max_dim)

# Check that all tensors have the same dimension
print(campaign.shape)
print(platform.shape)
print(advertiser.shape)
print(creative.shape)
print(time.shape)
print(channel.shape)
print(network.shape)
print(landing_page.shape)
print(keyword.shape)

# Now you can safely concatenate them
node_features = torch.cat([campaign, platform, advertiser, creative, time, channel, network, landing_page, keyword], dim=0)

print(node_features.shape)


torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([622800, 58])


In [12]:
import torch
import torch.nn.functional as F

# Function to pad edge features to the maximum number of columns
def pad_edge_features(*edge_features):
    max_columns = max([feature.shape[1] for feature in edge_features])  # Find the max column size
    padded_edge_features = []
    
    for feature in edge_features:
        if feature.shape[1] < max_columns:
            # Padding with zeros (0) to match the max column size
            padding_size = max_columns - feature.shape[1]
            padded_feature = F.pad(feature, (0, padding_size), value=0)  # Pad only along the last dimension (columns)
            padded_edge_features.append(padded_feature)
        else:
            padded_edge_features.append(feature)
    
    return padded_edge_features

# Convert edge features to tensors if they are DataFrames or Series
edge_attr_campaign_platform = torch.tensor(edge_features_campaign_platform.values, dtype=torch.float) if isinstance(edge_features_campaign_platform, pd.DataFrame) else edge_attr_campaign_platform
edge_attr_campaign_channel = torch.tensor(edge_features_campaign_channel.values, dtype=torch.float) if isinstance(edge_features_campaign_channel, pd.DataFrame) else edge_features_campaign_channel
edge_attr_campaign_clicks = torch.tensor(edge_features_campaign_clicks.values, dtype=torch.float) if isinstance(edge_features_campaign_clicks, pd.DataFrame) else edge_features_campaign_clicks


# Pad edge features to have the same number of columns
edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks = pad_edge_features(
    edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks
)

# Concatenate edge features after padding
edge_attr = torch.cat([edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks], dim=0)

# Check the shape of concatenated edge features
print(edge_attr.shape)


torch.Size([207600, 4])


In [13]:
from torch_geometric.data import Data

# Define edge_index (source and target nodes)
edge_index = torch.cat([edge_index_campaign_platform, edge_index_campaign_advertiser, edge_index_creative_campaign,
                        edge_index_campaign_time, edge_index_campaign_channel, edge_index_campaign_network,
                        edge_index_campaign_landingpage, edge_index_campaign_keyword,edge_index_creative_template,
                        edge_index_platform_channel,edge_index_platform_time], dim=1)  # Add all edge indices

# Define edge attributes (edge features)
edge_attr = torch.cat([edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks ], dim=0)

# Combine node features into one tensor
node_features = torch.cat([campaign, platform, advertiser, creative, time, channel, network, landing_page, keyword], dim=0)


In [15]:
# Check the size of the edge_index and edge_attr
num_edges = edge_index.shape[1]  # Number of edges in the graph
num_edge_features = edge_attr.shape[1] if edge_attr.shape[0] > 0 else 0  # Number of features per edge

# If there are missing edge features, pad with zeros
# First, calculate how many additional edge features are needed
if num_edges > edge_attr.shape[0]:
    missing_edge_features = num_edges - edge_attr.shape[0]
    
    # Create a tensor of zeros to pad missing edge features
    default_edge_features = torch.zeros((missing_edge_features, num_edge_features), dtype=torch.float)
    
    # Concatenate the default edge features to match the number of edges
    edge_attr = torch.cat([edge_attr, default_edge_features], dim=0)

# Now, create the data object
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)

# Verify the sizes
print("Node features shape:", node_features.shape)
print("Edge index shape:", edge_index.shape)
print("Edge attributes shape:", edge_attr.shape)
print(data)

Node features shape: torch.Size([622800, 58])
Edge index shape: torch.Size([2, 761200])
Edge attributes shape: torch.Size([761200, 4])
Data(x=[622800, 58], edge_index=[2, 761200], edge_attr=[761200, 4])


In [16]:
assert edge_attr.shape[0] == edge_index.shape[1], "Edge attribute count mismatch"

In [17]:
num_nodes_campaigns = len(df['campaign_item_id'].unique())  # Count unique campaign nodes
num_nodes_plateform = len(df['ext_service_id'].unique())
num_nodes_channel = len(df['channel_id'].unique())
num_nodes_time = len(df['time_id'].unique())
num_nodes_template = len(df['template_id'].unique())
num_nodes_creative = len(df['creative_id'].unique())
num_nodes_network = len(df['network_id'].unique())
num_nodes_advertiser = len(df['advertiser_id'].unique())
num_nodes_keyword = len(df['keyword_id'].unique())
num_nodes_landingpage = len(df['landing_page_id'].unique())
print(num_nodes_campaigns + num_nodes_plateform + num_nodes_channel + num_nodes_time + num_nodes_template + num_nodes_creative + num_nodes_network +
      num_nodes_advertiser + num_nodes_keyword + num_nodes_landingpage)

3131


In [18]:
print("Edge Index (Campaign -> Platform):", edge_index_campaign_platform)
print("Edge Index (Campaign -> Advertiser):", edge_index_campaign_advertiser)
print("Edge Index (Campaign -> Network):", edge_index_campaign_network)
print("Edge Index (Campaign -> Channel):", edge_index_campaign_channel)
print("Edge Index (Campaign -> Keyword):", edge_index_campaign_keyword)
print("Edge Index (Campaign -> Time):", edge_index_campaign_time)
print("Edge Index (Platform -> Channel):", edge_index_platform_channel)



Edge Index (Campaign -> Platform): tensor([[  0,   0,   0,  ..., 140, 140, 140],
        [  0,   1,   0,  ...,   0,   0,   1]])
Edge Index (Campaign -> Advertiser): tensor([[  0,   0,   0,  ..., 140, 140, 140],
        [  0,   0,   0,  ...,  11,  11,  11]])
Edge Index (Campaign -> Network): tensor([[  0,   0,   0,  ..., 140, 140, 140],
        [  0,   0,   0,  ...,   5,   5,   5]])
Edge Index (Campaign -> Channel): tensor([[  0,   0,   0,  ..., 140, 140, 140],
        [  0,   1,   1,  ...,   4,   1,   1]])
Edge Index (Campaign -> Keyword): tensor([[ 0,  1,  0,  ...,  0,  0,  1],
        [ 0,  1,  2,  ..., 50, 36, 26]])
Edge Index (Campaign -> Time): tensor([[  0,   1,   0,  ...,   0,   0,   1],
        [  0,   1,   2,  ..., 207, 208, 209]])
Edge Index (Platform -> Channel): tensor([[0, 1, 0,  ..., 0, 0, 1],
        [0, 1, 1,  ..., 4, 1, 1]])


In [19]:
# import torch
# from torch_geometric.data import Data, DataLoader

# # List to hold Data objects for each graph
# graph_data_list = []

# # Loop through the DataFrame to create graphs
# for _, row in df.iterrows():
#     # Fetch node features and other graph-related data from your dataset (modify as per your needs)
#     node_features = torch.randn(3131, 58)  # Replace with your actual node features
#     edge_index = torch.randint(0, 100, (2, 150))  # Replace with actual edge indices
#     edge_attr = torch.randn(150, 4)  # Replace with actual edge attributes
    
#     # Create Data object for this graph
#     graph = Data(
#         x=node_features,  # Node features
#         edge_index=edge_index,  # Edge indices
#         edge_attr=edge_attr,  # Edge attributes
#         y=torch.tensor([row['clicks']])  # Target (clicks for this graph)
#     )
    
#     # Add the graph to the list
#     graph_data_list.append(graph)

# # Define a DataLoader for batching
# batch_size = 2  # Number of graphs per batch
# data_loader = DataLoader(graph_data_list, batch_size=batch_size, shuffle=True)

# # Iterate through the DataLoader
# for batch in data_loader:
#     print("Batched Node Features Shape:", batch.x.shape)
#     print("Batched Edge Index Shape:", batch.edge_index.shape)
#     print("Batched Edge Attributes Shape:", batch.edge_attr.shape)
#     print("Batched Targets (data.y):", batch.y)
#     print("Batch object:", batch)
#     break  # To only show one batch example


In [20]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, global_mean_pool

class GNNModel(nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels=64):
        super(GNNModel, self).__init__()
        
        # Define the GCN layers
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, out_channels)
        
        # If needed, add a fully connected layer for final predictions
        self.fc = nn.Linear(out_channels, 1)  # For click prediction (regression)
        
    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        
        # Apply the first convolution layer with ReLU activation
        x = self.conv1(x, edge_index, edge_attr)
        x = torch.relu(x)
        
        # Apply the second convolution layer
        x = self.conv2(x, edge_index, edge_attr)
        x = torch.relu(x)
        
        # Apply the third convolution layer
        x = self.conv3(x, edge_index, edge_attr)
        
        # Use global mean pooling (can be modified depending on your task)
        x = global_mean_pool(x, data.batch)  # Data.batch assumes batched graphs, otherwise, skip
        
        # Final prediction using a fully connected layer
        out = self.fc(x)
        return out


In [21]:
import torch.optim as optim
from torch import nn

# Initialize the model
model = GNNModel(in_channels=node_features.shape[1], out_channels=64)  # Adjust as needed

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()  # For regression (click prediction)

# Training loop
num_epochs = 100  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()




    # Forward pass
    out = model(data)  # Pass the data to the model

    # Assuming 'data.y' contains the target values (clicks in this case)
    loss = criterion(out, data.y)  # 'data.y' should contain the actual number of clicks

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    # Print the loss every few epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')


RuntimeError: The size of tensor a (5403916) must match the size of tensor b (1350979) at non-singleton dimension 0

In [23]:
campaign_features = campaign_features.to_numpy()
platform_features = platform_features.to_numpy()
advertiser_features = advertiser_features.to_numpy()
network_features = network_features.to_numpy()
channel_features = channel_features.to_numpy()
keyword_features = keyword_features.to_numpy()
time_features = time_features.to_numpy()
creative_features = creative_features.to_numpy()
template_features = template_features.to_numpy()
landing_page_features = landing_page_features.to_numpy()

In [24]:
from torch_geometric.data import HeteroData

# Initialize a HeteroData object
hetero_data = HeteroData()

# Add nodes and features (assuming you have features for each type of node)
hetero_data['campaign'].x = torch.tensor(campaign_features, dtype=torch.float)
hetero_data['platform'].x = torch.tensor(platform_features, dtype=torch.float)
hetero_data['advertiser'].x = torch.tensor(advertiser_features, dtype=torch.float)
hetero_data['network'].x = torch.tensor(network_features, dtype=torch.float)
hetero_data['channel'].x = torch.tensor(channel_features, dtype=torch.float)
hetero_data['keyword'].x = torch.tensor(keyword_features, dtype=torch.float)
hetero_data['time'].x = torch.tensor(time_features, dtype=torch.float)
hetero_data['creative'].x = torch.tensor(creative_features, dtype=torch.float)
hetero_data['template'].x = torch.tensor(template_features, dtype=torch.float)
hetero_data['landing_page'].x = torch.tensor(landing_page_features, dtype=torch.float)

# Add edge indices
hetero_data['campaign', 'to', 'platform'].edge_index = edge_index_campaign_platform
hetero_data['campaign', 'to', 'advertiser'].edge_index = edge_index_campaign_advertiser
hetero_data['campaign', 'to', 'network'].edge_index = edge_index_campaign_network
hetero_data['campaign', 'to', 'channel'].edge_index = edge_index_campaign_channel
hetero_data['campaign', 'to', 'keyword'].edge_index = edge_index_campaign_keyword
hetero_data['campaign', 'to', 'time'].edge_index = edge_index_campaign_time
hetero_data['platform', 'to', 'channel'].edge_index = edge_index_platform_channel
hetero_data['platform', 'to', 'time'].edge_index = edge_index_platform_time
hetero_data['platform', 'to', 'keyword'].edge_index = edge_index_campaign_keyword
hetero_data['creative', 'to', 'campaign'].edge_index = edge_index_creative_campaign
hetero_data['creative', 'to', 'template'].edge_index = edge_index_creative_template
hetero_data['campaign', 'to', 'landing_page'].edge_index = edge_index_campaign_landingpage

# (Optional) Add edge attributes if available
# Example: hetero_data['campaign', 'to', 'platform'].edge_attr = torch.randn(edge_index_campaign_platform.size(1), edge_attr_dim)


In [26]:
print(hetero_data)

HeteroData(
  campaign={ x=[69200, 58] },
  platform={ x=[69200, 4] },
  advertiser={ x=[69200, 51] },
  network={ x=[69200, 1] },
  channel={ x=[69200, 6] },
  keyword={ x=[69200, 50] },
  time={ x=[69200, 3] },
  creative={ x=[69200, 2] },
  template={ x=[69200, 1] },
  landing_page={ x=[69200, 50] },
  (campaign, to, platform)={ edge_index=[2, 69200] },
  (campaign, to, advertiser)={ edge_index=[2, 69200] },
  (campaign, to, network)={ edge_index=[2, 69200] },
  (campaign, to, channel)={ edge_index=[2, 69200] },
  (campaign, to, keyword)={ edge_index=[2, 69200] },
  (campaign, to, time)={ edge_index=[2, 69200] },
  (platform, to, channel)={ edge_index=[2, 69200] },
  (platform, to, time)={ edge_index=[2, 69200] },
  (platform, to, keyword)={ edge_index=[2, 69200] },
  (creative, to, campaign)={ edge_index=[2, 69200] },
  (creative, to, template)={ edge_index=[2, 69200] },
  (campaign, to, landing_page)={ edge_index=[2, 69200] }
)


In [27]:
print(hetero_data['campaign'].x.shape)  # Shape of campaign features
print(hetero_data['campaign', 'to', 'platform'].edge_index.shape)  # Shape of campaign-to-platform edges

torch.Size([69200, 58])
torch.Size([2, 69200])


In [28]:
from torch_geometric.nn import HeteroConv, GCNConv, SAGEConv

class HeterogeneousGNN(torch.nn.Module):
    def __init__(self, metadata):
        super().__init__()
        # HeteroConv layers to aggregate messages for each edge type
        self.conv1 = HeteroConv({
            ('campaign', 'to', 'platform'): SAGEConv((-1, -1), 32),
            ('campaign', 'to', 'advertiser'): SAGEConv((-1, -1), 32),
            ('campaign', 'to', 'network'): SAGEConv((-1, -1), 32),
            ('campaign', 'to', 'channel'): SAGEConv((-1, -1), 32),
            ('campaign', 'to', 'keyword'): SAGEConv((-1, -1), 32),
            ('campaign', 'to', 'time'): SAGEConv((-1, -1), 32),
            ('platform', 'to', 'channel'): SAGEConv((-1, -1), 32),
            ('campaign', 'to', 'network'): SAGEConv((-1, -1), 32),
        }, aggr='mean')  # Aggregation method

        self.conv2 = HeteroConv({
            ('campaign', 'to', 'platform'): SAGEConv((32, 32), 16),
            ('campaign', 'to', 'advertiser'): SAGEConv((32, 32), 16),
            ('campaign', 'to', 'network'): SAGEConv((32, 32), 16),
            # Add other edge types here
        }, aggr='mean')  # Aggregation method

    def forward(self, x_dict, edge_index_dict):
        # First layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}  # Apply ReLU

        # Second layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict


In [29]:
train_mask = torch.rand(hetero_data['campaign'].x.size(0)) < 0.8  # 80% training
val_mask = ~train_mask
hetero_data['campaign'].train_mask = train_mask
hetero_data['campaign'].val_mask = val_mask


In [30]:
import torch.optim as optim

model = HeterogeneousGNN(metadata=hetero_data.metadata())
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()  # Use appropriate loss for your task

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(hetero_data.x_dict, hetero_data.edge_index_dict)
    
    # Assume you are predicting something for the 'campaign' node type
    loss = criterion(out['campaign'][hetero_data['campaign'].train_mask], 
                     labels[hetero_data['campaign'].train_mask])
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch}, Loss: {loss.item()}")




AttributeError: 'NoneType' object has no attribute 'dim'

In [None]:
model.eval()
with torch.no_grad():
    out = model(hetero_data.x_dict, hetero_data.edge_index_dict)
    # Evaluation metrics on test data
    test_acc = (out['campaign'][test_mask].argmax(dim=-1) == labels[test_mask]).sum() / test_mask.sum()
    print(f"Test Accuracy: {test_acc}")
