In [56]:
import pandas as pd

# Load your raw dataset (assuming it's a CSV)
df = pd.read_csv('../Datasets/processed_data.csv')
print(df.head())


   campaign_item_id  no_of_days  ext_service_id  creative_id  template_id  \
0              2733    0.059322             128         1000         90.0   
1              2733    0.067797              16         1000         90.0   
2              2733    0.076271             128         1000         90.0   
3              2733    0.084746             128         1000         90.0   
4              2733    0.093220               4         1000         90.0   

   advertiser_id  network_id  channel_id  campaign_budget_usd  clicks  ...  \
0           4756         190          32             0.201396       8  ...   
1           4756         190           8             0.201396      44  ...   
2           4756         190           8             0.201396      32  ...   
3           4756         190          64             0.201396      48  ...   
4           4756         190          32             0.201396      20  ...   

   advertiser_emb_40  advertiser_emb_41  advertiser_emb_42  advertis

### Getting all the relevent columns to the relevent nodes

In [65]:
# Campaign Node - Collect features for the campaign node
campaign_features = df[['campaign_item_id'] + [col for col in df.columns if col.startswith('zone_')] + [col for col in df.columns if col.startswith('search_tag_emb_')]]

# Platform Node - Collect features for the platform node 
platform_features = df[['ext_service_id'] + [col for col in df.columns if col.startswith('plateform_')]]

# Advertiser Node - Collect features for the advertiser node 
advertiser_features = df[['advertiser_id'] + [col for col in df.columns if col.startswith('advertiser_emb_')]]

# Creative Node - Collect features for the creative node 
creative_features = df[['creative_id','creative_dimension']]

# Template Node - Collect features for the template node
template_features = df[['template_id']]

# Channel Node - Collect features for the channel node
channel_features = df[[col for col in df.columns if col.startswith('channel_')]]

# Network Node - Collect features for the network node
network_features = df[['network_id']]

# Landing Page Node - Collect features for the landing page node 
landing_page_features = df[[col for col in df.columns if col.startswith('landing_page_emb_')]]

# Time Node - Collect features for the time node (day_of_week, month, day_of_year, etc.)
time_features = df[['day_of_week', 'month', 'day_of_year']]

# Keyword Node - Collect features for the keyword node (keyword embeddings)
keyword_features = df[[col for col in df.columns if col.startswith('keyword_')]]

print(template_features)

       template_id
0             90.0
1             90.0
2             90.0
3             90.0
4             90.0
...            ...
69195         90.0
69196         90.0
69197         90.0
69198         90.0
69199         90.0

[69200 rows x 1 columns]


#### Creating id's for nodes which dont have id

In [58]:
# To uniquely identify each keyword embedding, we'll hash the rows
df['keyword_id'] = keyword_features.apply(lambda row: hash(tuple(row)), axis=1)

In [59]:
# To uniquely identify each keyword embedding, we'll hash the rows
df['landing_page_id'] = landing_page_features.apply(lambda row: hash(tuple(row)), axis=1)

In [60]:
# Create a unique identifier for each time combination
df['time_id'] = df['day_of_week'].astype(str) + '_' + df['month'].astype(str) + '_' + df['day_of_year'].astype(str)

In [66]:
print(type(channel_features['channel_id']))


<class 'pandas.core.series.Series'>


In [67]:
print(channel_features.columns)


Index(['channel_id', 'channel_Display', 'channel_Mobile', 'channel_Search',
       'channel_Social', 'channel_Video'],
      dtype='object')


#### Building the graph structure

In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

# Creating the Nodes    
campaign = torch.tensor(campaign_features.values, dtype=torch.float)  # node features for Campaign
platform = torch.tensor(platform_features.values, dtype=torch.float)  # node features for Platform
advertiser = torch.tensor(advertiser_features.values, dtype=torch.float)  # node features for Advertiser
creative = torch.tensor(creative_features.values, dtype=torch.float)  # node features for Creative
template = torch.tensor(template_features.values, dtype=torch.float) # node features for  template
channel = torch.tensor(channel_features.values, dtype=torch.float)  # node features for channel
time = torch.tensor(time_features.values, dtype=torch.float)   # node features for Time
keyword = torch.tensor(keyword_features.values, dtype=torch.float)
network = torch.tensor(network_features.values, dtype=torch.float)  # node features for Network
landing_page = torch.tensor(landing_page_features.values, dtype=torch.float)

# MAKING CONNECTION BETWEEN NODES
# Mapping IDs to indices for each node type
campaign_id_to_index = {cid: idx for idx, cid in enumerate(campaign_features['campaign_item_id'].unique())}
platform_id_to_index = {pid: idx for idx, pid in enumerate(platform_features['ext_service_id'].unique())}
advertiser_id_to_index = {aid: idx for idx, aid in enumerate(advertiser_features['advertiser_id'].unique())}
network_id_to_index = {nid: idx for idx, nid in enumerate(network_features['network_id'].unique())}
creative_id_to_index = {crid: idx for idx, crid in enumerate(creative_features['creative_id'].unique())}
template_id_to_index = {tid: idx for idx, tid in enumerate(template_features['template_id'].unique())}
channel_id_to_index = {chid: idx for idx, chid in enumerate(channel_features['channel_id'].unique())}
keyword_id_to_index = {kid: idx for idx, kid in enumerate(df['keyword_id'].unique())}
landing_page_id_to_index = {lid: idx for idx, lid in enumerate(df['landing_page_id'].unique())}
time_id_to_index = {tid: idx for idx, tid in enumerate(df['time_id'].unique())}


# CREATING EDGES

# Campaign -> Platform
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['ext_service_id'].map(platform_id_to_index).values
edge_index_campaign_platform = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Advertiser
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['advertiser_id'].map(advertiser_id_to_index).values
edge_index_campaign_advertiser = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Network
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['network_id'].map(network_id_to_index).values
edge_index_campaign_network = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Channel
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['channel_id'].map(channel_id_to_index).values
edge_index_campaign_channel = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Keyword
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['keyword_id'].map(keyword_id_to_index).values
edge_index_campaign_keyword = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> Time
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['time_id'].map(time_id_to_index).values
edge_index_campaign_time = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Platform -> Channel
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['channel_id'].map(channel_id_to_index).values
edge_index_platform_channel = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Platform -> Time
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['time_id'].map(time_id_to_index).values
edge_index_platform_time = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Plateform -> Keyword
source_nodes = df['ext_service_id'].map(platform_id_to_index).values
target_nodes = df['keyword_id'].map(keyword_id_to_index).values
edge_index_campaign_keyword = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Creative -> Campaign
source_nodes = df['creative_id'].map(creative_id_to_index).values
target_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
edge_index_creative_campaign = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Creative -> Template
source_nodes = df['creative_id'].map(creative_id_to_index).values
target_nodes = df['template_id'].map(template_id_to_index).values
edge_index_creative_template = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Campaign -> LandingPage
source_nodes = df['campaign_item_id'].map(campaign_id_to_index).values
target_nodes = df['landing_page_id'].map(landing_page_id_to_index).values
edge_index_campaign_landingpage = torch.tensor([source_nodes, target_nodes], dtype=torch.long)



In [77]:
edge_features_campaign_platform = df[['no_of_days', 'media_cost_usd'] + [col for col in df.columns if col.startswith('weekday_week_')]]
edge_features_campaign_channel = df[['media_cost_usd','normalized_impressions']]
edge_features_campaign_clicks = df[['clicks','campaign_budget_usd']]


       clicks  campaign_budget_usd
0           8             0.201396
1          44             0.201396
2          32             0.201396
3          48             0.201396
4          20             0.201396
...       ...                  ...
69195      12             0.113102
69196       7             0.113102
69197      25             0.113102
69198      11             0.113102
69199       3             0.113102

[69200 rows x 2 columns]


In [82]:
print(campaign.shape)
print(platform.shape)
print(advertiser.shape)
print(creative.shape)
print(time.shape)
print(channel.shape)
print(network.shape)
print(landing_page.shape)
print(keyword.shape)



torch.Size([69200, 58])
torch.Size([69200, 4])
torch.Size([69200, 51])
torch.Size([69200, 2])
torch.Size([69200, 3])
torch.Size([69200, 6])
torch.Size([69200, 1])
torch.Size([69200, 50])
torch.Size([69200, 51])


In [83]:
import torch
import torch.nn.functional as F

# Define maximum feature dimension (based on the largest tensor)
max_dim = max(campaign.shape[1], platform.shape[1], advertiser.shape[1], creative.shape[1], 
              time.shape[1], channel.shape[1], network.shape[1], landing_page.shape[1], keyword.shape[1])

def pad_tensor(tensor, max_dim):
    """Pads tensor to match max_dim."""
    current_dim = tensor.shape[1]
    if current_dim < max_dim:
        padding_size = max_dim - current_dim
        # Pad the tensor with zeros
        tensor = F.pad(tensor, (0, padding_size), value=0)
    return tensor

# Padding the feature tensors
campaign = pad_tensor(campaign, max_dim)
platform = pad_tensor(platform, max_dim)
advertiser = pad_tensor(advertiser, max_dim)
creative = pad_tensor(creative, max_dim)
time = pad_tensor(time, max_dim)
channel = pad_tensor(channel, max_dim)
network = pad_tensor(network, max_dim)
landing_page = pad_tensor(landing_page, max_dim)
keyword = pad_tensor(keyword, max_dim)

# Check that all tensors have the same dimension
print(campaign.shape)
print(platform.shape)
print(advertiser.shape)
print(creative.shape)
print(time.shape)
print(channel.shape)
print(network.shape)
print(landing_page.shape)
print(keyword.shape)

# Now you can safely concatenate them
node_features = torch.cat([campaign, platform, advertiser, creative, time, channel, network, landing_page, keyword], dim=0)

print(node_features.shape)


torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([69200, 58])
torch.Size([622800, 58])


In [87]:
import torch
import torch.nn.functional as F

# Function to pad edge features to the maximum number of columns
def pad_edge_features(*edge_features):
    max_columns = max([feature.shape[1] for feature in edge_features])  # Find the max column size
    padded_edge_features = []
    
    for feature in edge_features:
        if feature.shape[1] < max_columns:
            # Padding with zeros (0) to match the max column size
            padding_size = max_columns - feature.shape[1]
            padded_feature = F.pad(feature, (0, padding_size), value=0)  # Pad only along the last dimension (columns)
            padded_edge_features.append(padded_feature)
        else:
            padded_edge_features.append(feature)
    
    return padded_edge_features

# Convert edge features to tensors if they are DataFrames or Series
edge_attr_campaign_platform = torch.tensor(edge_features_campaign_platform.values, dtype=torch.float) if isinstance(edge_features_campaign_platform, pd.DataFrame) else edge_attr_campaign_platform
edge_attr_campaign_channel = torch.tensor(edge_features_campaign_channel.values, dtype=torch.float) if isinstance(edge_features_campaign_channel, pd.DataFrame) else edge_features_campaign_channel
edge_attr_campaign_clicks = torch.tensor(edge_features_campaign_clicks.values, dtype=torch.float) if isinstance(edge_features_campaign_clicks, pd.DataFrame) else edge_features_campaign_clicks

# Pad edge features to have the same number of columns
edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks = pad_edge_features(
    edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks
)

# Concatenate edge features after padding
edge_attr = torch.cat([edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks], dim=0)

# Check the shape of concatenated edge features
print(edge_attr.shape)


torch.Size([207600, 4])


In [88]:
from torch_geometric.data import Data

# Define edge_index (source and target nodes)
edge_index = torch.cat([edge_index_campaign_platform, edge_index_campaign_advertiser, edge_index_creative_campaign,
                        edge_index_campaign_time, edge_index_campaign_channel, edge_index_campaign_network,
                        edge_index_campaign_landingpage, edge_index_campaign_keyword], dim=1)  # Add all edge indices

# Define edge attributes (edge features)
edge_attr = torch.cat([edge_attr_campaign_platform, edge_attr_campaign_channel, edge_attr_campaign_clicks ], dim=0)

# Combine node features into one tensor
node_features = torch.cat([campaign, platform, advertiser, creative, time, channel, network, landing_page, keyword], dim=0)

# Create the data object
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)


In [89]:
import torch.nn as nn
from torch_geometric.nn import GCNConv

class GNNModel(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(in_channels, 64)  # First GCN layer
        self.conv2 = GCNConv(64, out_channels)  # Second GCN layer

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_attr)  # Apply the first GCN layer
        x = x.relu()  # Apply ReLU activation
        x = self.conv2(x, edge_index, edge_attr)  # Apply the second GCN layer
        return x  # Output node embeddings (or logits)


In [92]:
import torch
import torch.optim as optim
import torch.nn as nn

# Initialize the model
model = GNNModel(in_channels=node_features.shape[1], out_channels=1)  # Single output for regression (number of clicks)

# Define optimizer and loss function for regression
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()  # Use Mean Squared Error (MSE) for regression

# Training loop
num_epochs = 100  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    out = model(data)  # Pass the data to the model
    
    # Assuming you're predicting number of clicks, use the labels in data.y
    loss = criterion(out.squeeze(), data.y)  # 'data.y' contains the target labels (number of clicks)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    # Print the loss every few epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')


IndexError: The shape of the mask [553600] at index 0 does not match the shape of the indexed tensor [207600, 4] at index 0

In [93]:
print(f"Shape of tensor: {tensor.shape}")
print(f"Shape of mask: {mask.shape}")

NameError: name 'tensor' is not defined