In [68]:
import pandas as pd
import networkx as nx
import torch
import numpy as np
from torch_geometric.data import Data

# Load datasets
train = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/train.csv")
users = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/users.csv")
user_friends = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/user_friends.csv")
events = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/events.csv")
event_attendees = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/event_attendees.csv")

# Debugging: Check if datasets are loaded correctly
print("Train dataset sample:\n", train.head())
print("Users dataset sample:\n", users.head())
print("User friends dataset sample:\n", user_friends.head())
print("Events dataset sample:\n", events.head())
print("Event attendees dataset sample:\n", event_attendees.head())

# Create a graph structure
G = nx.Graph()

# Add user nodes
for user_id in users['user_id']:
    G.add_node(user_id, type='user')

# Add event nodes
for event_id in events['event_id']:
    G.add_node(event_id, type='event')

# Debugging: Check if nodes are added correctly
print("Number of user nodes:", len([n for n, attr in G.nodes(data=True) if attr['type'] == 'user']))
print("Number of event nodes:", len([n for n, attr in G.nodes(data=True) if attr['type'] == 'event']))

# Add edges for user friendships
for _, row in user_friends.iterrows():
    user = row['user']
    friends = row['friends'].split()
    for friend in friends:
        if friend in G:  # Check if friend exists in the graph
            G.add_edge(user, friend, type='friendship')
        else:
            print(f"Warning: Friend {friend} not found in the graph.")

# Debugging: Check if friendship edges are added correctly
print("Number of friendship edges:", len([e for e in G.edges(data=True) if e[2]['type'] == 'friendship']))

# Add edges for event attendance
for _, row in event_attendees.iterrows():
    event_id = row['event_id']
    for user_id in row['yes'].split():
        if user_id in G:  # Check if user exists in the graph
            G.add_edge(user_id, event_id, type='attended')
        else:
            print(f"Warning: User {user_id} not found in the graph.")

# Debugging: Check if attendance edges are added correctly
print("Number of attendance edges:", len([e for e in G.edges(data=True) if e[2]['type'] == 'attended']))

# Create a mapping from node IDs to integer indices
node_id_to_idx = {node_id: idx for idx, node_id in enumerate(G.nodes())}

# Debugging: Check the mapping
print("Sample node ID to index mapping:", list(node_id_to_idx.items())[:10])

# Convert edges to integer indices
edges = []
for u, v in G.edges():
    try:
        edges.append((node_id_to_idx[u], node_id_to_idx[v]))
    except KeyError as e:
        print(f"Error: Node {e} not found in node_id_to_idx mapping.")

# Debugging: Check edges
print("Number of edges:", len(edges))
print("Sample edges:", edges[:10])  # Print first 10 edges

# Convert edges to PyTorch tensor
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

# Debugging: Check edge_index
print("Edge index shape:", edge_index.shape)
print("Sample edge index:", edge_index[:, :10])  # Print first 10 edges

# Example: Create random features for each node
num_nodes = len(G.nodes())
num_features = 10  # Number of features per node
node_features = np.random.rand(num_nodes, num_features)  # Random features for testing

# Convert to PyTorch tensor
x = torch.tensor(node_features, dtype=torch.float)

# Debugging: Check node features
print("Node features shape:", x.shape)
print("Sample node features:", x[:5])  # Print features for the first 5 nodes

# Example: Create random labels for each node (for testing)
labels = np.random.randint(0, 2, size=num_nodes)  # Binary labels (0 or 1)

# Convert to PyTorch tensor
y = torch.tensor(labels, dtype=torch.long)

# Debugging: Check labels
print("Labels shape:", y.shape)
print("Sample labels:", y[:10])  # Print labels for the first 10 nodes

# Create the Data object
data = Data(x=x, edge_index=edge_index, y=y)

# Debugging: Check the Data object
print("PyTorch Geometric Data object:\n", data)
print("Number of nodes:", data.num_nodes)
print("Number of edges:", data.num_edges)
print("Number of features:", data.num_features)

Train dataset sample:
     user  event  invited             timestamp
0  u_254   e_63        0  2025-03-22T00:48:19Z
1  u_642  e_380        0  2025-05-23T08:54:38Z
2  u_186   e_78        0  2025-05-05T07:20:01Z
3  u_642  e_462        0  2025-03-19T19:49:09Z
4   u_93  e_458        0  2025-03-27T04:59:10Z
Users dataset sample:
   user_id locale  birthyear  gender              joinedAt            location  \
0     u_0  en_US       1971  female  2024-01-26T01:29:03Z         New Stephen   
1     u_1  fr_FR       1978    male  2021-02-14T05:38:09Z         Johnsonstad   
2     u_2  en_US       2007  female  2021-10-25T06:27:01Z  New Christinemouth   
3     u_3  en_US       1975    male  2022-02-13T03:33:48Z   West Garrettville   
4     u_4  en_US       2005    male  2022-04-10T06:02:57Z          Walkerbury   

   timezone  
0      -219  
1       665  
2      -655  
3      -244  
4       746  
User friends dataset sample:
   user                                            friends
0  u_0  u_521

AttributeError: 'float' object has no attribute 'split'

In [59]:
from torch_geometric.nn import TransformerConv
import torch.nn.functional as F

class GraphTransformer(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels, num_classes, heads=8):
        super(GraphTransformer, self).__init__()
        self.conv1 = TransformerConv(num_node_features, hidden_channels, heads=heads)
        self.conv2 = TransformerConv(hidden_channels * heads, hidden_channels, heads=heads)
        self.conv3 = TransformerConv(hidden_channels * heads, num_classes, heads=1)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize the Graph Transformer model
model = GraphTransformer(num_node_features=10, hidden_channels=32, num_classes=2, heads=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [60]:
from torch_geometric.data import DataLoader

# Split data into training and test sets
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:int(0.8 * data.num_nodes)] = True  # 80% training, 20% test
test_mask = ~train_mask

data.train_mask = train_mask
data.test_mask = test_mask

# Debugging: Check the split
print("Training nodes:", data.train_mask.sum().item())
print("Test nodes:", data.test_mask.sum().item())

Training nodes: 720
Test nodes: 180


In [61]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    correct = pred[data.test_mask] == data.y[data.test_mask]
    accuracy = int(correct.sum()) / int(data.test_mask.sum())
    return accuracy

# Training loop
for epoch in range(200):
    loss = train()
    if epoch % 10 == 0:
        accuracy = test()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Test Accuracy: {accuracy:.4f}')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (900x5 and 10x256)

In [5]:
# Set the model to evaluation mode
model.eval()

# Predict probabilities for all nodes
with torch.no_grad():
    out = model(data.x, data.edge_index)
    pred_probs = torch.exp(out)  # Convert log probabilities to probabilities

# Debugging: Check predictions
print("Sample predicted probabilities:", pred_probs[:5])

Sample predicted probabilities: tensor([[8.3819e-01, 1.6181e-01],
        [5.5904e-02, 9.4410e-01],
        [6.4056e-05, 9.9994e-01],
        [6.1925e-02, 9.3808e-01],
        [4.6900e-02, 9.5310e-01]])


In [6]:
def recommend_events(user_id, top_k=5):
    user_idx = node_id_to_idx[user_id]  # Convert user ID to index
    event_indices = [idx for idx, node_id in enumerate(G.nodes()) if G.nodes[node_id]['type'] == 'event']

    # Get predicted probabilities for events
    event_probs = pred_probs[event_indices, 1]  # Probability of being interested (class 1)

    # Sort events by probability
    top_event_indices = event_probs.argsort(descending=True)[:top_k]
    top_event_ids = [list(G.nodes())[idx] for idx in top_event_indices]

    return top_event_ids

# Example: Recommend events for a user
user_id = "qzb9nlUtAffT"  # Replace with a valid user ID from your dataset
recommended_events = recommend_events(user_id, top_k=5)
print(f"Recommended events for user {user_id}: {recommended_events}")

KeyError: 'qzb9nlUtAffT'

In [72]:
def precision_at_k(user_id, recommended_events, k=5):
    # Get ground truth (events the user is interested in)
    ground_truth = set([event_id for event_id in G.neighbors(user_id) if G.nodes[event_id]['type'] == 'event'])

    # Calculate precision@k
    relevant = set(recommended_events[:k]).intersection(ground_truth)
    return len(relevant) / k

# Example: Evaluate precision@5 for a user
precision = precision_at_k(user_id, recommended_events, k=5)
print(f"Precision@5 for user {user_id}: {precision:.4f}")

Precision@5 for user qzb9nlUtAffT: 0.0000


In [73]:
def evaluate_recommendations(k=5):
    precisions = []
    for user_id in users['user_id']:
        if user_id in node_id_to_idx:  # Ensure the user is in the graph
            recommended_events = recommend_events(user_id, top_k=k)
            precision = precision_at_k(user_id, recommended_events, k=k)
            precisions.append(precision)
    return np.mean(precisions)

# Evaluate the recommendation system
avg_precision = evaluate_recommendations(k=5)
print(f"Average Precision@5: {avg_precision:.4f}")

Average Precision@5: 0.0000


In [65]:
import pandas as pd
import networkx as nx
import torch
import numpy as np
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler
from torch_geometric.nn import TransformerConv
import torch.nn.functional as F
from torch_geometric.data import DataLoader

# Load datasets
train = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/train.csv")
users = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/users.csv")
user_friends = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/social_connections.csv")  # Corrected file name
events = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/events.csv")
event_attendees = pd.read_csv("C:/Users/Mahesh/OneDrive/Desktop/Hackathons/D2K/Dataset/event_attendees.csv")

# Debugging: Check if datasets are loaded correctly
print("Train dataset sample:\n", train.head())
print("Users dataset sample:\n", users.head())
print("User friends dataset sample:\n", user_friends.head())
print("Events dataset sample:\n", events.head())
print("Event attendees dataset sample:\n", event_attendees.head())

# Create a graph structure
G = nx.Graph()

# Add user nodes
for _, row in users.iterrows():
    G.add_node(row['user_id'], type='user', **row.to_dict())

# Add event nodes with available attributes
for _, row in events.iterrows():
    event_id = row['event_id']
    event_attrs = {
        'type': 'event',
        'user_id': row['user_id'],
        'start_time': row['start_time'],
        'city': row['city'],
        'state': row['state'],
        'zip': row['zip'],
        'country': row['country'],
        'lat': row['lat'],
        'lng': row['lng'],
        'popularity': row.get('count_other', 0)  # Use 'count_other' as a proxy for popularity
    }
    G.add_node(event_id, **event_attrs)

# Debugging: Check if nodes are added correctly
print("Number of user nodes:", len([n for n, attr in G.nodes(data=True) if attr['type'] == 'user']))
print("Number of event nodes:", len([n for n, attr in G.nodes(data=True) if attr['type'] == 'event']))

# Add edges for user friendships
for _, row in user_friends.iterrows():
    user_a = row['user_a']  # First user in the connection
    user_b = row['user_b']  # Second user in the connection
    connection_type = row['connection_type']  # Type of connection (e.g., friend, close_friend)
    connection_strength = row['connection_strength']  # Strength of the connection
    
    # Check if both users exist in the graph
    if user_a in G and user_b in G:
        G.add_edge(user_a, user_b, type='friendship', connection_type=connection_type, connection_strength=connection_strength)
    else:
        print(f"Warning: User {user_a} or {user_b} not found in the graph.")

# Debugging: Check if friendship edges are added correctly
print("Number of friendship edges:", len([e for e in G.edges(data=True) if e[2]['type'] == 'friendship']))

# Add edges for event attendance
for _, row in event_attendees.iterrows():
    event_id = row['event_id']
    for user_id in row['yes'].split():
        if user_id in G:  # Check if user exists in the graph
            G.add_edge(user_id, event_id, type='attended')
        else:
            print(f"Warning: User {user_id} not found in the graph.")

# Debugging: Check if attendance edges are added correctly
print("Number of attendance edges:", len([e for e in G.edges(data=True) if e[2]['type'] == 'attended']))

# Extract and display nodes
print("\nNodes in the graph:")
for node, attr in G.nodes(data=True):
    print(f"Node ID: {node}, Attributes: {attr}")

# Extract and display edges
print("\nEdges in the graph:")
for u, v, attr in G.edges(data=True):
    print(f"Edge: ({u}, {v}), Attributes: {attr}")

# Create a mapping from node IDs to integer indices
node_id_to_idx = {node_id: idx for idx, node_id in enumerate(G.nodes())}

# Debugging: Check the mapping
print("\nSample node ID to index mapping:", list(node_id_to_idx.items())[:10])

# Convert edges to integer indices
edges = []
for u, v in G.edges():
    try:
        edges.append((node_id_to_idx[u], node_id_to_idx[v]))
    except KeyError as e:
        print(f"Error: Node {e} not found in node_id_to_idx mapping.")

# Debugging: Check edges
print("\nNumber of edges:", len(edges))
print("Sample edges:", edges[:10])  # Print first 10 edges

# Convert edges to PyTorch tensor
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

# Debugging: Check edge_index
print("\nEdge index shape:", edge_index.shape)
print("Sample edge index:", edge_index[:, :10])  # Print first 10 edges

# Create meaningful node features
def create_node_features(G, node_id_to_idx):
    num_nodes = len(G.nodes())
    num_features = 5  # Number of features per node
    node_features = np.zeros((num_nodes, num_features))
    
    for node_id, idx in node_id_to_idx.items():
        node_data = G.nodes[node_id]
        if node_data['type'] == 'user':
            # Example features for users
            node_features[idx, 0] = node_data.get('social_affinity', 0)
            node_features[idx, 1] = len(list(G.neighbors(node_id)))  # Degree
            node_features[idx, 2] = node_data.get('birthyear', 1990)  # Birthyear
            node_features[idx, 3] = 1 if node_data.get('gender', 'M') == 'M' else 0  # Gender
            node_features[idx, 4] = len(node_data.get('interests', []))  # Number of interests
        elif node_data['type'] == 'event':
            # Example features for events (without category)
            node_features[idx, 0] = node_data.get('popularity', 0)  # Popularity
            node_features[idx, 1] = len(list(G.neighbors(node_id)))  # Number of attendees
            node_features[idx, 2] = 0  # Placeholder for future features
            node_features[idx, 3] = 0  # Placeholder for future features
            node_features[idx, 4] = 0  # Placeholder for future features
    
    # Normalize features
    scaler = StandardScaler()
    node_features = scaler.fit_transform(node_features)
    
    return node_features

# Create node features
node_features = create_node_features(G, node_id_to_idx)

# Convert to PyTorch tensor
x = torch.tensor(node_features, dtype=torch.float)

# Debugging: Check node features
print("\nNode features shape:", x.shape)
print("Sample node features:", x[:5])  # Print features for the first 5 nodes

# Example: Create labels for each node (for testing)
labels = np.zeros(len(G.nodes()))
for node_id, idx in node_id_to_idx.items():
    if G.nodes[node_id]['type'] == 'user':
        labels[idx] = 1 if len(list(G.neighbors(node_id))) > 0 else 0
    elif G.nodes[node_id]['type'] == 'event':
        labels[idx] = 1 if len(list(G.neighbors(node_id))) > 0 else 0

# Convert to PyTorch tensor
y = torch.tensor(labels, dtype=torch.long)

# Debugging: Check labels
print("\nLabels shape:", y.shape)
print("Sample labels:", y[:10])  # Print labels for the first 10 nodes

# Create the Data object
data = Data(x=x, edge_index=edge_index, y=y)

# Debugging: Check the Data object
print("\nPyTorch Geometric Data object:\n", data)
print("Number of nodes:", data.num_nodes)
print("Number of edges:", data.num_edges)
print("Number of features:", data.num_features)

class GraphTransformer(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels, num_classes, heads=8):
        super(GraphTransformer, self).__init__()
        self.conv1 = TransformerConv(num_node_features, hidden_channels, heads=heads)
        self.conv2 = TransformerConv(hidden_channels * heads, hidden_channels, heads=heads)
        self.conv3 = TransformerConv(hidden_channels * heads, num_classes, heads=1)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize the Graph Transformer model
model = GraphTransformer(num_node_features=5, hidden_channels=32, num_classes=2, heads=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Split data into training and test sets
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[:int(0.8 * data.num_nodes)] = True  # 80% training, 20% test
test_mask = ~train_mask

data.train_mask = train_mask
data.test_mask = test_mask

# Debugging: Check the split
print("Training nodes:", data.train_mask.sum().item())
print("Test nodes:", data.test_mask.sum().item())

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    correct = pred[data.test_mask] == data.y[data.test_mask]
    accuracy = int(correct.sum()) / int(data.test_mask.sum())
    return accuracy

# Training loop
for epoch in range(200):
    loss = train()
    if epoch % 10 == 0:
        accuracy = test()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Test Accuracy: {accuracy:.4f}')

# Set the model to evaluation mode
model.eval()

# Predict probabilities for all nodes
with torch.no_grad():
    out = model(data.x, data.edge_index)
    pred_probs = torch.exp(out)  # Convert log probabilities to probabilities

# Debugging: Check predictions
print("Sample predicted probabilities:", pred_probs[:5])

def recommend_events(user_id, top_k=5):
    user_idx = node_id_to_idx[user_id]  # Convert user ID to index
    event_indices = [idx for idx, node_id in enumerate(G.nodes()) if G.nodes[node_id]['type'] == 'event']

    # Get predicted probabilities for events
    event_probs = pred_probs[event_indices, 1]  # Probability of being interested (class 1)

    # Sort events by probability
    top_event_indices = event_probs.argsort(descending=True)[:top_k]
    top_event_ids = [list(G.nodes())[idx] for idx in top_event_indices]

    return top_event_ids

# Example: Recommend events for a user
user_id = "qzb9nlUtAffT"  # Replace with a valid user ID from your dataset
recommended_events = recommend_events(user_id, top_k=5)
print(f"Recommended events for user {user_id}: {recommended_events}")

def precision_at_k(user_id, recommended_events, k=5):
    # Get ground truth (events the user is interested in)
    ground_truth = set([event_id for event_id in G.neighbors(user_id) if G.nodes[event_id]['type'] == 'event'])

    # Calculate precision@k
    relevant = set(recommended_events[:k]).intersection(ground_truth)
    return len(relevant) / k

# Example: Evaluate precision@5 for a user
precision = precision_at_k(user_id, recommended_events, k=5)
print(f"Precision@5 for user {user_id}: {precision:.4f}")

def evaluate_recommendations(k=5):
    precisions = []
    for user_id in users['user_id']:
        if user_id in node_id_to_idx:  # Ensure the user is in the graph
            recommended_events = recommend_events(user_id, top_k=k)
            precision = precision_at_k(user_id, recommended_events, k=k)
            precisions.append(precision)
    return np.mean(precisions)

# Evaluate the recommendation system
avg_precision = evaluate_recommendations(k=5)
print(f"Average Precision@5: {avg_precision:.4f}")

# Check the distribution of the 'interested' column in the train dataset
print(train['interested'].value_counts())

Train dataset sample:
            user         event  social_influence  common_interests  \
0  LwlQ1Zv7TDId  Xz0aqXRdExtY          0.000000                 1   
1  YmJcYM9goaWk  0yfYHDNfifCN          0.000000                 0   
2  Nwgr1KF69q18  kHoGHOGaUFDd          0.000000                 0   
3  uFgY2JCp1wgi  CToggmgWlfjl          0.000000                 0   
4  8daecc1NTCzV  f5tLDjocYO8Y          1.365827                 1   

   days_until_event  user_social_affinity  event_popularity  \
0                65              0.103775                11   
1               340              0.286612                 8   
2               138              0.274383                 5   
3               171              0.276036                 9   
4               240              0.248358                16   

   creator_popularity  temporal_weight  interested  not_interested  
0                  12         0.897216           0               1  
1                  19         0.999988       

KeyError: 'qzb9nlUtAffT'