# Temporal Graph 

## Basic Structure

Bipartite Graph
Two Types Of Nodes:
User Nodes (This will be CustomerId)- as of now we will not be adding features to this 

Product Nodes (This will be Description)- this will have a feature will be the unit price of the product

Timestep- a day 

Edge weight- Quantity 
Edge Attribute- Timestamp (Invoice Date)

So we will essentially have a quadruple like standard tkgs do
(U, E, P, T): User, Edge (Qty), Product, Timestamp




In [10]:
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json
import os

In [11]:
uk_df=pd.read_csv('data/uk_retail_data.csv')

### Preprocessing

In [12]:
# Remove missing values
initial_rows = len(uk_df)
uk_df = uk_df.dropna(subset=['CustomerID', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice'])
print(f"Removed {initial_rows - len(uk_df)} rows with missing values")

Removed 133600 rows with missing values


In [13]:
# Remove negative quantities
uk_df = uk_df[uk_df['Quantity'] > 0]

In [14]:
# Convert InvoiceDate to datetime
uk_df['InvoiceDate'] = pd.to_datetime(uk_df['InvoiceDate'])

In [6]:
uk_df

Unnamed: 0,InvoiceNo,Description,Quantity,InvoiceDate,UnitPrice,CustomerID
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01,2.55,17850.0
1,536365,WHITE METAL LANTERN,6,2010-12-01,3.39,17850.0
2,536365,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01,2.75,17850.0
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01,3.39,17850.0
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01,3.39,17850.0
...,...,...,...,...,...,...
495473,581585,FAIRY TALE COTTAGE NIGHT LIGHT,12,2011-12-09,1.95,15804.0
495474,581586,LARGE CAKE STAND HANGING STRAWBERY,8,2011-12-09,2.95,13113.0
495475,581586,SET OF 3 HANGING OWLS OLLIE BEAK,24,2011-12-09,1.25,13113.0
495476,581586,RED RETROSPOT ROUND CAKE TINS,24,2011-12-09,8.95,13113.0


In [17]:
class TemporalKnowledgeGraphBuilder:
    def __init__(self, df):
        """
        Initialize TKG builder with retail dataset
        
        Args:
            df: DataFrame with columns [InvoiceNo, Description, Quantity, InvoiceDate, UnitPrice, CustomerID]
        """
        self.df = df.copy()
        self.user_to_id = {}
        self.product_to_id = {}
        self.id_to_user = {}
        self.id_to_product = {}
        
        # TKG components
        self.quadruples = []  # (user_id, edge_weight, product_id, timestamp)
        self.user_features = None
        self.product_features = None
        
    def create_node_mappings(self):
        """Create bidirectional mappings between entities and IDs"""
        print("Creating node mappings...")
        
        # Get unique users and products
        unique_users = self.df['CustomerID'].unique()
        unique_products = self.df['Description'].unique()
        
        # Create user mappings
        self.user_to_id = {user: idx for idx, user in enumerate(unique_users)}
        self.id_to_user = {idx: user for user, idx in self.user_to_id.items()}
        
        # Create product mappings (offset by number of users for bipartite structure)
        offset = len(unique_users)
        self.product_to_id = {product: idx + offset for idx, product in enumerate(unique_products)}
        self.id_to_product = {idx: product for product, idx in self.product_to_id.items()}
        
        print(f"Created mappings for {len(unique_users)} users and {len(unique_products)} products")
        
    def create_node_features(self):
        """Create node features with better scaling"""
        print("Creating node features...")
        
        num_users = len(self.user_to_id)
        num_products = len(self.product_to_id)
        
        # User features: For now, just use random features or purchase statistics
        user_stats = self.df.groupby('CustomerID').agg({
            'Quantity': 'sum',
            'UnitPrice': 'mean',
            'InvoiceNo': 'nunique'  # Number of unique invoices
        }).reset_index()
        
        # Initialize user features matrix
        self.user_features = np.zeros((num_users, 3))  # [total_qty, avg_price, num_invoices]
        
        for _, row in user_stats.iterrows():
            user_idx = self.user_to_id[row['CustomerID']]
            self.user_features[user_idx] = [
                row['Quantity'],
                row['UnitPrice'],
                row['InvoiceNo']
            ]
        
        # Use min-max scaling instead of z-score to avoid very small values
        from sklearn.preprocessing import MinMaxScaler
        user_scaler = MinMaxScaler(feature_range=(0.1, 1.0))  # Avoid zeros
        self.user_features = user_scaler.fit_transform(self.user_features)
        
        # Product features: Unit price and popularity
        product_stats = self.df.groupby('Description').agg({
            'UnitPrice': 'mean',
            'Quantity': 'sum',
            'CustomerID': 'nunique'  # Number of unique customers
        }).reset_index()
        
        # Initialize product features matrix
        self.product_features = np.zeros((num_products, 3))  # [avg_price, total_sold, num_customers]
        
        for _, row in product_stats.iterrows():
            product_idx = self.product_to_id[row['Description']] - len(self.user_to_id)  # Remove offset for indexing
            self.product_features[product_idx] = [
                row['UnitPrice'],
                row['Quantity'],
                row['CustomerID']
            ]
        
        # Use min-max scaling for products too
        product_scaler = MinMaxScaler(feature_range=(0.1, 1.0))
        self.product_features = product_scaler.fit_transform(self.product_features)
        
        print(f"User features shape: {self.user_features.shape}")
        print(f"Product features shape: {self.product_features.shape}")

    def create_quadruples(self):
        """Create TKG quadruples with scaled quantities"""
        print("Creating temporal quadruples...")
        
        self.quadruples = []
        quantities = self.df['Quantity'].values
        
        # Scale quantities to reasonable range (log transform + min-max scaling)
        log_quantities = np.log1p(quantities)  # log(1 + x) to handle zeros
        from sklearn.preprocessing import MinMaxScaler
        qty_scaler = MinMaxScaler(feature_range=(1.0, 10.0))  # Scale to meaningful range
        scaled_quantities = qty_scaler.fit_transform(log_quantities.reshape(-1, 1)).flatten()
        
        # Store scaler for inverse transform if needed
        self.quantity_scaler = qty_scaler
        self.log_transform_used = True
        
        for i, (_, row) in enumerate(self.df.iterrows()):
            user_id = self.user_to_id[row['CustomerID']]
            product_id = self.product_to_id[row['Description']]
            quantity = scaled_quantities[i]  # Use scaled quantity
            timestamp = row['InvoiceDate']
            
            # Create quadruple: (subject, relation_weight, object, time)
            quadruple = (user_id, quantity, product_id, timestamp)
            self.quadruples.append(quadruple)
        
        # Sort by timestamp for temporal consistency
        self.quadruples.sort(key=lambda x: x[3])
        
        print(f"Created {len(self.quadruples)} temporal quadruples")
        print(f"Quantity range: {min(q[1] for q in self.quadruples):.3f} to {max(q[1] for q in self.quadruples):.3f}")
        
    def get_pytorch_geometric_format(self):
        """Convert TKG to PyTorch Geometric format"""
        
        # Create edge index (bipartite graph structure)
        edge_index = []
        edge_attr = []  # [quantity, timestamp_encoded]
        
        # Convert timestamps to numerical format (days since first transaction)
        min_date = min(q[3] for q in self.quadruples)
        
        for user_id, quantity, product_id, timestamp in self.quadruples:
            # Add edge from user to product
            edge_index.append([user_id, product_id])
            
            # Encode timestamp as days since start
            days_since_start = (timestamp - min_date).days
            edge_attr.append([quantity, days_since_start])
        
        # Convert to tensors
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(edge_attr, dtype=torch.float)
        
        # Combine user and product features
        all_node_features = np.vstack([self.user_features, self.product_features])
        node_features = torch.tensor(all_node_features, dtype=torch.float)
        
        return {
            'edge_index': edge_index,
            'edge_attr': edge_attr,
            'x': node_features,
            'num_users': len(self.user_to_id),
            'num_products': len(self.product_to_id),
            'quadruples': self.quadruples
        }
    
    def analyze_temporal_patterns(self):
        """Analyze temporal patterns in the data"""
        
        # Convert quadruples to DataFrame for analysis
        quad_df = pd.DataFrame(self.quadruples, columns=['user_id', 'quantity', 'product_id', 'timestamp'])
        
        # Temporal distribution
        quad_df['hour'] = quad_df['timestamp'].dt.hour
        quad_df['day_of_week'] = quad_df['timestamp'].dt.dayofweek
        quad_df['month'] = quad_df['timestamp'].dt.month
        
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
        # Transactions by hour
        hour_counts = quad_df['hour'].value_counts().sort_index()
        axes[0, 0].bar(hour_counts.index, hour_counts.values)
        axes[0, 0].set_title('Transactions by Hour of Day')
        axes[0, 0].set_xlabel('Hour')
        axes[0, 0].set_ylabel('Number of Transactions')
        
        # Transactions by day of week
        day_counts = quad_df['day_of_week'].value_counts().sort_index()
        day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        axes[0, 1].bar([day_names[i] for i in day_counts.index], day_counts.values)
        axes[0, 1].set_title('Transactions by Day of Week')
        axes[0, 1].set_ylabel('Number of Transactions')
        
        # Transactions over time
        daily_counts = quad_df.groupby(quad_df['timestamp'].dt.date).size()
        axes[1, 0].plot(daily_counts.index, daily_counts.values)
        axes[1, 0].set_title('Daily Transaction Volume')
        axes[1, 0].set_xlabel('Date')
        axes[1, 0].set_ylabel('Number of Transactions')
        axes[1, 0].tick_params(axis='x', rotation=45)
        
        # Quantity distribution
        axes[1, 1].hist(quad_df['quantity'], bins=50, alpha=0.7)
        axes[1, 1].set_title('Distribution of Purchase Quantities')
        axes[1, 1].set_xlabel('Quantity')
        axes[1, 1].set_ylabel('Frequency')
        
        plt.tight_layout()
        plt.show()
        
        return quad_df
    
    def build_tkg(self):
        """Main method to build the complete TKG"""
        print("Building Temporal Knowledge Graph...")
        print("=" * 50)
        
        self.create_node_mappings()
        self.create_node_features()
        self.create_quadruples()
        
        print("=" * 50)
        print("TKG Construction Complete!")
        print(f"Graph Statistics:")
        print(f"  - Users: {len(self.user_to_id)}")
        print(f"  - Products: {len(self.product_to_id)}")
        print(f"  - Temporal Edges: {len(self.quadruples)}")
        print(f"  - Time Range: {min(q[3] for q in self.quadruples)} to {max(q[3] for q in self.quadruples)}")
        return self.get_pytorch_geometric_format()
        
    def save_tkg(self, filepath='tkg_dataset'):
        """
        Save the complete TKG to disk for future use
        
        Args:
            filepath: Base path for saving files (without extension)
        """
        print(f"Saving TKG to {filepath}...")
        
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(filepath) if os.path.dirname(filepath) else '.', exist_ok=True)
        
        # Save all components
        tkg_data = {
            'quadruples': self.quadruples,
            'user_to_id': self.user_to_id,
            'product_to_id': self.product_to_id,
            'id_to_user': self.id_to_user,
            'id_to_product': self.id_to_product,
            'user_features': self.user_features,
            'product_features': self.product_features,
            'metadata': {
                'num_users': len(self.user_to_id),
                'num_products': len(self.product_to_id),
                'num_edges': len(self.quadruples),
                'created_at': datetime.now().isoformat()
            }
        }
        
        # Save as pickle (most efficient)
        with open(f'{filepath}.pkl', 'wb') as f:
            pickle.dump(tkg_data, f)
        
        # Also save mappings as JSON for human readability
        mappings = {
            'user_to_id': {str(k): v for k, v in self.user_to_id.items()},
            'product_to_id': {str(k): v for k, v in self.product_to_id.items()},
            'metadata': tkg_data['metadata']
        }
        
        with open(f'{filepath}_mappings.json', 'w') as f:
            json.dump(mappings, f, indent=2)
        
        # Save PyTorch format separately
        pyg_data = self.get_pytorch_geometric_format()
        torch.save(pyg_data, f'{filepath}_pyg.pt')
        
        print(f"TKG saved successfully!")
        print(f"  - Main data: {filepath}.pkl")
        print(f"  - Mappings: {filepath}_mappings.json") 
        print(f"  - PyG format: {filepath}_pyg.pt")
    
    @classmethod
    def load_tkg(cls, filepath='tkg_dataset'):
        """
        Load a previously saved TKG
        
        Args:
            filepath: Base path of saved files (without extension)
            
        Returns:
            TemporalKnowledgeGraphBuilder instance with loaded data
        """
        print(f"Loading TKG from {filepath}...")
        
        # Create empty instance
        instance = cls(pd.DataFrame())  # Empty df, will be overwritten
        
        # Load main data
        with open(f'{filepath}.pkl', 'rb') as f:
            tkg_data = pickle.load(f)
        
        # Restore all attributes
        instance.quadruples = tkg_data['quadruples']
        instance.user_to_id = tkg_data['user_to_id']
        instance.product_to_id = tkg_data['product_to_id']
        instance.id_to_user = tkg_data['id_to_user']
        instance.id_to_product = tkg_data['id_to_product']
        instance.user_features = tkg_data['user_features']
        instance.product_features = tkg_data['product_features']
        
        metadata = tkg_data['metadata']
        print(f"Loaded TKG created on {metadata['created_at']}")
        print(f"  - Users: {metadata['num_users']}")
        print(f"  - Products: {metadata['num_products']}")
        print(f"  - Temporal Edges: {metadata['num_edges']}")
        
        return instance
    
    def update_tkg(self, new_df, save_path=None):
        """
        Update existing TKG with new data (incremental updates)
        
        Args:
            new_df: New transaction data with same schema
            save_path: If provided, save updated TKG to this path
        """
        print("Updating TKG with new data...")
        
        # Preprocess new data
        new_df = new_df.dropna(subset=['CustomerID', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice'])
        new_df = new_df[new_df['Quantity'] > 0]
        new_df['InvoiceDate'] = pd.to_datetime(new_df['InvoiceDate'])
        
        new_quadruples = []
        new_users = set()
        new_products = set()
        
        for _, row in new_df.iterrows():
            customer_id = row['CustomerID']
            description = row['Description']
            
            # Handle new users
            if customer_id not in self.user_to_id:
                new_user_id = len(self.user_to_id)
                self.user_to_id[customer_id] = new_user_id
                self.id_to_user[new_user_id] = customer_id
                new_users.add(customer_id)
            
            # Handle new products
            if description not in self.product_to_id:
                new_product_id = len(self.user_to_id) + len(self.product_to_id)
                self.product_to_id[description] = new_product_id
                self.id_to_product[new_product_id] = description
                new_products.add(description)
            
            # Create quadruple
            user_id = self.user_to_id[customer_id]
            product_id = self.product_to_id[description]
            quadruple = (user_id, row['Quantity'], product_id, row['InvoiceDate'])
            new_quadruples.append(quadruple)
        
        # Add new quadruples and re-sort
        self.quadruples.extend(new_quadruples)
        self.quadruples.sort(key=lambda x: x[3])
        
        # Update features (this is simplified - you might want more sophisticated updates)
        if new_users or new_products:
            print("Recalculating features for updated graph...")
            # For simplicity, recalculate all features
            # In production, you'd want incremental feature updates
            combined_df = pd.concat([self.df, new_df]) if hasattr(self, 'df') else new_df
            self.df = combined_df
            self.create_node_features()
        
        print(f"Added {len(new_quadruples)} new edges")
        print(f"New users: {len(new_users)}, New products: {len(new_products)}")
        
        if save_path:
            self.save_tkg(save_path)
    
    @staticmethod
    def load_pytorch_format(filepath='tkg_dataset_pyg.pt'):
        """Quick load just the PyTorch Geometric format"""
        return torch.load(filepath, weights_only=False)

In [18]:
# Example usage with persistence:

tkg_builder = TemporalKnowledgeGraphBuilder(uk_df)
tkg_data = tkg_builder.build_tkg()

# Save for future use
tkg_builder.save_tkg('graph/my_retail_tkg')
"""
# Later sessions - Just load the saved TKG
tkg_builder = TemporalKnowledgeGraphBuilder.load_tkg('my_retail_tkg')
tkg_data = tkg_builder.get_pytorch_geometric_format()

# Or quickly load just PyG format
pyg_data = TemporalKnowledgeGraphBuilder.load_pytorch_format('my_retail_tkg_pyg.pt')

# Update with new data
new_transactions = pd.read_csv('new_data.csv')
tkg_builder.update_tkg(new_transactions, save_path='my_retail_tkg_updated')

print("TKG ready for temporal recommendation modeling!")
"""

Building Temporal Knowledge Graph...
Creating node mappings...
Created mappings for 3921 users and 3844 products
Creating node features...
User features shape: (3921, 3)
Product features shape: (3844, 3)
Creating temporal quadruples...
Created 354345 temporal quadruples
Quantity range: 1.000 to 10.000
TKG Construction Complete!
Graph Statistics:
  - Users: 3921
  - Products: 3844
  - Temporal Edges: 354345
  - Time Range: 2010-12-01 00:00:00 to 2011-12-09 00:00:00
Saving TKG to graph/my_retail_tkg...
TKG saved successfully!
  - Main data: graph/my_retail_tkg.pkl
  - Mappings: graph/my_retail_tkg_mappings.json
  - PyG format: graph/my_retail_tkg_pyg.pt


'\n# Later sessions - Just load the saved TKG\ntkg_builder = TemporalKnowledgeGraphBuilder.load_tkg(\'my_retail_tkg\')\ntkg_data = tkg_builder.get_pytorch_geometric_format()\n\n# Or quickly load just PyG format\npyg_data = TemporalKnowledgeGraphBuilder.load_pytorch_format(\'my_retail_tkg_pyg.pt\')\n\n# Update with new data\nnew_transactions = pd.read_csv(\'new_data.csv\')\ntkg_builder.update_tkg(new_transactions, save_path=\'my_retail_tkg_updated\')\n\nprint("TKG ready for temporal recommendation modeling!")\n'

In [16]:
tkg_builder = TemporalKnowledgeGraphBuilder.load_tkg('graph/my_retail_tkg')
tkg_data = tkg_builder.get_pytorch_geometric_format()

Loading TKG from graph/my_retail_tkg...


FileNotFoundError: [Errno 2] No such file or directory: 'graph/my_retail_tkg.pkl'

In [None]:
pyg_data = torch.load('graph/my_retail_tkg_pyg.pt', weights_only=False)