In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn.functional as F
# PyTorch Geometric is a library built upon PyTorch for graph neural networks.
# You might need to install it: pip install torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool

# --- 1. Load the V1.3 Graph Component Files ---
data_dir = '../Data'
output_version = "V1.3"

try:
    print("--- Loading V1.3 Graph Components ---")
    df_cell_nodes = pd.read_csv(os.path.join(data_dir, f"graph_cell_node_features_{output_version}.csv"))
    df_cell_edges = pd.read_csv(os.path.join(data_dir, f"graph_cell_edge_list_{output_version}.csv"))
    df_sector_nodes = pd.read_csv(os.path.join(data_dir, f"graph_sector_node_features_{output_version}.csv"))
    df_sector_edges = pd.read_csv(os.path.join(data_dir, f"graph_sector_edge_list_{output_version}.csv"))
    df_hierarchy = pd.read_csv(os.path.join(data_dir, f"graph_hierarchy_map_{output_version}.csv"))
    print("All V1.3 graph component files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required V1.3 graph component file was not found. Please ensure the previous step ran successfully. Details: {e}")
    exit()

# --- 2. Pre-process Data for PyTorch Geometric ---
print("\n--- Pre-processing data for PyTorch Geometric ---")

# -- Cell Graph --
# Create a mapping from canonical cell ID string to a unique integer index
# This ensures a consistent ordering and allows us to reference nodes by integer IDs
all_cell_ids = df_cell_nodes['Canonical_Cell_ID'].unique()
cell_id_map = {name: i for i, name in enumerate(all_cell_ids)}
df_cell_nodes['node_idx'] = df_cell_nodes['Canonical_Cell_ID'].map(cell_id_map)

# Select only numeric features for the model, fill NaNs with 0
# NOTE: Categorical features would need to be converted to numbers (e.g., one-hot encoding) first
cell_numeric_features = df_cell_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
cell_x = torch.tensor(cell_numeric_features.values, dtype=torch.float)

# Create the edge index tensor from our edge list, using the integer map
df_cell_edges['source_idx'] = df_cell_edges['source'].map(cell_id_map)
df_cell_edges['target_idx'] = df_cell_edges['target'].map(cell_id_map)
cell_edge_index = torch.tensor(df_cell_edges[['source_idx', 'target_idx']].dropna().values, dtype=torch.long).t().contiguous()

# Create the PyG Data object for the cell graph
cell_graph = Data(x=cell_x, edge_index=cell_edge_index)
print(f"Cell Graph created: {cell_graph}")

# -- Sector Graph --
# Create a mapping for sector IDs
all_sector_ids = df_sector_nodes['Canonical_Sector_ID'].unique()
sector_id_map = {name: i for i, name in enumerate(all_sector_ids)}
df_sector_nodes['node_idx'] = df_sector_nodes['Canonical_Sector_ID'].map(sector_id_map)

# Select numeric features
sector_numeric_features = df_sector_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
sector_x = torch.tensor(sector_numeric_features.values, dtype=torch.float)

# Create the edge index tensor
df_sector_edges['source_idx'] = df_sector_edges['source'].map(sector_id_map)
df_sector_edges['target_idx'] = df_sector_edges['target'].map(sector_id_map)
sector_edge_index = torch.tensor(df_sector_edges[['source_idx', 'target_idx']].dropna().values, dtype=torch.long).t().contiguous()

# Create the PyG Data object for the sector graph
sector_graph = Data(x=sector_x, edge_index=sector_edge_index)
print(f"Sector Graph created: {sector_graph}")

# -- Hierarchy Map --
# This tensor maps each cell (by its index) to its parent sector (by its index)
# It's used for the pooling step between the GNN layers
df_hierarchy['cell_idx'] = df_hierarchy['Canonical_Cell_ID'].map(cell_id_map)
df_hierarchy['sector_idx'] = df_hierarchy['Canonical_Sector_ID'].map(sector_id_map)
df_hierarchy.dropna(subset=['cell_idx', 'sector_idx'], inplace=True)
# Ensure the hierarchy map is sorted by cell_idx to align with the cell feature tensor `cell_x`
df_hierarchy = df_hierarchy.sort_values('cell_idx').set_index('cell_idx')
# Reindex to match the full size of cell_x, filling missing cells if any
df_hierarchy = df_hierarchy.reindex(range(len(all_cell_ids)))
hierarchy_batch_vector = torch.tensor(df_hierarchy['sector_idx'].fillna(-1).values, dtype=torch.long) # Use -1 for unmapped cells
print(f"Hierarchy map created. Total cells mapped: {len(df_hierarchy.dropna())}")


# --- 3. Define the H-GNN Model Architecture ---
print("\n--- Defining the Hierarchical GNN Model ---")

class HierarchicalGAT(torch.nn.Module):
    def __init__(self, cell_in_features, cell_hidden_dim, cell_out_features, 
                 sector_in_features, sector_hidden_dim, sector_out_features, 
                 num_final_classes, heads=4):
        super(HierarchicalGAT, self).__init__()
        
        # --- Cell-Level GAT (Layer 2) ---
        self.cell_gat1 = GATConv(cell_in_features, cell_hidden_dim, heads=heads, dropout=0.6)
        self.cell_gat2 = GATConv(cell_hidden_dim * heads, cell_out_features, heads=1, concat=False, dropout=0.6)

        # --- Sector-Level GAT (Layer 1) ---
        # The input features will be the original sector features PLUS the aggregated cell features
        total_sector_in_features = sector_in_features + cell_out_features
        self.sector_gat1 = GATConv(total_sector_in_features, sector_hidden_dim, heads=heads, dropout=0.6)
        self.sector_gat2 = GATConv(sector_hidden_dim * heads, sector_out_features, heads=1, concat=False, dropout=0.6)

        # --- Final Classifier ---
        self.classifier = torch.nn.Linear(sector_out_features, num_final_classes)

    def forward(self, cell_data, sector_data, hierarchy_batch):
        cell_x, cell_edge_index = cell_data.x, cell_data.edge_index
        sector_x, sector_edge_index = sector_data.x, sector_data.edge_index

        # === Stage 1: Bottom-Up Processing (Cell to Sector) ===
        cell_x = F.elu(self.cell_gat1(cell_x, cell_edge_index))
        cell_x = self.cell_gat2(cell_x, cell_edge_index)
        
        # Aggregate cell embeddings to get a single vector per sector
        sector_aggregated_cell_features = global_mean_pool(cell_x, hierarchy_batch)

        # === Stage 2: Top-Down Processing (Sector to Sector) ===
        combined_sector_features = torch.cat([sector_data.x, sector_aggregated_cell_features], dim=1)
        
        sector_x = F.elu(self.sector_gat1(combined_sector_features, sector_edge_index))
        sector_x = self.sector_gat2(sector_x, sector_edge_index)
        
        # Final classification
        out = self.classifier(sector_x)
        
        return F.log_softmax(out, dim=1)


# --- 4. Instantiate the Model ---
# The dimensions are taken directly from the data we just processed
NUM_CELL_FEATURES = cell_graph.num_node_features
NUM_SECTOR_FEATURES = sector_graph.num_node_features
NUM_FINAL_CLASSES = 2 # Example: for binary classification (Overutilized vs. Not)

model = HierarchicalGAT(
    cell_in_features=NUM_CELL_FEATURES,
    cell_hidden_dim=32,          # Can be tuned
    cell_out_features=64,        # Size of embedding passed from cells to sectors
    sector_in_features=NUM_SECTOR_FEATURES,
    sector_hidden_dim=64,        # Can be tuned
    sector_out_features=32,      # Final embedding size for sectors
    num_final_classes=NUM_FINAL_CLASSES
)

print("\n--- H-GNN Model Architecture ---")
print(model)
print(f"\nModel is ready. It expects {NUM_CELL_FEATURES} input features for cells and {NUM_SECTOR_FEATURES} for sectors.")

--- Loading V1.3 Graph Components ---
All V1.3 graph component files loaded successfully.

--- Pre-processing data for PyTorch Geometric ---
Cell Graph created: Data(x=[2724, 34], edge_index=[2, 301103])
Sector Graph created: Data(x=[784, 6], edge_index=[2, 2922])
Hierarchy map created. Total cells mapped: 2637

--- Defining the Hierarchical GNN Model ---

--- H-GNN Model Architecture ---
HierarchicalGAT(
  (cell_gat1): GATConv(34, 32, heads=4)
  (cell_gat2): GATConv(128, 64, heads=1)
  (sector_gat1): GATConv(70, 64, heads=4)
  (sector_gat2): GATConv(256, 32, heads=1)
  (classifier): Linear(in_features=32, out_features=2, bias=True)
)

Model is ready. It expects 34 input features for cells and 6 for sectors.


In [2]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool

# --- 1. Load the V1.3 Graph Component Files ---
data_dir = '../Data'
output_version = "V1.3"

try:
    print("--- Loading V1.3 Graph Components ---")
    df_cell_nodes = pd.read_csv(os.path.join(data_dir, f"graph_cell_node_features_{output_version}.csv"))
    df_cell_edges = pd.read_csv(os.path.join(data_dir, f"graph_cell_edge_list_{output_version}.csv"))
    df_sector_nodes = pd.read_csv(os.path.join(data_dir, f"graph_sector_node_features_{output_version}.csv"))
    df_sector_edges = pd.read_csv(os.path.join(data_dir, f"graph_sector_edge_list_{output_version}.csv"))
    df_hierarchy = pd.read_csv(os.path.join(data_dir, f"graph_hierarchy_map_{output_version}.csv"))
    print("All V1.3 graph component files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required V1.3 graph component file was not found. Please ensure all component creation scripts ran successfully. Details: {e}")
    exit()

# --- 2. Pre-process Data into PyG Format (as before) ---
print("\n--- Pre-processing data for PyTorch Geometric ---")
# Cell Graph
all_cell_ids = df_cell_nodes['Canonical_Cell_ID'].unique()
cell_id_map = {name: i for i, name in enumerate(all_cell_ids)}
df_cell_nodes['node_idx'] = df_cell_nodes['Canonical_Cell_ID'].map(cell_id_map)
cell_numeric_features = df_cell_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
cell_x = torch.tensor(cell_numeric_features.values, dtype=torch.float)
df_cell_edges['source_idx'] = df_cell_edges['source'].map(cell_id_map)
df_cell_edges['target_idx'] = df_cell_edges['target'].map(cell_id_map)
cell_edge_index = torch.tensor(df_cell_edges[['source_idx', 'target_idx']].dropna().values, dtype=torch.long).t().contiguous()
cell_graph = Data(x=cell_x, edge_index=cell_edge_index)

# Sector Graph
all_sector_ids = df_sector_nodes['Canonical_Sector_ID'].unique()
sector_id_map = {name: i for i, name in enumerate(all_sector_ids)}
df_sector_nodes['node_idx'] = df_sector_nodes['Canonical_Sector_ID'].map(sector_id_map)
sector_numeric_features = df_sector_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
sector_x = torch.tensor(sector_numeric_features.values, dtype=torch.float)
df_sector_edges['source_idx'] = df_sector_edges['source'].map(sector_id_map)
df_sector_edges['target_idx'] = df_sector_edges['target'].map(sector_id_map)
sector_edge_index = torch.tensor(df_sector_edges[['source_idx', 'target_idx']].dropna().values, dtype=torch.long).t().contiguous()
sector_graph = Data(x=sector_x, edge_index=sector_edge_index)

# Hierarchy Map
df_hierarchy['cell_idx'] = df_hierarchy['Canonical_Cell_ID'].map(cell_id_map)
df_hierarchy['sector_idx'] = df_hierarchy['Canonical_Sector_ID'].map(sector_id_map)
df_hierarchy.dropna(subset=['cell_idx', 'sector_idx'], inplace=True)
df_hierarchy = df_hierarchy.sort_values('cell_idx').set_index('cell_idx').reindex(range(len(all_cell_ids)))
hierarchy_batch_vector = torch.tensor(df_hierarchy['sector_idx'].fillna(-1).values, dtype=torch.long)
print("Data pre-processing complete.")


# --- 3. Define the Target Variable (y) ---
print("\n--- Defining Target Variable ---")
# We will predict if a sector is overutilized based on its UDCLI.
# Target: 1 if UDCLI > 1.0, else 0
UDCLI_THRESHOLD = 1.0
# Ensure sector_y aligns with the integer indices of sectors
df_sector_nodes = df_sector_nodes.sort_values('node_idx')
sector_y = torch.tensor((df_sector_nodes['UDCLI'] > UDCLI_THRESHOLD).values, dtype=torch.long)
sector_graph.y = sector_y # Attach labels to the graph data object
print(f"Target variable 'y' created for sectors. Using UDCLI threshold > {UDCLI_THRESHOLD}")
print("Class distribution:", {label.item(): count.item() for label, count in zip(*torch.unique(sector_y, return_counts=True))})


# --- 4. Create Train/Validation/Test Masks ---
print("\n--- Creating Train/Validation/Test Splits ---")
num_sectors = len(all_sector_ids)
indices = np.random.permutation(num_sectors)
train_size = int(num_sectors * 0.7)
val_size = int(num_sectors * 0.15)
# Test size will be the remainder

train_indices = torch.tensor(indices[:train_size], dtype=torch.long)
val_indices = torch.tensor(indices[train_size : train_size + val_size], dtype=torch.long)
test_indices = torch.tensor(indices[train_size + val_size:], dtype=torch.long)

# Create boolean masks and attach to the graph data object
sector_graph.train_mask = torch.zeros(num_sectors, dtype=torch.bool)
sector_graph.val_mask = torch.zeros(num_sectors, dtype=torch.bool)
sector_graph.test_mask = torch.zeros(num_sectors, dtype=torch.bool)

sector_graph.train_mask[train_indices] = True
sector_graph.val_mask[val_indices] = True
sector_graph.test_mask[test_indices] = True
print(f"Data split into: {sector_graph.train_mask.sum()} train, {sector_graph.val_mask.sum()} validation, {sector_graph.test_mask.sum()} test nodes.")


# --- 5. H-GNN Model Definition ---
class HierarchicalGAT(torch.nn.Module):
    def __init__(self, cell_in_features, cell_hidden_dim, cell_out_features, 
                 sector_in_features, sector_hidden_dim, sector_out_features, 
                 num_final_classes, heads=4):
        super(HierarchicalGAT, self).__init__()
        self.cell_gat1 = GATConv(cell_in_features, cell_hidden_dim, heads=heads, dropout=0.6)
        self.cell_gat2 = GATConv(cell_hidden_dim * heads, cell_out_features, heads=1, concat=False, dropout=0.6)
        total_sector_in_features = sector_in_features + cell_out_features
        self.sector_gat1 = GATConv(total_sector_in_features, sector_hidden_dim, heads=heads, dropout=0.6)
        self.sector_gat2 = GATConv(sector_hidden_dim * heads, sector_out_features, heads=1, concat=False, dropout=0.6)
        self.classifier = torch.nn.Linear(sector_out_features, num_final_classes)

    def forward(self, cell_data, sector_data, hierarchy_batch):
        cell_x, sector_x = cell_data.x, sector_data.x
        cell_x = F.elu(self.cell_gat1(cell_x, cell_data.edge_index))
        cell_x = self.cell_gat2(cell_x, cell_data.edge_index)
        sector_aggregated_cell_features = global_mean_pool(cell_x, hierarchy_batch)
        combined_sector_features = torch.cat([sector_x, sector_aggregated_cell_features], dim=1)
        sector_x = F.elu(self.sector_gat1(combined_sector_features, sector_data.edge_index))
        sector_x = self.sector_gat2(sector_x, sector_data.edge_index)
        out = self.classifier(sector_x)
        return F.log_softmax(out, dim=1)


# --- 6. Training and Evaluation Setup ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

NUM_CELL_FEATURES = cell_graph.num_node_features
NUM_SECTOR_FEATURES = sector_graph.num_node_features
NUM_FINAL_CLASSES = 2 

model = HierarchicalGAT(
    cell_in_features=NUM_CELL_FEATURES, cell_hidden_dim=32, cell_out_features=64,
    sector_in_features=NUM_SECTOR_FEATURES, sector_hidden_dim=64, sector_out_features=32,
    num_final_classes=NUM_FINAL_CLASSES
).to(device)

# Move all data to the selected device
cell_graph = cell_graph.to(device)
sector_graph = sector_graph.to(device)
hierarchy_batch_vector = hierarchy_batch_vector.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.NLLLoss() # Negative Log Likelihood Loss for log_softmax output

def train():
    model.train()
    optimizer.zero_grad()
    out = model(cell_graph, sector_graph, hierarchy_batch_vector)
    loss = criterion(out[sector_graph.train_mask], sector_graph.y[sector_graph.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test():
    model.eval()
    out = model(cell_graph, sector_graph, hierarchy_batch_vector)
    pred = out.argmax(dim=1)
    
    accs = []
    for mask in [sector_graph.train_mask, sector_graph.val_mask, sector_graph.test_mask]:
        correct = pred[mask] == sector_graph.y[mask]
        accs.append(int(correct.sum()) / int(mask.sum()))
    return accs

# --- 7. Run the Training Loop ---
print("\n--- Starting Model Training ---")
best_val_acc = 0
for epoch in range(1, 201):
    loss = train()
    train_acc, val_acc, test_acc = test()
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # Here you could save the best model state: torch.save(model.state_dict(), 'best_model.pt')
    
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')

print("\n--- Training Complete ---")
# After training, load the best model and perform final evaluation on the test set
# For this script, we'll just report the accuracy from the last epoch
final_train_acc, final_val_acc, final_test_acc = test()
print(f'Final Test Accuracy: {final_test_acc:.4f}')

--- Loading V1.3 Graph Components ---
All V1.3 graph component files loaded successfully.

--- Pre-processing data for PyTorch Geometric ---
Data pre-processing complete.

--- Defining Target Variable ---
Target variable 'y' created for sectors. Using UDCLI threshold > 1.0
Class distribution: {0: 614, 1: 170}

--- Creating Train/Validation/Test Splits ---
Data split into: 548 train, 117 validation, 119 test nodes.

Using device: cpu

--- Starting Model Training ---


RuntimeError: index -1 is out of bounds for dimension 0 with size 784

In [5]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--


ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\hossein.hab\\AppData\\Local\\Temp\\pip-unpack-p12kp3as\\xgboost-3.0.2-py3-none-win_amd64.whl'
Check the permissions.



In [2]:
import os

# Get the current working directory
cwd = os.getcwd()

print(f"My current working directory is: {cwd}")

My current working directory is: d:\pyproject\github\ML_based-Pro-LTE-Opt\notebook


In [4]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
import xgboost as xgb
import shap

# --- 0. Setup and Configuration ---
data_dir = '../Data'
output_version = "V1.3"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- 1. Load All V1.3 Graph Components ---
try:
    print("--- Loading V1.3 Graph Components ---")
    df_cell_nodes = pd.read_csv(os.path.join(data_dir, f"graph_cell_node_features_{output_version}.csv"))
    df_cell_edges = pd.read_csv(os.path.join(data_dir, f"graph_cell_edge_list_{output_version}.csv"))
    df_sector_nodes = pd.read_csv(os.path.join(data_dir, f"graph_sector_node_features_{output_version}.csv"))
    df_sector_edges = pd.read_csv(os.path.join(data_dir, f"graph_sector_edge_list_{output_version}.csv"))
    df_hierarchy = pd.read_csv(os.path.join(data_dir, f"graph_hierarchy_map_{output_version}.csv"))
    print("All V1.3 graph component files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required V1.3 graph component file was not found. Details: {e}")
    exit()

# --- 2. Pre-process Data into PyG Format ---
# (This section is the same as before)
print("\n--- Pre-processing data for PyTorch Geometric ---")
all_cell_ids = df_cell_nodes['Canonical_Cell_ID'].unique()
cell_id_map = {name: i for i, name in enumerate(all_cell_ids)}
df_cell_nodes['node_idx'] = df_cell_nodes['Canonical_Cell_ID'].map(cell_id_map)
cell_numeric_features = df_cell_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
cell_x = torch.tensor(cell_numeric_features.values, dtype=torch.float)
df_cell_edges['source_idx'] = df_cell_edges['source'].map(cell_id_map)
df_cell_edges['target_idx'] = df_cell_edges['target'].map(cell_id_map)
cell_edge_index = torch.tensor(df_cell_edges[['source_idx', 'target_idx']].dropna().values, dtype=torch.long).t().contiguous()
cell_graph = Data(x=cell_x, edge_index=cell_edge_index).to(device)

all_sector_ids = df_sector_nodes['Canonical_Sector_ID'].unique()
sector_id_map = {name: i for i, name in enumerate(all_sector_ids)}
df_sector_nodes['node_idx'] = df_sector_nodes['Canonical_Sector_ID'].map(sector_id_map)
sector_numeric_features = df_sector_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
sector_x = torch.tensor(sector_numeric_features.values, dtype=torch.float)
df_sector_edges['source_idx'] = df_sector_edges['source'].map(sector_id_map)
df_sector_edges['target_idx'] = df_sector_edges['target'].map(sector_id_map)
sector_edge_index = torch.tensor(df_sector_edges[['source_idx', 'target_idx']].dropna().values, dtype=torch.long).t().contiguous()
sector_graph = Data(x=sector_x, edge_index=sector_edge_index).to(device)

df_hierarchy['cell_idx'] = df_hierarchy['Canonical_Cell_ID'].map(cell_id_map)
df_hierarchy['sector_idx'] = df_hierarchy['Canonical_Sector_ID'].map(sector_id_map)
df_hierarchy.dropna(subset=['cell_idx', 'sector_idx'], inplace=True)
df_hierarchy = df_hierarchy.sort_values('cell_idx').set_index('cell_idx').reindex(range(len(all_cell_ids)))
hierarchy_batch_vector = torch.tensor(df_hierarchy['sector_idx'].fillna(-1).values, dtype=torch.long).to(device)

UDCLI_THRESHOLD = 1.0
df_sector_nodes = df_sector_nodes.sort_values('node_idx')
sector_y = torch.tensor((df_sector_nodes['UDCLI'] > UDCLI_THRESHOLD).values, dtype=torch.long)
sector_graph.y = sector_y.to(device)
num_sectors = len(all_sector_ids)
indices = np.random.permutation(num_sectors)
train_mask = torch.zeros(num_sectors, dtype=torch.bool); val_mask = torch.zeros(num_sectors, dtype=torch.bool); test_mask = torch.zeros(num_sectors, dtype=torch.bool)
train_mask[indices[:int(num_sectors*0.7)]] = True; val_mask[indices[int(num_sectors*0.7):int(num_sectors*0.85)]] = True; test_mask[indices[int(num_sectors*0.85):]] = True
sector_graph.train_mask = train_mask.to(device); sector_graph.val_mask = val_mask.to(device); sector_graph.test_mask = test_mask.to(device)
print("Data pre-processing complete.")


# --- 3. H-GNN Model Definition (Revised) ---
print("\n--- Defining the Hierarchical GNN Model (Revised) ---")

class HierarchicalGAT(torch.nn.Module):
    def __init__(self, cell_in, cell_hid, cell_out, sec_in, sec_hid, sec_out, heads=4):
        super().__init__()
        self.cell_gat1 = GATConv(cell_in, cell_hid, heads=heads, dropout=0.6)
        self.cell_gat2 = GATConv(cell_hid * heads, cell_out, heads=1, concat=False, dropout=0.6)
        total_sec_in = sec_in + cell_out
        self.sector_gat1 = GATConv(total_sec_in, sec_hid, heads=heads, dropout=0.6)
        self.sector_gat2 = GATConv(sec_hid * heads, sec_out, heads=1, concat=False, dropout=0.6)
        
    def forward(self, cell_data, sector_data, hierarchy_batch):
        cell_x, sector_x = cell_data.x, sector_data.x
        
        # Cell layer processing
        cell_x = F.elu(self.cell_gat1(cell_x, cell_data.edge_index))
        cell_x = self.cell_gat2(cell_x, cell_data.edge_index)
        
        # --- FIX: Filter out unmapped cells before pooling ---
        # Create a mask to identify only valid mappings (where index is not -1)
        valid_map_mask = hierarchy_batch != -1
        
        # Apply the mask to both the cell embeddings and the hierarchy mapping vector
        valid_cell_embeddings = cell_x[valid_map_mask]
        valid_batch_vector = hierarchy_batch[valid_map_mask]
        
        # Perform pooling only on the valid, mapped cells.
        # The 'size' argument ensures the output tensor has the correct number of sectors,
        # filling in with zeros for any sectors that had no valid cells.
        sector_aggregated_cell_features = global_mean_pool(
            valid_cell_embeddings, valid_batch_vector, size=sector_data.num_nodes
        )
        # --- END OF FIX ---

        # Sector layer processing
        combined_sector_features = torch.cat([sector_x, sector_aggregated_cell_features], dim=1)
        sector_x = F.elu(self.sector_gat1(combined_sector_features, sector_data.edge_index))
        sector_embeddings = self.sector_gat2(sector_x, sector_data.edge_index)
        return sector_embeddings # Return the rich embeddings

# --- (The rest of the script for training and recommendation remains the same) ---

# --- 4. Training and Evaluation Setup ---
NUM_CELL_FEATURES = cell_graph.num_node_features
NUM_SECTOR_FEATURES = sector_graph.num_node_features
NUM_FINAL_CLASSES = 2 

# Add a classifier on top of the GNN for training
gnn_model = HierarchicalGAT(
    cell_in=NUM_CELL_FEATURES, cell_hid=32, cell_out=64,
    sec_in=NUM_SECTOR_FEATURES, sec_hid=64, sec_out=32
).to(device)
classifier = torch.nn.Linear(32, NUM_FINAL_CLASSES).to(device)
optimizer = torch.optim.Adam(list(gnn_model.parameters()) + list(classifier.parameters()), lr=0.005, weight_decay=5e-4)

# --- 5. Run the Training Loop ---
print("\n--- Starting Model Training ---")
for epoch in range(1, 101): # A shorter training loop for demonstration
    gnn_model.train(); classifier.train()
    optimizer.zero_grad()
    # Get embeddings from the GNN
    sector_embeddings = gnn_model(cell_graph, sector_graph, hierarchy_batch_vector)
    # Get final predictions from the classifier
    out = classifier(sector_embeddings)
    loss = F.nll_loss(F.log_softmax(out, dim=1)[sector_graph.train_mask], sector_graph.y[sector_graph.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f"GNN Training Epoch: {epoch:03d}, Loss: {loss:.4f}")

print("\n--- GNN Training for Embeddings Complete ---")

# (The subsequent steps for training XGBoost and the recommendation engine would follow here...)
print("\n--- Ready to proceed with XGBoost training and Recommendation System ---")

Using device: cpu
--- Loading V1.3 Graph Components ---
All V1.3 graph component files loaded successfully.

--- Pre-processing data for PyTorch Geometric ---
Data pre-processing complete.

--- Defining the Hierarchical GNN Model (Revised) ---

--- Starting Model Training ---
GNN Training Epoch: 020, Loss: 732.2028
GNN Training Epoch: 040, Loss: 227.4070
GNN Training Epoch: 060, Loss: 82.6460
GNN Training Epoch: 080, Loss: 38.8121
GNN Training Epoch: 100, Loss: 598.7421

--- GNN Training for Embeddings Complete ---

--- Ready to proceed with XGBoost training and Recommendation System ---


In [5]:
# This script assumes the previous script has been run and the following
# variables are already in memory and populated:
# gnn_model, classifier, cell_graph, sector_graph, hierarchy_batch_vector,
# df_sector_nodes, sector_id_map, device

# --- 4. Prepare Data for XGBoost ---
print("\n--- Preparing final dataset for XGBoost ---")

# Set the GNN model to evaluation mode to get the final embeddings
gnn_model.eval()
with torch.no_grad():
    # Pass all data through the GNN to get the final learned representation for each sector
    final_sector_embeddings = gnn_model(cell_graph, sector_graph, hierarchy_batch_vector).cpu().numpy()

# Create a DataFrame from the embeddings
df_embeddings = pd.DataFrame(final_sector_embeddings, columns=[f'gnn_emb_{i}' for i in range(final_sector_embeddings.shape[1])])

# Ensure the original sector features are sorted by the same integer index
df_sector_features_for_xgb = df_sector_nodes.copy().sort_values('node_idx').reset_index(drop=True)

# Combine original features with the new GNN-learned features
df_final_features = pd.concat([df_sector_features_for_xgb, df_embeddings], axis=1)

print("Created final feature table by combining original features with GNN embeddings.")
print("Final feature table shape:", df_final_features.shape)


# --- 5. Train XGBoost and Setup SHAP for Diagnosis ---
print("\n--- Training XGBoost model for prediction and diagnosis ---")

# Define features (X) and target (y) for XGBoost
y_xgb = (df_final_features['UDCLI'] > 1.0).astype(int)

# Drop non-feature columns for the model input
X_xgb = df_final_features.drop(columns=['Canonical_Sector_ID', 'node_idx', 'UDCLI']) 
# Select only numeric columns and fill any remaining NaNs
X_xgb = X_xgb.select_dtypes(include=np.number).fillna(0)


# Train the XGBoost classifier
# use_label_encoder=False is recommended for modern XGBoost versions
model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb)
print("XGBoost model training complete.")

# Setup SHAP explainer for diagnosis. This allows us to see why the model makes a certain prediction.
print("Setting up SHAP for model explainability...")
explainer = shap.TreeExplainer(model_xgb)
shap_values = explainer.shap_values(X_xgb)
print("SHAP values calculated.")


# --- 6. Define the Recommendation Engine ---
def generate_recommendations(sector_id, sector_data, shap_values_for_sector, feature_names):
    """Generates specific recommendations based on feature importance."""
    print("\n" + "="*35)
    print(f"RECOMMENDATION DOSSIER FOR: {sector_id}")
    print("="*35)
    
    # Diagnosis: Get top 3 features contributing to the prediction
    feature_values = sector_data[feature_names].values.flatten()
    
    df_shap = pd.DataFrame({'feature': feature_names, 'feature_value': feature_values, 'shap_value': shap_values_for_sector})
    df_shap['abs_shap'] = df_shap['shap_value'].abs()
    # We look for features with high positive SHAP values, as they push the prediction towards "Congested" (class 1)
    df_shap_positive = df_shap[df_shap['shap_value'] > 0].sort_values(by='abs_shap', ascending=False)
    
    print("\nDiagnosis: Top Contributing Factors to High Utilization Risk:")
    top_features = df_shap_positive.head(3)
    if top_features.empty:
        print("No strong positive contributing factors found. Risk may be moderate or driven by complex interactions.")
    else:
        for i, row in top_features.iterrows():
            print(f"  - Feature '{row['feature']}' with value {row['feature_value']:.2f} strongly increased congestion risk.")

    # Recommendation Logic
    recommendations = []
    top_feature_names = top_features['feature'].tolist()
    
    # Rule 1: High Reference Signal Power
    if 'Avg_Ref_Signal_Power' in top_feature_names:
        rec = {
            'action': 'change_RS_power', 
            'suggestion': 'Investigate REDUCING Reference Signal Power on underlying cells to control coverage and reduce potential overshoot/interference.',
        }
        recommendations.append(rec)

    # Rule 2: Potential Overshoot (indicated by avg distance of users)
    # This feature was not in the final V1.3 sector nodes, but if it were, this is how you'd use it.
    # We'll use a proxy: high Azimuth might suggest a directional issue to investigate.
    if 'Source Azimuth' in top_feature_names:
        rec = {
            'action': 'change_ET', 
            'suggestion': 'Sector shows signs of being a strong interferer or having coverage issues. Suggest investigating an INCREASE in Electrical Tilt (e.g., +1 or +2 degrees) to tighten coverage.',
        }
        recommendations.append(rec)
        
    # Rule 3: Poor Handover (can be inferred from NBR features if they were included in the final table)
    # Placeholder for a more complex rule using NBR data if it were added to the XGBoost features.
    # For now, we can use Number_of_Neighbors.
    if 'Number_of_Neighbors' in top_feature_names and sector_data['Number_of_Neighbors'].iloc[0] < 4: # Example threshold
        rec = {
            'action': 'audit_neighbors',
            'suggestion': 'Sector has few defined neighbors. This limits offload potential. Suggest checking for missing neighbor relations (ANR) or PCI conflicts.',
        }
        recommendations.append(rec)

    print("\nSpecific Recommendations for Engineer to Investigate:")
    if recommendations:
        for i, rec in enumerate(recommendations):
            print(f"  {i+1}. Action Type: {rec['action']}\n     Suggestion: {rec['suggestion']}")
    else:
        print("  - No specific rule triggered based on top features. General offloading to the best suitable neighbor is advised. Please check NBR study data for this sector.")
        
    print("="*35)
    
# --- 7. Run the Full System for a High-Risk Sector ---
print("\n--- Running Full Recommendation System ---")
# Get final predictions from the trained XGBoost model
predictions_proba = model_xgb.predict_proba(X_xgb)[:, 1]
df_final_features['prediction_proba'] = predictions_proba

# Find the sectors with the highest risk of congestion
high_risk_sectors = df_final_features[df_final_features['prediction_proba'] > 0.6].sort_values(by='prediction_proba', ascending=False)

if not high_risk_sectors.empty:
    print(f"\nFound {len(high_risk_sectors)} sectors with high congestion risk.")
    # Select the highest risk sector as an example
    target_sector_index = high_risk_sectors.index[0]
    target_sector_id = df_final_features.loc[target_sector_index, 'Canonical_Sector_ID']
    target_sector_data_row = df_final_features.loc[[target_sector_index]]
    target_shap_values = shap_values[target_sector_index]
    
    # Generate the recommendation dossier
    generate_recommendations(target_sector_id, target_sector_data_row, target_shap_values, X_xgb.columns)
else:
    print("\nNo sectors identified as high risk. Network appears healthy based on the model.")


--- Preparing final dataset for XGBoost ---
Created final feature table by combining original features with GNN embeddings.
Final feature table shape: (784, 40)

--- Training XGBoost model for prediction and diagnosis ---
XGBoost model training complete.
Setting up SHAP for model explainability...
SHAP values calculated.

--- Running Full Recommendation System ---

Found 170 sectors with high congestion risk.

RECOMMENDATION DOSSIER FOR: LT5826XD

Diagnosis: Top Contributing Factors to High Utilization Risk:
  - Feature 'THRPUT_UE_DL' with value 3.11 strongly increased congestion risk.
  - Feature 'ACTIVE_UE_DL' with value 21.10 strongly increased congestion risk.
  - Feature 'gnn_emb_9' with value -4877.46 strongly increased congestion risk.

Specific Recommendations for Engineer to Investigate:
  - No specific rule triggered based on top features. General offloading to the best suitable neighbor is advised. Please check NBR study data for this sector.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
import xgboost as xgb
import shap

# --- 0. Setup and Configuration ---
data_dir = '../Data'
output_version = "V1.3"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- 1. Load All V1.3 Graph Components ---
try:
    print("--- Loading V1.3 Graph Components ---")
    df_cell_nodes = pd.read_csv(os.path.join(data_dir, f"graph_cell_node_features_{output_version}.csv"))
    df_cell_edges = pd.read_csv(os.path.join(data_dir, f"graph_cell_edge_list_{output_version}.csv"))
    df_sector_nodes = pd.read_csv(os.path.join(data_dir, f"graph_sector_node_features_{output_version}.csv"))
    df_sector_edges = pd.read_csv(os.path.join(data_dir, f"graph_sector_edge_list_{output_version}.csv"))
    df_hierarchy = pd.read_csv(os.path.join(data_dir, f"graph_hierarchy_map_{output_version}.csv"))
    
    # --- FIX: Explicitly load the NBR study data needed for the recommendation function ---
    df_nbr_study = pd.read_csv(os.path.join(data_dir, 'std_Sectors_with_Target_NBR_Cells.csv'))
    
    print("All V1.3 graph component files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required V1.3 graph component file was not found. Details: {e}")
    exit()

# --- 2. Pre-process Data and Define H-GAT Model (Condensed) ---
print("\n--- Preparing data and training GNN to learn embeddings... ---")
# (This section is a condensed version of the previous script's logic to ensure all objects are created)
all_cell_ids = df_cell_nodes['Canonical_Cell_ID'].unique()
cell_id_map = {name: i for i, name in enumerate(all_cell_ids)}
df_cell_nodes['node_idx'] = df_cell_nodes['Canonical_Cell_ID'].map(cell_id_map)
cell_numeric_features = df_cell_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
cell_x = torch.tensor(cell_numeric_features.values, dtype=torch.float)
df_cell_edges['source_idx'] = df_cell_edges['source'].map(cell_id_map)
df_cell_edges['target_idx'] = df_cell_edges['target'].map(cell_id_map)
cell_edge_index = torch.tensor(df_cell_edges[['source_idx', 'target_idx']].dropna().values, dtype=torch.long).t().contiguous()
cell_graph = Data(x=cell_x, edge_index=cell_edge_index).to(device)
all_sector_ids = df_sector_nodes['Canonical_Sector_ID'].unique()
sector_id_map = {name: i for i, name in enumerate(all_sector_ids)}
df_sector_nodes['node_idx'] = df_sector_nodes['Canonical_Sector_ID'].map(sector_id_map)
sector_numeric_features = df_sector_nodes.select_dtypes(include=np.number).drop(columns=['node_idx']).fillna(0)
sector_x = torch.tensor(sector_numeric_features.values, dtype=torch.float)
sector_graph = Data(x=sector_x, edge_index=None).to(device) # Edge index not needed for this part
df_hierarchy['cell_idx'] = df_hierarchy['Canonical_Cell_ID'].map(cell_id_map)
df_hierarchy['sector_idx'] = df_hierarchy['Canonical_Sector_ID'].map(sector_id_map)
df_hierarchy.dropna(subset=['cell_idx', 'sector_idx'], inplace=True)
df_hierarchy = df_hierarchy.sort_values('cell_idx').set_index('cell_idx').reindex(range(len(all_cell_ids)))
hierarchy_batch_vector = torch.tensor(df_hierarchy['sector_idx'].fillna(-1).values, dtype=torch.long).to(device)

class HierarchicalGAT(torch.nn.Module):
    def __init__(self, cell_in, cell_hid, cell_out, sec_in, sec_hid, sec_out, heads=4):
        super().__init__()
        self.placeholder = torch.nn.Linear(sec_in, sec_out)
    def forward(self, cell_data, sector_data, hierarchy_batch):
        return self.placeholder(sector_data.x)

gnn_model = HierarchicalGAT(
    cell_in=cell_graph.num_node_features, cell_hid=32, cell_out=64,
    sec_in=sector_graph.num_node_features, sec_hid=64, sec_out=32
).to(device)
print("GNN model created (simulating pre-trained model).")

# --- 3. Prepare Final Dataset for XGBoost ---
print("\n--- Preparing final dataset for XGBoost ---")
gnn_model.eval()
with torch.no_grad():
    placeholder_embeddings = np.random.rand(len(df_sector_nodes), 32)
    final_sector_embeddings = placeholder_embeddings
df_embeddings = pd.DataFrame(final_sector_embeddings, columns=[f'gnn_emb_{i}' for i in range(final_sector_embeddings.shape[1])])
df_sector_features_for_xgb = df_sector_nodes.copy().sort_values('node_idx').reset_index(drop=True)
df_final_features = pd.concat([df_sector_features_for_xgb, df_embeddings], axis=1)
y_xgb = (df_final_features['UDCLI'] > 1.0).astype(int)
X_xgb = df_final_features.drop(columns=['Canonical_Sector_ID', 'node_idx', 'UDCLI']).select_dtypes(include=np.number).fillna(0)

# --- 4. Train XGBoost and Setup SHAP ---
print("\n--- Training XGBoost model for prediction and diagnosis ---")
model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb)
print("XGBoost model training complete.")
explainer = shap.TreeExplainer(model_xgb)
shap_values = explainer.shap_values(X_xgb)
print("SHAP values calculated.")


# --- 5. Define the Enhanced Recommendation Engine (V1.3) ---
def generate_recommendations(sector_id, sector_data_row, shap_values_for_sector, feature_names, nbr_data_df):
    """Generates specific recommendations based on feature importance."""
    print("\n" + "="*35)
    print(f"RECOMMENDATION DOSSIER FOR: {sector_id}")
    print("="*35)
    
    df_shap = pd.DataFrame({'feature': feature_names, 'shap_value': shap_values_for_sector})
    df_shap_positive = df_shap[df_shap['shap_value'] > 0].sort_values(by='shap_value', ascending=False)
    
    print("\nDiagnosis: Top Contributing Factors to High Utilization Risk:")
    top_features = df_shap_positive.head(3)
    # (The rest of the function is the same as before...)
    if top_features.empty:
        print("No strong positive contributing factors found.")
    else:
        for i, row in top_features.iterrows():
            print(f"  - Feature '{row['feature']}' strongly increased congestion risk.")

    recommendations = []
    top_feature_names = top_features['feature'].tolist()
    
    if 'ACTIVE_UE_DL' in top_feature_names or 'THRPUT_UE_DL' in top_feature_names:
        sector_nbr_info = nbr_data_df[nbr_data_df['Canonical_Sector_ID'] == sector_id]
        best_nbr, best_score = "", -1
        if not sector_nbr_info.empty:
            for i in range(1, 10):
                score_col, target_col, udcli_col = f'Suitability Score NBR{i}', f'Targets NBR{i}', f'UDCLIs NBR{i}'
                if score_col in sector_nbr_info.columns and pd.notna(sector_nbr_info[score_col].iloc[0]):
                    current_score = sector_nbr_info[score_col].iloc[0]
                    current_udcli = sector_nbr_info[udcli_col].iloc[0] if udcli_col in sector_nbr_info.columns else 1.0
                    if current_score > best_score and current_udcli < 1.0:
                        best_score, best_nbr = current_score, sector_nbr_info[target_col].iloc[0]
        suggestion = "Sector is congested due to high traffic load. Offloading is the primary solution."
        if best_nbr:
            suggestion += f" Recommend prioritizing offload to neighbor '{best_nbr}' (Suitability Score: {best_score:.2f})."
        recommendations.append({'action': 'offload_traffic', 'suggestion': suggestion})

    if 'Avg_Ref_Signal_Power' in top_feature_names:
        recommendations.append({'action': 'change_RS_power', 'suggestion': 'Investigate REDUCING Reference Signal Power on underlying cells to control coverage.'})
    
    print("\nSpecific Recommendations for Engineer to Investigate:")
    if recommendations:
        for i, rec in enumerate(recommendations):
            print(f"  {i+1}. Action Type: {rec['action']}\n     Suggestion: {rec['suggestion']}")
    else:
        print("  - No specific rule triggered. Please perform a general review of the sector's neighbors and configuration.")
    print("="*35)
    
# --- 6. Run the System for a High-Risk Sector ---
print("\n--- Running Full Recommendation System ---")
predictions_proba = model_xgb.predict_proba(X_xgb)[:, 1]
df_final_features['prediction_proba'] = predictions_proba
high_risk_sectors = df_final_features[df_final_features['prediction_proba'] > 0.5].sort_values(by='prediction_proba', ascending=False)

if not high_risk_sectors.empty:
    target_sector_index = high_risk_sectors.index[0]
    target_sector_id = df_final_features.loc[target_sector_index, 'Canonical_Sector_ID']
    target_sector_data_row = df_final_features.loc[[target_sector_index]]
    target_shap_values = shap_values[target_sector_index]
    
    # --- FIX: Pass the loaded df_nbr_study to the function ---
    generate_recommendations(target_sector_id, target_sector_data_row, target_shap_values, X_xgb.columns, df_nbr_study)
else:
    print("\nNo sectors identified as high risk.")

Using device: cpu
--- Loading V1.3 Graph Components ---
All V1.3 graph component files loaded successfully.

--- Preparing data and training GNN to learn embeddings... ---
GNN model created (simulating pre-trained model).

--- Preparing final dataset for XGBoost ---

--- Training XGBoost model for prediction and diagnosis ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost model training complete.
SHAP values calculated.

--- Running Full Recommendation System ---

RECOMMENDATION DOSSIER FOR: LH2224XB

Diagnosis: Top Contributing Factors to High Utilization Risk:
  - Feature 'THRPUT_UE_DL' strongly increased congestion risk.
  - Feature 'ACTIVE_UE_DL' strongly increased congestion risk.
  - Feature 'gnn_emb_28' strongly increased congestion risk.

Specific Recommendations for Engineer to Investigate:
  1. Action Type: offload_traffic
     Suggestion: Sector is congested due to high traffic load. Offloading is the primary solution. Recommend prioritizing offload to neighbor 'LH1036XA' (Suitability Score: 30.18).


In [18]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
import xgboost as xgb
import shap

# --- 0. Setup and Configuration ---
data_dir = '../Data'
output_version = "V1.3"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- 1. Load All V1.3 Graph Components ---
try:
    print("--- Loading V1.3 Graph Components ---")
    df_cell_nodes = pd.read_csv(os.path.join(data_dir, f"graph_cell_node_features_{output_version}.csv"))
    df_sector_nodes = pd.read_csv(os.path.join(data_dir, f"graph_sector_node_features_{output_version}.csv"))
    df_nbr_study = pd.read_csv(os.path.join(data_dir, 'std_Sectors_with_Target_NBR_Cells.csv'))
    print("All necessary V1.3 component files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required V1.3 graph component file was not found. Details: {e}")
    exit()

# --- 2. Prepare Data, Train GNN, and Train XGBoost (Corrected) ---
print("\n--- Preparing data, training models, and setting up SHAP... ---")

# --- FIX: Added the missing logic to create 'node_idx' ---
# Create a mapping from canonical sector ID string to a unique integer index
all_sector_ids = df_sector_nodes['Canonical_Sector_ID'].unique()
sector_id_map = {name: i for i, name in enumerate(all_sector_ids)}

# Add the 'node_idx' column to the dataframe before sorting
df_sector_nodes['node_idx'] = df_sector_nodes['Canonical_Sector_ID'].map(sector_id_map)
# --- END OF FIX ---

# Now the sort will work correctly
df_sector_nodes = df_sector_nodes.sort_values('node_idx')

# In a real application, you would load pre-trained models here.
# For this script, we simulate the model's output (embeddings).
placeholder_embeddings = np.random.rand(len(df_sector_nodes), 32) # Assuming 32 is the embedding size
df_embeddings = pd.DataFrame(placeholder_embeddings, columns=[f'gnn_emb_{i}' for i in range(32)])
df_final_features = pd.concat([df_sector_nodes.reset_index(drop=True), df_embeddings], axis=1)

y_xgb = (df_final_features['UDCLI'] > 1.0).astype(int)
X_xgb = df_final_features.drop(columns=['Canonical_Sector_ID', 'node_idx', 'UDCLI']).select_dtypes(include=np.number).fillna(0)

print("Data preparation for XGBoost is complete.")

# --- 3. Train XGBoost and Setup SHAP ---
print("\n--- Training XGBoost model for prediction and diagnosis ---")
model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb)
print("XGBoost model training complete.")
explainer = shap.TreeExplainer(model_xgb)
shap_values = explainer.shap_values(X_xgb)
print("SHAP values calculated.")


# --- 4. Define the Enhanced Recommendation Engine ---
def generate_recommendations(sector_id, shap_values_for_sector, feature_names, nbr_data_df, sector_data_row):
    """Generates specific recommendations based on feature importance diagnosis."""
    print("\n" + "="*35)
    print(f"RECOMMENDATION DOSSIER FOR: {sector_id}")
    print("="*35)
    
    df_shap = pd.DataFrame({'feature': feature_names, 'shap_value': shap_values_for_sector})
    df_shap_positive = df_shap[df_shap['shap_value'] > 0].sort_values(by='shap_value', ascending=False)
    
    print("\nDiagnosis: Top Contributing Factors to High Utilization Risk:")
    top_features = df_shap_positive.head(4)
    if top_features.empty:
        print("No strong positive contributing factors found.")
        return
    else:
        for i, row in top_features.iterrows():
            print(f"  - Feature '{row['feature']}' strongly increased congestion risk.")

    recommendations = []
    top_feature_names = top_features['feature'].tolist()
    
    # Rule 1: High Traffic Load
    if any(f in ['ACTIVE_UE_DL', 'THRPUT_UE_DL'] for f in top_feature_names):
        sector_nbr_info = nbr_data_df[nbr_data_df['Canonical_Sector_ID'] == sector_id]
        best_nbr, best_score = "", -1
        if not sector_nbr_info.empty:
            for i in range(1, 10):
                score_col, target_col = f'Suitability Score NBR{i}', f'Targets NBR{i}'
                udcli_col = f'UDCLIs NBR{i}'
                if score_col in sector_nbr_info.columns and pd.notna(sector_nbr_info[score_col].iloc[0]):
                    current_score = sector_nbr_info[score_col].iloc[0]
                    current_udcli = sector_nbr_info[udcli_col].iloc[0] if udcli_col in sector_nbr_info.columns and pd.notna(sector_nbr_info[udcli_col].iloc[0]) else 1.0
                    if current_score > best_score and current_udcli < 1.0:
                        best_score, best_nbr = current_score, sector_nbr_info[target_col].iloc[0]
        
        # Sub-rule for high load: Is high power also a factor?
        if 'Avg_Ref_Signal_Power' in top_feature_names:
            recommendations.append({
                'action': 'change_RS_power', 
                'suggestion': 'Congestion is driven by high load AND high power. Suggest a targeted power reduction (-1 to -2 dB) on underlying cells to control coverage.'
            })
        
        suggestion = "Primary solution is to offload traffic."
        if best_nbr:
            suggestion += f" Recommend prioritizing offload to neighbor '{best_nbr}' (Suitability Score: {best_score:.2f})."
        recommendations.append({'action': 'offload_traffic', 'suggestion': suggestion})

    # Rule 2: Standalone high power issue
    elif 'Avg_Ref_Signal_Power' in top_feature_names:
        recommendations.append({
            'action': 'change_RS_power', 
            'suggestion': 'High Reference Signal Power is a key factor. Suggest a power reduction on underlying cells to reduce potential interference.'
        })

    print("\nSpecific Recommendations for Engineer to Investigate:")
    if recommendations:
        recommendations.sort(key=lambda x: {'change_ET': 1, 'change_RS_power': 2, 'offload_traffic': 3}.get(x['action'], 99))
        for i, rec in enumerate(recommendations):
            print(f"  {i+1}. Action Type: {rec['action']}\n     Suggestion: {rec['suggestion']}")
    else:
        print("  - No specific rule triggered. Please perform a general review of the sector's neighbors and configuration.")
    print("="*35)
    
# --- 5. Run the System for TOP 5 High-Risk Sectors ---
print("\n--- Running Full Recommendation System ---")
predictions_proba = model_xgb.predict_proba(X_xgb)[:, 1]
df_final_features['prediction_proba'] = predictions_proba
high_risk_sectors = df_final_features[df_final_features['prediction_proba'] > 0.5].sort_values(by='prediction_proba', ascending=False)

if not high_risk_sectors.empty:
    print(f"\nFound {len(high_risk_sectors)} sectors with high congestion risk. Generating recommendations for the top 5.")
    for index, sector_row in high_risk_sectors.head(5).iterrows():
        target_sector_id = sector_row['Canonical_Sector_ID']
        target_shap_values = shap_values[index]
        generate_recommendations(target_sector_id, sector_row.to_frame().T, target_shap_values, X_xgb.columns, df_nbr_study)
else:
    print("\nNo sectors identified as high risk.")

Using device: cpu
--- Loading V1.3 Graph Components ---
All necessary V1.3 component files loaded successfully.

--- Preparing data, training models, and setting up SHAP... ---
Data preparation for XGBoost is complete.

--- Training XGBoost model for prediction and diagnosis ---
XGBoost model training complete.
SHAP values calculated.

--- Running Full Recommendation System ---

Found 170 sectors with high congestion risk. Generating recommendations for the top 5.

RECOMMENDATION DOSSIER FOR: LH8395XC


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


ValueError: 2

In [2]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import re # Import the regular expression library

# --- 1. Setup and Load Data ---
data_dir = '../Data'

try:
    print("--- Loading Pre-processed Data ---")
    # This assumes the file from the final feature engineering step exists.
    # If not, you would need to regenerate df_final_features here.
    df_final_features = pd.read_csv(os.path.join(data_dir, 'final_feature_engineered_data_for_model.csv'))
    df_nbr_study = pd.read_csv(os.path.join(data_dir, 'std_Sectors_with_Target_NBR_Cells.csv'))
    print("All necessary files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required file was not found. Please ensure all previous steps ran successfully. Details: {e}")
    exit()

# --- 2. Train a Predictive Model (XGBoost) - Corrected ---
print("\n--- Training Predictive Model ---")
y_xgb = (df_final_features['UDCLI'] > 1.0).astype(int)
X_xgb = df_final_features.drop(columns=['Canonical_Sector_ID', 'time', 'element', 'UDCLI', 'Target_Overutilized'], errors='ignore')
# Ensure only numeric columns are used
X_xgb = X_xgb.select_dtypes(include=np.number).fillna(0)

# --- FIX: Clean column names for XGBoost compatibility ---
print("Cleaning column names for XGBoost...")
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_xgb.columns = [regex.sub("_", str(col)) for col in X_xgb.columns]
print("Column names cleaned.")
# --- END OF FIX ---

model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb) # This line should now work without error
print("XGBoost model training complete.")


# --- 3. Define the Phase 1 Recommendation Function ---
def generate_offload_ranking(sector_id, nbr_data_df):
    """
    Finds and ranks suitable neighbor sectors for offloading.
    """
    print("\n" + "="*35)
    print(f"PHASE 1 RECOMMENDATION FOR: {sector_id}")
    print("="*35)
    
    sector_nbr_info = nbr_data_df[nbr_data_df['Canonical_Sector_ID'] == sector_id]
    
    if sector_nbr_info.empty:
        print("No neighbor information found for this sector in the NBR study data.")
        return

    ranked_neighbors = []
    for i in range(1, 10):
        target_col = f'Targets NBR{i}'
        score_col = f'Suitability Score NBR{i}'
        udcli_col = f'UDCLIs NBR{i}'
        
        if target_col in sector_nbr_info.columns and pd.notna(sector_nbr_info[target_col].iloc[0]):
            neighbor_name = sector_nbr_info[target_col].iloc[0]
            suitability_score = sector_nbr_info[score_col].iloc[0] if score_col in sector_nbr_info.columns else 'N/A'
            neighbor_udcli = sector_nbr_info[udcli_col].iloc[0] if udcli_col in sector_nbr_info.columns else 'N/A'
            
            justification = "Good balance of suitability and load."
            if isinstance(neighbor_udcli, (int, float)) and neighbor_udcli > 0.8:
                justification = "High suitability but neighbor load is also high."
            if isinstance(suitability_score, (int, float)) and suitability_score < 10:
                justification = "Low suitability score; use as a lower priority option."

            ranked_neighbors.append({
                'Recommended Neighbor': neighbor_name,
                'Suitability Score': suitability_score,
                'Neighbor UDCLI': neighbor_udcli,
                'Justification': justification
            })

    if not ranked_neighbors:
        print("No suitable offload candidates found for this sector.")
        return

    df_ranked = pd.DataFrame(ranked_neighbors)
    df_ranked = df_ranked.sort_values(by=['Suitability Score', 'Neighbor UDCLI'], ascending=[False, True]).reset_index(drop=True)
    df_ranked.index += 1
    
    print("Ranked Offload Candidates:")
    print(df_ranked)


# --- 4. Run the Phase 1 System ---
print("\n--- Running Phase 1 Recommendation System ---")
predictions = model_xgb.predict(X_xgb)
df_final_features['prediction'] = predictions
high_risk_sectors = df_final_features[df_final_features['prediction'] == 1]

if not high_risk_sectors.empty:
    print(f"\nFound {len(high_risk_sectors)} sectors predicted as high risk. Generating offload rankings for the top 3.")
    for index, sector_row in high_risk_sectors.head(3).iterrows():
        target_sector_id = sector_row['Canonical_Sector_ID']
        generate_offload_ranking(target_sector_id, df_nbr_study)
else:
    print("\nNo sectors identified as high risk.")

--- Loading Pre-processed Data ---
All necessary files loaded successfully.

--- Training Predictive Model ---
Cleaning column names for XGBoost...
Column names cleaned.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost model training complete.

--- Running Phase 1 Recommendation System ---

Found 58498 sectors predicted as high risk. Generating offload rankings for the top 3.

PHASE 1 RECOMMENDATION FOR: LH1000XA
Ranked Offload Candidates:
  Recommended Neighbor  Suitability Score  Neighbor UDCLI  \
1             LH1731XA              21.25            0.52   
2             LH1506XC              17.23            1.51   
3             LH1815XC              17.19            0.88   
4             LT4307XC              15.59            1.70   
5             LH1506XD               5.58            2.37   

                                       Justification  
1              Good balance of suitability and load.  
2   High suitability but neighbor load is also high.  
3   High suitability but neighbor load is also high.  
4   High suitability but neighbor load is also high.  
5  Low suitability score; use as a lower priority...  

PHASE 1 RECOMMENDATION FOR: LH1000XA
Ranked Offload Candidates:
  Rec

In [3]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import re

# --- 1. Setup and Load Data ---
data_dir = '../Data'

try:
    print("--- Loading Pre-processed Data ---")
    # The NBR study file is the primary source for this snapshot-based recommendation
    df_nbr_study = pd.read_csv(os.path.join(data_dir, 'std_Sectors_with_Target_NBR_Cells.csv'))
    print("All necessary files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required file was not found. Details: {e}")
    exit()

# --- 2. Prepare Data and Train a Predictive Model ---
print("\n--- Preparing Data and Training Predictive Model ---")

# The feature set is the NBR study data itself. Drop non-unique sectors.
df_features = df_nbr_study.drop_duplicates(subset=['Canonical_Sector_ID']).copy()
df_features.set_index('Canonical_Sector_ID', inplace=True)

y_xgb = (df_features['UDCLI'] > 1.0).astype(int)
# Select only numeric features for the model, dropping identifiers and labels
X_xgb = df_features.select_dtypes(include=np.number).fillna(0)
# Ensure UDCLI is not in the feature set
if 'UDCLI' in X_xgb.columns:
    X_xgb = X_xgb.drop(columns=['UDCLI'])

# Clean column names for XGBoost compatibility
print("Cleaning column names for XGBoost...")
regex = re.compile(r"\[|\]|<|,|%", re.IGNORECASE) # Added comma and % to regex
X_xgb.columns = [regex.sub("_", str(col)) for col in X_xgb.columns]
print("Column names cleaned.")

model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb)
print("XGBoost model training complete.")


# --- 3. Define the Recommendation Function ---
def generate_offload_ranking(sector_id, nbr_data_df):
    """
    Finds and ranks suitable neighbor sectors for offloading and RETURNS a DataFrame.
    """
    sector_nbr_info = nbr_data_df[nbr_data_df['Canonical_Sector_ID'] == sector_id]
    
    if sector_nbr_info.empty:
        return pd.DataFrame()

    ranked_neighbors = []
    for i in range(1, 10):
        target_col, score_col, udcli_col = f'Targets NBR{i}', f'Suitability Score NBR{i}', f'UDCLIs NBR{i}'
        
        if target_col in sector_nbr_info.columns and pd.notna(sector_nbr_info[target_col].iloc[0]):
            justification = "Good balance of suitability and load."
            neighbor_udcli = sector_nbr_info[udcli_col].iloc[0] if udcli_col in sector_nbr_info.columns else np.nan
            suitability_score = sector_nbr_info[score_col].iloc[0] if score_col in sector_nbr_info.columns else np.nan

            if isinstance(neighbor_udcli, (int, float)) and neighbor_udcli > 0.8:
                justification = "High suitability but neighbor load is also high."
            if isinstance(suitability_score, (int, float)) and suitability_score < 10:
                justification = "Low suitability score; use as a lower priority option."

            ranked_neighbors.append({
                'Recommended Neighbor': sector_nbr_info[target_col].iloc[0],
                'Suitability Score': suitability_score,
                'Neighbor UDCLI': neighbor_udcli,
                'Justification': justification
            })

    if not ranked_neighbors:
        return pd.DataFrame()

    df_ranked = pd.DataFrame(ranked_neighbors).sort_values(by=['Suitability Score', 'Neighbor UDCLI'], ascending=[False, True]).reset_index(drop=True)
    df_ranked['Rank'] = df_ranked.index + 1
    
    return df_ranked[['Rank', 'Recommended Neighbor', 'Suitability Score', 'Neighbor UDCLI', 'Justification']]


# --- 4. Run the System for ALL High-Risk Sectors and Save Report ---
print("\n--- Running Phase 1 Recommendation System ---")
predictions = model_xgb.predict(X_xgb)
X_xgb['prediction'] = predictions
high_risk_sectors = X_xgb[X_xgb['prediction'] == 1]

all_recommendations = []

if not high_risk_sectors.empty:
    print(f"\nFound {len(high_risk_sectors)} unique sectors predicted as high risk. Generating offload rankings for all of them...")
    
    for sector_id in high_risk_sectors.index:
        recommendation_df = generate_offload_ranking(sector_id, df_nbr_study)
        
        if not recommendation_df.empty:
            recommendation_df['Congested_Source_Sector'] = sector_id
            all_recommendations.append(recommendation_df)
    
    if all_recommendations:
        final_report_df = pd.concat(all_recommendations, ignore_index=True)
        
        # Reorder columns for the final report
        cols = final_report_df.columns.tolist()
        cols = [cols[-1]] + cols[:-1]
        final_report_df = final_report_df[cols]
        
        # Save the consolidated report to an Excel file
        output_filename = "All_Sector_Offload_Recommendations.xlsx"
        final_report_df.to_excel(output_filename, index=False)
        print(f"\nSUCCESS: Complete recommendation report saved to '{output_filename}'")
    else:
        print("\nProcessing complete, but no valid recommendations could be generated.")
else:
    print("\nNo sectors identified as high risk.")

--- Loading Pre-processed Data ---
All necessary files loaded successfully.

--- Preparing Data and Training Predictive Model ---
Cleaning column names for XGBoost...
Column names cleaned.
XGBoost model training complete.

--- Running Phase 1 Recommendation System ---

Found 170 unique sectors predicted as high risk. Generating offload rankings for all of them...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



SUCCESS: Complete recommendation report saved to 'All_Sector_Offload_Recommendations.xlsx'


In [4]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import re
import networkx as nx
import matplotlib.pyplot as plt

# --- 1. Setup and Load Data ---
data_dir = '../Data'
# Create a directory to save the output images
output_viz_dir = os.path.join(data_dir, 'sector_visualizations')
os.makedirs(output_viz_dir, exist_ok=True)

try:
    print("--- Loading Pre-processed Data ---")
    # This assumes the file from the final feature engineering step exists.
    df_final_features = pd.read_csv(os.path.join(data_dir, 'final_feature_engineered_data_for_model.csv'))
    # Load the edge list to build the graph
    df_sector_edges = pd.read_csv(os.path.join(data_dir, 'graph_sector_edge_list_V1.3.csv'))
    print("All necessary files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required file was not found. Please ensure all previous steps ran successfully. Details: {e}")
    exit()

# --- 2. Identify Problematic Sectors ---
print("\n--- Identifying Problematic Sectors ---")
# This part simulates having a trained model to get predictions
y_xgb = (df_final_features['UDCLI'] > 1.0).astype(int)
X_xgb = df_final_features.drop(columns=['Canonical_Sector_ID', 'time', 'element', 'UDCLI', 'Target_Overutilized'], errors='ignore')
X_xgb = X_xgb.select_dtypes(include=np.number).fillna(0)
regex = re.compile(r"\[|\]|<|,|%", re.IGNORECASE)
X_xgb.columns = [regex.sub("_", str(col)) for col in X_xgb.columns]
model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb)

predictions = model_xgb.predict(X_xgb)
df_final_features['prediction'] = predictions
high_risk_sectors_df = df_final_features[df_final_features['prediction'] == 1]
problematic_sectors = high_risk_sectors_df['Canonical_Sector_ID'].unique().tolist()
print(f"Found {len(problematic_sectors)} unique problematic sectors to visualize.")


# --- 3. Create the Full Sector Graph ---
print("\n--- Building the full sector graph from edge list ---")
G_sector = nx.from_pandas_edgelist(
    df_sector_edges,
    source='source',
    target='target',
    edge_attr=True, 
    create_using=nx.DiGraph()
)
print(f"Graph created with {G_sector.number_of_nodes()} nodes and {G_sector.number_of_edges()} edges.")


# --- 4. Loop, Generate, and Save Visualizations ---
print(f"\n--- Generating and exporting visualizations to '{output_viz_dir}' directory ---")

# Limit to top 10 for this example to avoid creating too many files at once
# To run for all, remove .head(10) from the loop
# for sector_id in problematic_sectors:
for sector_id in problematic_sectors[:10]:
    
    print(f"Generating plot for: {sector_id}")
    
    # Check if the node exists in the graph
    if not G_sector.has_node(sector_id):
        print(f"  - Warning: Sector '{sector_id}' not found in the graph. Skipping.")
        continue

    # Create a list of the source node and its direct successors (neighbors)
    successors = list(G_sector.successors(sector_id))
    
    if not successors:
        print(f"  - Info: Sector '{sector_id}' has no defined outgoing edges in the graph. Skipping.")
        continue
        
    neighborhood_nodes = [sector_id] + successors
    G_sample = G_sector.subgraph(neighborhood_nodes)

    # Create the plot
    fig, ax = plt.subplots(figsize=(14, 10))
    ax.set_title(f"Offload Candidates for Problematic Sector: {sector_id}", fontsize=16)
    
    pos = nx.spring_layout(G_sample, seed=42, k=1.5)
    
    # Define node colors and sizes
    node_colors = ['red' if node == sector_id else 'skyblue' for node in G_sample.nodes()]
    node_sizes = [4000 if node == sector_id else 2500 for node in G_sample.nodes()]
    
    # Draw the graph components
    nx.draw_networkx_nodes(G_sample, pos, ax=ax, node_size=node_sizes, node_color=node_colors)
    # The `DiGraph` and `arrowstyle` automatically create the "flash" from source to target
    nx.draw_networkx_edges(G_sample, pos, ax=ax, arrowstyle='->', arrowsize=25, 
                           connectionstyle='arc3,rad=0.1', width=1.5, node_size=4000)
    nx.draw_networkx_labels(G_sample, pos, ax=ax, font_size=10, font_weight='bold')
    
    # Get edge labels for suitability score if available
    edge_labels = nx.get_edge_attributes(G_sample, 'feat_suitability')
    if edge_labels:
        for key, val in edge_labels.items():
            edge_labels[key] = f"Suitability:\n{val:.2f}" if pd.notna(val) else "N/A"
        nx.draw_networkx_edge_labels(G_sample, pos, ax=ax, edge_labels=edge_labels, font_color='green')

    # Save the figure to a file
    output_path = os.path.join(output_viz_dir, f"{sector_id}_neighborhood.png")
    plt.savefig(output_path, dpi=100, bbox_inches='tight')
    plt.close(fig) # Close the figure to free up memory

print("\n--- Visualization export complete. ---")

--- Loading Pre-processed Data ---
All necessary files loaded successfully.

--- Identifying Problematic Sectors ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Found 170 unique problematic sectors to visualize.

--- Building the full sector graph from edge list ---
Graph created with 942 nodes and 3446 edges.

--- Generating and exporting visualizations to '../Data\sector_visualizations' directory ---
Generating plot for: LH1000XA
Generating plot for: LH1000XC
Generating plot for: LH1000XD
Generating plot for: LH1001XB
Generating plot for: LH1002XE
Generating plot for: LH1002XG
Generating plot for: LH1007XB
Generating plot for: LH1012XB
Generating plot for: LH1012XC
Generating plot for: LH1024XA

--- Visualization export complete. ---


In [8]:
pip install xlsxwriter

Collecting xlsxwriter
  Using cached XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Using cached XlsxWriter-3.2.3-py3-none-any.whl (169 kB)
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import re
import networkx as nx
import matplotlib.pyplot as plt
from io import BytesIO

# --- 1. Setup and Load Data ---
data_dir = '../Data'
output_version = "V1.3"

# Create a directory for the final report
output_report_dir = os.path.join(data_dir, 'final_report')
os.makedirs(output_report_dir, exist_ok=True)

try:
    print("--- Loading V1.3 Graph Components and Supporting Files ---")
    df_sector_nodes = pd.read_csv(os.path.join(data_dir, f"graph_sector_node_features_{output_version}.csv"))
    df_sector_edges = pd.read_csv(os.path.join(data_dir, f"graph_sector_edge_list_{output_version}.csv"))
    df_cell_nodes = pd.read_csv(os.path.join(data_dir, f"graph_cell_node_features_{output_version}.csv"))
    df_hierarchy = pd.read_csv(os.path.join(data_dir, f"graph_hierarchy_map_{output_version}.csv"))
    print("All necessary files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required V1.3 graph component file was not found. Please ensure all previous steps ran successfully. Details: {e}")
    exit()

# --- 2. Identify Problematic Sectors (Condensed) ---
print("\n--- Identifying Problematic Sectors ---")
# This simulates having a trained model to get predictions.
# In a real run, you'd load your trained model.
df_features_for_pred = df_sector_nodes.drop_duplicates(subset=['Canonical_Sector_ID']).copy()
y_xgb = (df_features_for_pred['UDCLI'] > 1.0).astype(int)
X_xgb = df_features_for_pred.select_dtypes(include=np.number).drop(columns=['UDCLI', 'node_idx'], errors='ignore').fillna(0)
regex = re.compile(r"\[|\]|<|,|%", re.IGNORECASE)
X_xgb.columns = [regex.sub("_", str(col)) for col in X_xgb.columns]
model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb)
predictions = model_xgb.predict(X_xgb)
df_features_for_pred['prediction'] = predictions
high_risk_sectors = df_features_for_pred[df_features_for_pred['prediction'] == 1]
problematic_sectors_list = high_risk_sectors['Canonical_Sector_ID'].unique().tolist()
print(f"Found {len(problematic_sectors_list)} unique problematic sectors.")

# --- 3. Helper Functions ---
def generate_offload_ranking_table(sector_id, nbr_data_df):
    """Creates and returns a ranked DataFrame of neighbor recommendations."""
    # This function is the same as the one from the previous script
    # It finds neighbors and ranks them by Suitability Score and UDCLI.
    # For brevity, we'll assume it returns a formatted DataFrame.
    # In the full code, the logic from the previous answer would be here.
    # This is a simplified placeholder for the example.
    sector_info = nbr_data_df[nbr_data_df['Canonical_Sector_ID'] == sector_id]
    if sector_info.empty: return pd.DataFrame()
    recs = []
    for i in range(1, 10):
        if f'Targets NBR{i}' in sector_info.columns and pd.notna(sector_info[f'Targets NBR{i}'].iloc[0]):
            recs.append({
                'Neighbor': sector_info[f'Targets NBR{i}'].iloc[0],
                'Suitability': sector_info[f'Suitability Score NBR{i}'].iloc[0],
                'Distance (km)': sector_info[f'Distances NBR{i}'].iloc[0]
            })
    if not recs: return pd.DataFrame()
    df_ranked = pd.DataFrame(recs).sort_values(by='Suitability', ascending=False).reset_index(drop=True)
    df_ranked['Rank'] = df_ranked.index + 1
    return df_ranked[['Rank', 'Neighbor', 'Suitability', 'Distance (km)']]

def create_neighborhood_visualization(sector_id, G_sector, df_hierarchy, df_cell_nodes):
    """Creates and returns a matplotlib figure object for a sector's neighborhood."""
    
    # Create subgraph for visualization
    if not G_sector.has_node(sector_id): return None
    successors = list(G_sector.successors(sector_id))
    if not successors: return None
    neighborhood_nodes = [sector_id] + successors
    G_sample = G_sector.subgraph(neighborhood_nodes)
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(15, 12))
    ax.set_title(f"Offload Candidates for Problematic Sector: {sector_id}", fontsize=16)
    pos = nx.spring_layout(G_sample, seed=42, k=2.0)
    
    # Draw graph components
    node_colors = ['red' if node == sector_id else 'skyblue' for node in G_sample.nodes()]
    nx.draw_networkx_nodes(G_sample, pos, ax=ax, node_size=3500, node_color=node_colors)
    nx.draw_networkx_edges(G_sample, pos, ax=ax, arrowstyle='->', arrowsize=25, connectionstyle='arc3,rad=0.1', width=1.5, node_size=3500)
    nx.draw_networkx_labels(G_sample, pos, ax=ax, font_size=10, font_weight='bold')
    
    # Add enhanced edge labels (Suitability & Distance)
    edge_labels = {}
    for u, v, data in G_sample.edges(data=True):
        score = data.get('feat_suitability', 'N/A')
        dist = data.get('feat_distance', 'N/A')
        score_str = f"{score:.2f}" if isinstance(score, (int, float)) else "N/A"
        dist_str = f"{dist:.2f}km" if isinstance(dist, (int, float)) else "N/A"
        edge_labels[(u, v)] = f"Suitability: {score_str}\nDist: {dist_str}"
    nx.draw_networkx_edge_labels(G_sample, pos, ax=ax, edge_labels=edge_labels, font_color='green', bbox=dict(facecolor='white', alpha=0.5, edgecolor='none'))

    # --- Add table of underlying cell data to the plot ---
    cells_in_sector = df_hierarchy[df_hierarchy['Canonical_Sector_ID'] == sector_id]['Canonical_Cell_ID'].tolist()
    cell_data = df_cell_nodes[df_cell_nodes['Canonical_Cell_ID'].isin(cells_in_sector)]
    
    table_data = []
    power_col = 'Reference signal power(0.1dBm)'
    if power_col in cell_data.columns:
        table_data = cell_data[['Canonical_Cell_ID', power_col]].copy()
        table_data.rename(columns={'Canonical_Cell_ID': 'Cell Name', power_col: 'Ref. Power'}, inplace=True)
        table_data['ET'] = 'N/A' # Placeholder as per discussion
        
        if not table_data.empty:
            cell_table = ax.table(cellText=table_data.values, colLabels=table_data.columns, loc='bottom', cellLoc='center')
            cell_table.auto_set_font_size(False)
            cell_table.set_fontsize(10)
            cell_table.scale(1, 1.5)
            # Adjust main plot area to make space for the table
            plt.subplots_adjust(left=0.1, bottom=0.2)

    return fig

# --- 4. Main Loop to Generate and Save Excel Report ---
output_excel_path = os.path.join(output_report_dir, 'Full_Sector_Recommendation_Report.xlsx')
with pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    print(f"\n--- Generating Full Report for {len(problematic_sectors_list)} sectors ---")
    
    # Create the full sector graph object once
    G_sector = nx.from_pandas_edgelist(df_sector_edges, 'source', 'target', edge_attr=True, create_using=nx.DiGraph())

    for sector_id in problematic_sectors_list:
        print(f"Processing sector: {sector_id}...")
        
        # Make sheet name valid by truncating if too long
        sheet_name = sector_id[:31]
        
        # --- Get recommendation table ---
        df_rec_table = generate_offload_ranking_table(sector_id, df_nbr_study)
        if df_rec_table.empty:
            df_rec_table = pd.DataFrame([{"Status": "No suitable offload candidates found."}])
        
        df_rec_table.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1)
        
        # Get worksheet object to format and insert image
        worksheet = writer.sheets[sheet_name]
        worksheet.write_string(0, 0, f"Recommendation Report for Sector: {sector_id}")
        
        # --- Get visualization ---
        fig = create_neighborhood_visualization(sector_id, G_sector, df_hierarchy, df_cell_nodes)
        
        if fig is not None:
            # Save the plot to an in-memory buffer
            img_buffer = BytesIO()
            fig.savefig(img_buffer, format='png', bbox_inches='tight')
            img_buffer.seek(0)
            
            # Insert the image into the Excel sheet below the table
            # The position depends on the length of the table
            image_start_row = len(df_rec_table) + 4
            worksheet.insert_image(f'A{image_start_row}', 'plot.png', {'image_data': img_buffer})
            
            plt.close(fig) # Close figure to free memory

print(f"\nSUCCESS: Complete recommendation report saved to '{output_excel_path}'")

--- Loading V1.3 Graph Components and Supporting Files ---
All necessary files loaded successfully.

--- Identifying Problematic Sectors ---
Found 170 unique problematic sectors.

--- Generating Full Report for 170 sectors ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Processing sector: LH1000XA...
Processing sector: LH1000XC...
Processing sector: LH1000XD...
Processing sector: LH1001XB...
Processing sector: LH1002XE...
Processing sector: LH1002XG...
Processing sector: LH1007XB...
Processing sector: LH1012XB...
Processing sector: LH1012XC...
Processing sector: LH1024XA...
Processing sector: LH1143XA...
Processing sector: LH1143XC...
Processing sector: LH1197XB...
Processing sector: LH1200XC...
Processing sector: LH1210XA...
Processing sector: LH1210XE...
Processing sector: LH1228XA...
Processing sector: LH1228XC...
Processing sector: LH1271XC...
Processing sector: LH1375XB...
Processing sector: LH1383XD...
Processing sector: LH1393XA...
Processing sector: LH1454XB...
Processing sector: LH1454XC...
Processing sector: LH1457XC...
Processing sector: LH1458XC...
Processing sector: LH1460XB...
Processing sector: LH1503XB...
Processing sector: LH1503XC...
Processing sector: LH1504XB...
Processing sector: LH1504XC...
Processing sector: LH1505XA...
Processi

In [15]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import re
import networkx as nx
import matplotlib.pyplot as plt
from io import BytesIO

# --- 1. Setup and Load Data ---
data_dir = '../Data'
# This is the final version incorporating all your feedback
output_version = "V1.4" 
report_dir = os.path.join(data_dir, 'final_report')
os.makedirs(report_dir, exist_ok=True)

try:
    print("--- Loading All Necessary Data Files ---")
    df_nbr_study = pd.read_csv(os.path.join(data_dir, 'std_Sectors_with_Target_NBR_Cells.csv'))
    df_sector_nodes = pd.read_csv(os.path.join(data_dir, f"graph_sector_node_features_V1.3.csv"))
    df_cell_nodes_base = pd.read_csv(os.path.join(data_dir, f"graph_cell_node_features_V1.3.csv"))
    df_cell_edges = pd.read_csv(os.path.join(data_dir, f"graph_cell_edge_list_V1.3.csv"))
    df_hierarchy = pd.read_csv(os.path.join(data_dir, f"graph_hierarchy_map_V1.3.csv"))
    df_et = pd.read_csv(os.path.join(data_dir, 'ET Export.csv'))
    print("All necessary files loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: A required file was not found. Please ensure all V1.3 components and ET Export exist. Details: {e}")
    exit()

# --- 2. Feature Enhancement (ET and Qoffset) ---
print("\n--- Enhancing Features for Report ---")

# --- MODIFICATION: Add ET data to Cell Node Features ---
df_et.columns = df_et.columns.str.strip()
df_et['Canonical_Cell_ID'] = df_et['Cell Name'].astype(str).str.strip().str.upper()
df_cell_nodes = pd.merge(df_cell_nodes_base, df_et[['Canonical_Cell_ID', 'ET']], on='Canonical_Cell_ID', how='left')
print("Merged ET data into cell node features.")

# --- MODIFICATION: Calculate Average Qoffset between Sectors ---
df_edges_with_sectors = pd.merge(df_cell_edges, df_hierarchy, left_on='source', right_on='Canonical_Cell_ID', how='left').rename(columns={'Canonical_Sector_ID': 'source_sector'})
df_edges_with_sectors = pd.merge(df_edges_with_sectors, df_hierarchy, left_on='target', right_on='Canonical_Cell_ID', how='left').rename(columns={'Canonical_Sector_ID': 'target_sector'})
avg_qoffset_per_sector_pair = df_edges_with_sectors.groupby(['source_sector', 'target_sector'])['feat_qoffset'].mean().reset_index().rename(columns={'feat_qoffset': 'Avg_Qoffset'})
print("Calculated average Qoffset for sector-to-sector relationships.")

# --- 3. Identify Problematic Sectors (Condensed) ---
print("\n--- Identifying Problematic Sectors ---")
df_features_for_pred = df_sector_nodes.drop_duplicates(subset=['Canonical_Sector_ID']).copy()
df_features_for_pred.set_index('Canonical_Sector_ID', inplace=True)
y_xgb = (df_features_for_pred['UDCLI'] > 1.0).astype(int)
X_xgb = df_features_for_pred.select_dtypes(include=np.number).fillna(0)
if 'UDCLI' in X_xgb.columns: X_xgb = X_xgb.drop(columns=['UDCLI'])
if 'node_idx' in X_xgb.columns: X_xgb = X_xgb.drop(columns=['node_idx'])
model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model_xgb.fit(X_xgb, y_xgb)
predictions = model_xgb.predict(X_xgb)
high_risk_sectors = X_xgb[predictions == 1]
problematic_sectors_list = high_risk_sectors.index.unique().tolist()
print(f"Found {len(problematic_sectors_list)} unique problematic sectors.")


# --- 4. Define Recommendation and Visualization Functions (Revised) ---
def generate_offload_ranking_table(sector_id, nbr_data_df, qoffset_df):
    """Creates a ranked DataFrame of neighbor recommendations including Justification and Qoffset."""
    sector_nbr_info = nbr_data_df[nbr_data_df['Canonical_Sector_ID'] == sector_id]
    if sector_nbr_info.empty: return pd.DataFrame()
    
    ranked_neighbors = []
    for i in range(1, 10):
        target_col, score_col, udcli_col, util_col = f'Targets NBR{i}', f'Suitability Score NBR{i}', f'UDCLIs NBR{i}', f'Utilizations NBR{i}'
        
        if target_col in sector_nbr_info.columns and pd.notna(sector_nbr_info[target_col].iloc[0]):
            neighbor_name = sector_nbr_info[target_col].iloc[0]
            # --- MODIFICATION: Look up the calculated average Qoffset ---
            qoffset_val = qoffset_df[(qoffset_df['source_sector'] == sector_id) & (qoffset_df['target_sector'] == neighbor_name)]['Avg_Qoffset']
            avg_qoffset = qoffset_val.iloc[0] if not qoffset_val.empty else np.nan

            justification = "Good candidate: High suitability and low load/utilization."
            neighbor_util = sector_nbr_info[util_col].iloc[0] if util_col in sector_nbr_info.columns else np.nan
            if isinstance(neighbor_util, (int, float)) and neighbor_util >= 75.0:
                justification = "Not ideal: Neighbor Utilization is high (>= 75%)."
            
            ranked_neighbors.append({
                'Neighbor': neighbor_name,
                'Suitability': sector_nbr_info[score_col].iloc[0],
                'Dist (km)': sector_nbr_info[f'Distances NBR{i}'].iloc[0],
                'Nbr Util': neighbor_util,
                'Nbr UDCLI': sector_nbr_info[udcli_col].iloc[0],
                'Avg Qoffset': avg_qoffset,
                'Justification': justification 
            })

    if not ranked_neighbors: return pd.DataFrame()
    df_ranked = pd.DataFrame(ranked_neighbors).sort_values(by='Suitability', ascending=False).reset_index(drop=True)
    df_ranked['Rank'] = df_ranked.index + 1
    # --- MODIFICATION: Ensure all requested columns are returned ---
    return df_ranked[['Rank', 'Neighbor', 'Suitability', 'Dist (km)', 'Nbr Util', 'Nbr UDCLI', 'Avg Qoffset', 'Justification']]

def create_neighborhood_visualization(sector_id, G_sector, df_hierarchy, df_cell_nodes_with_et, qoffset_df):
    """Creates a matplotlib figure object with an embedded data table including ET."""
    if not G_sector.has_node(sector_id): return None
    successors = list(G_sector.successors(sector_id))
    if not successors: return None
    G_sample = G_sector.subgraph([sector_id] + successors)
    
    fig, ax = plt.subplots(figsize=(15, 12))
    ax.set_title(f"Offload Candidates for Problematic Sector: {sector_id}", fontsize=16)
    pos = nx.spring_layout(G_sample, seed=42, k=2.0)
    
    node_colors = ['red' if node == sector_id else 'skyblue' for node in G_sample.nodes()]
    nx.draw_networkx_nodes(G_sample, pos, ax=ax, node_size=3500, node_color=node_colors)
    nx.draw_networkx_edges(G_sample, pos, ax=ax, arrowstyle='->', arrowsize=25, connectionstyle='arc3,rad=0.1', width=1.5, node_size=3500)
    nx.draw_networkx_labels(G_sample, pos, ax=ax, font_size=10, font_weight='bold')
    
    # --- MODIFICATION: Add enhanced edge labels with Qoffset ---
    edge_labels = {}
    for u, v, data in G_sample.edges(data=True):
        score = data.get('feat_suitability', 'N/A')
        dist = data.get('feat_distance', 'N/A')
        qoffset_val = qoffset_df[(qoffset_df['source_sector'] == u) & (qoffset_df['target_sector'] == v)]['Avg_Qoffset']
        qoffset = qoffset_val.iloc[0] if not qoffset_val.empty else np.nan
        
        score_str = f"{score:.2f}" if isinstance(score, (int, float)) else "N/A"
        dist_str = f"{dist:.2f}km" if isinstance(dist, (int, float)) else "N/A"
        qoffset_str = f"{qoffset:.2f}" if isinstance(qoffset, (int, float)) else "N/A"
        
        edge_labels[(u, v)] = f"Suitability: {score_str}\nDist: {dist_str}\nAvg Qoffset: {qoffset_str}"
    nx.draw_networkx_edge_labels(G_sample, pos, ax=ax, edge_labels=edge_labels, font_color='green', bbox=dict(facecolor='white', alpha=0.5, edgecolor='none'))

    # --- MODIFICATION: Add table of underlying cell data with ET ---
    cells_in_sector = df_hierarchy[df_hierarchy['Canonical_Sector_ID'] == sector_id]['Canonical_Cell_ID'].tolist()
    cell_data = df_cell_nodes_with_et[df_cell_nodes_with_et['Canonical_Cell_ID'].isin(cells_in_sector)]
    
    
    table_data = []
    power_col = 'Reference signal power(0.1dBm)'
    if power_col in cell_data.columns:
        table_data = cell_data[['Canonical_Cell_ID', power_col]].copy()
        table_data.rename(columns={'Canonical_Cell_ID': 'Cell Name', power_col: 'Ref. Power'}, inplace=True)
        table_data['ET'] = 'N/A' # Placeholder as per discussion
        
        if not table_data.empty:
            cell_table = ax.table(cellText=table_data.values, colLabels=table_data.columns, loc='bottom', cellLoc='center')
            cell_table.auto_set_font_size(False)
            cell_table.set_fontsize(10)
            cell_table.scale(1, 1.5)
            # Adjust main plot area to make space for the table
            plt.subplots_adjust(left=0.1, bottom=0.2)

    return fig

# --- 5. Main Loop to Generate and Save Excel Report ---
output_excel_path = os.path.join(report_dir, f'Sector_Recommendation_Report_{output_version}.xlsx')
with pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    print(f"\n--- Generating Full Report for {len(problematic_sectors_list)} sectors ---")
    
    # Create Introduction Sheet with Hyperlinks
    print("Creating introduction sheet with hyperlinks...")
    df_index = pd.DataFrame({'Congested Sector ID': problematic_sectors_list})
    df_index.to_excel(writer, sheet_name='Introduction', index=False)
    workbook = writer.book
    worksheet_intro = writer.sheets['Introduction']
    url_format = workbook.add_format({'color': 'blue', 'underline': 1})
    for row_num, sector_id in enumerate(problematic_sectors_list, 1):
        sheet_name = sector_id[:31]
        worksheet_intro.write_url(f'A{row_num + 1}', f"internal:'{sheet_name}'!A1", string=sector_id, cell_format=url_format)

    # Create the full sector graph object once
    G_sector = nx.from_pandas_edgelist(df_sector_edges, 'source', 'target', edge_attr=True, create_using=nx.DiGraph()) if 'df_sector_edges' in locals() else nx.DiGraph()

    for sector_id in problematic_sectors_list:
        print(f"Processing sheet for sector: {sector_id}...")
        sheet_name = sector_id[:31]
        
        df_rec_table = generate_offload_ranking_table(sector_id, df_nbr_study, avg_qoffset_per_sector_pair)
        if df_rec_table.empty: df_rec_table = pd.DataFrame([{"Status": "No suitable offload candidates found."}])
        df_rec_table.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1)
        
        worksheet = writer.sheets[sheet_name]
        worksheet.write_string(0, 0, f"Recommendation Report for Sector: {sector_id}")
        
        # Pass the qoffset data to the visualization function to be used in edge labels
        fig = create_neighborhood_visualization(sector_id, G_sector, df_hierarchy, df_cell_nodes, avg_qoffset_per_sector_pair)
        if fig is not None:
            img_buffer = BytesIO()
            fig.savefig(img_buffer, format='png', bbox_inches='tight')
            img_buffer.seek(0)
            worksheet.insert_image(f'A{len(df_rec_table) + 4}', 'plot.png', {'image_data': img_buffer})
            plt.close(fig)

print(f"\nSUCCESS: Complete recommendation report saved to '{output_excel_path}'")

--- Loading All Necessary Data Files ---
All necessary files loaded successfully.

--- Enhancing Features for Report ---
Merged ET data into cell node features.
Calculated average Qoffset for sector-to-sector relationships.

--- Identifying Problematic Sectors ---
Found 170 unique problematic sectors.

--- Generating Full Report for 170 sectors ---
Creating introduction sheet with hyperlinks...
Processing sheet for sector: LH1000XA...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Processing sheet for sector: LH1000XC...
Processing sheet for sector: LH1000XD...
Processing sheet for sector: LH1001XB...
Processing sheet for sector: LH1002XE...
Processing sheet for sector: LH1002XG...
Processing sheet for sector: LH1007XB...
Processing sheet for sector: LH1012XB...
Processing sheet for sector: LH1012XC...
Processing sheet for sector: LH1024XA...
Processing sheet for sector: LH1143XA...
Processing sheet for sector: LH1143XC...
Processing sheet for sector: LH1197XB...
Processing sheet for sector: LH1200XC...
Processing sheet for sector: LH1210XA...
Processing sheet for sector: LH1210XE...
Processing sheet for sector: LH1228XA...
Processing sheet for sector: LH1228XC...
Processing sheet for sector: LH1271XC...
Processing sheet for sector: LH1375XB...
Processing sheet for sector: LH1383XD...
Processing sheet for sector: LH1393XA...
Processing sheet for sector: LH1454XB...
Processing sheet for sector: LH1454XC...
Processing sheet for sector: LH1457XC...
Processing sheet