<a href="https://colab.research.google.com/github/MehrdadJalali-KIT/BlackHole/blob/main/BlackHole_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount drive
from google.colab import drive
import os

drive.mount('/content/drive')
# Change working path
os.chdir('/content/drive/MyDrive/Research/MOF/Black_Hole')

Mounted at /content/drive


In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [6]:
import pandas as pd
import numpy as np
import networkx as nx
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense, Dropout, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import CategoricalCrossentropy
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.contingency_tables import mcnemar
from sklearn.metrics import confusion_matrix, cohen_kappa_score, matthews_corrcoef, roc_auc_score
import seaborn as sns
from sklearn.metrics import confusion_matrix
import random
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from tensorflow.keras.models import load_model
import warnings
from rdkit import RDLogger
from tensorflow.keras import models, layers, regularizers
from sklearn.metrics import accuracy_score
import time
from tensorflow.keras.callbacks import EarlyStopping

# Suppress specific deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Additionally, suppress RDKit warnings globally
RDLogger.DisableLog('rdApp.*')

def generate_fingerprint(smiles):
    """Generates a molecular fingerprint given a SMILES string."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros((1024,), dtype=float)  # Return an array of zeros if molecule can't be parsed
        return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024), dtype=float)
    except Exception as e:
        print(f"SMILES Parse Error: {e}")
        return np.zeros((1024,), dtype=float)  # Return an array of zeros in case of an error

def plot_confusion_matrix(y_true, y_pred, classes):
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate percentage accuracy for each element in the confusion matrix
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100

    # Combine counts and percentages for display
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_percentage[i, j]
            annot[i, j] = f'{c}\n({p:.1f}%)'  # Count and percentage

    # Plot the confusion matrix with annotations
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=annot, fmt='', cmap='Blues', xticklabels=classes, yticklabels=classes, cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

def label_encode_metal_names(metal_names):
    """Encodes metal names as integers."""
    metal_dict = {metal: idx for idx, metal in enumerate(np.unique(metal_names))}
    return np.array([metal_dict[metal] for metal in metal_names])

def preprocess_graph(graph, features):
    # Determine the dimensionality of the feature vectors
    feature_dimension = features.shape[1]

    # Convert the graph to an adjacency matrix
    adjacency_matrix = nx.adjacency_matrix(graph).toarray()

    # Initialize an empty list to store feature vectors
    feature_vectors = []

    # Create a mapping from node labels to integer indices
    node_to_index = {node: index for index, node in enumerate(graph.nodes())}

    # Iterate over nodes in the graph
    for node in graph.nodes():
        # Get the integer index corresponding to the node label
        node_index = node_to_index[node]
        # Check if the node index is valid
        if node_index < len(features):
            # Append the feature vector corresponding to the node index
            feature_vectors.append(features[node_index])
        else:
            # If the node index is out of range, assign a default feature vector
            feature_vectors.append(np.zeros((feature_dimension,)))

    # Convert the list of feature vectors to a numpy array
    feature_matrix = np.array(feature_vectors)

    return adjacency_matrix, feature_matrix

def build_gcn_model(input_shape_feature, input_shape_adjacency, num_classes):
    # Define input layers
    x_inp_feature = Input(shape=(input_shape_feature,))
    x_inp_adjacency = Input(shape=(input_shape_adjacency,))

    # Feature processing with multiple layers
    x_feature = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(x_inp_feature)
    x_feature = Dropout(0.5)(x_feature)
    x_feature = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x_feature)
    x_feature = Dropout(0.3)(x_feature)

    # Adjacency processing with multiple layers
    x_adjacency = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(x_inp_adjacency)
    x_adjacency = Dropout(0.5)(x_adjacency)
    x_adjacency = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x_adjacency)
    x_adjacency = Dropout(0.3)(x_adjacency)

    # Concatenate feature and adjacency outputs
    x = concatenate([x_feature, x_adjacency])

    # Output layer
    output = Dense(num_classes, activation='softmax')(x)

    # Create model
    model = Model(inputs=[x_inp_feature, x_inp_adjacency], outputs=output)

    # Using a smaller learning rate
    optimizer = Adam(learning_rate=0.0009)

    # Compile model
    model.compile(optimizer=optimizer, loss=CategoricalCrossentropy(), metrics=['accuracy'])

    return model

def build_feedforward_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Input(shape=(input_shape,)),
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

def train_gcn_model(model, adjacency_matrix, feature_matrix, labels, epochs, batch_size):
    if model is not None and adjacency_matrix is not None and feature_matrix is not None and labels is not None:
        # Early stopping to prevent overfitting
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        # ModelCheckpoint to save the best model
        model_checkpoint = ModelCheckpoint('best_gcn_model.keras', monitor='val_loss', save_best_only=True, verbose=1)

        start_time = time.time()
        # Train the model
        history = model.fit([feature_matrix, adjacency_matrix], labels,
                            epochs=epochs, batch_size=batch_size,
                            validation_split=0.2, callbacks=[early_stopping, model_checkpoint])
        end_time = time.time()

        # Calculate total training time
        total_training_time = end_time - start_time
        print(f"Total training time: {total_training_time:.2f} seconds")

        return history
    else:
        print("Error: One or more input arguments to train_gcn_model is None.")


from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    edge_list_filenames = [
        'sparsified_graph_edges_blackhole_0.1.csv',
        'sparsified_graph_edges_blackhole_0.2.csv',
        'sparsified_graph_edges_blackhole_0.3.csv',
        'sparsified_graph_edges_blackhole_0.4.csv',
        'sparsified_graph_edges_blackhole_0.5.csv',
        'sparsified_graph_edges_blackhole_0.6.csv',
        'sparsified_graph_edges_blackhole_0.7.csv',
        'sparsified_graph_edges_blackhole_0.8.csv',
        'sparsified_graph_edges_blackhole_0.9.csv'
    ]

    summary_data_filename = '1M1L3D_summary.csv'

    # Initialize lists to track accuracies and thresholds
    accuracies = []
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Loop through all edge list files
    for edges_list_filename, threshold in zip(edge_list_filenames, thresholds):
        print(f"Processing edge list file: {edges_list_filename}")

        # Load data
        edges_list = pd.read_csv(edges_list_filename, header=None, names=['source', 'target', 'weight'], delimiter=' ')
        summary_data = pd.read_csv(summary_data_filename)

        node_labels_source = edges_list['source'].astype(str).unique()
        node_labels_target = edges_list['target'].astype(str).unique()
        node_labels = np.unique(np.concatenate((node_labels_source, node_labels_target)))
        node_labels = list(set(node_labels))

        print("Unique node labels:", len(node_labels))

        summary_data_filtered = summary_data[summary_data['refcode'].isin(node_labels)]
        print("Filtered summary data:\n", len(summary_data_filtered))

        if not summary_data_filtered.empty:
            linker_smiles = summary_data_filtered['linker SMILES']
            if not linker_smiles.empty:
                # Generate features
                linker_features = np.stack(linker_smiles.dropna().apply(generate_fingerprint).values)
                metal_names = summary_data_filtered['metal']
                metal_features = label_encode_metal_names(metal_names).reshape(-1, 1)

                other_features = summary_data_filtered[['Largest Cavity Diameter', 'Largest Free Sphere']].values.astype('float32')
                features = np.concatenate((linker_features, metal_features, other_features), axis=1)

                # Generate labels
                summary_data_filtered['PLD_category'] = pd.cut(
                    summary_data_filtered['Pore Limiting Diameter'],
                    bins=[-np.inf, 2.4, 4.4, 5.9, np.inf],
                    labels=['nonporous', 'small pore', 'medium pore', 'large pore']
                )
                labels = pd.get_dummies(summary_data_filtered['PLD_category']).values

                # Split the data into training and testing sets
                X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=56)

                # Load the sparsified graph
                graph = nx.read_weighted_edgelist(edges_list_filename)

                # Preprocess the graph data
                adjacency_matrix, feature_matrix = preprocess_graph(graph, features)

                # Split the adjacency and feature matrices accordingly
                adj_train, adj_test, feat_train, feat_test = train_test_split(adjacency_matrix, feature_matrix, test_size=0.2, random_state=56)

                # Provide the number of classes
                num_classes = labels.shape[1]

                # Build the GCN model
                gcn_model = build_gcn_model(feat_train.shape[1], adj_train.shape[1], num_classes)

                # Train the GCN model
                history = train_gcn_model(gcn_model, adj_train, feat_train, y_train, epochs=40, batch_size=32)

                # Evaluate the model on the test set
                test_loss, test_accuracy = gcn_model.evaluate([feat_test, adj_test], y_test, verbose=0)
                print(f'Test Accuracy for threshold {threshold}: {test_accuracy}')

                # Track the accuracy
                accuracies.append(test_accuracy)

                # Continue with your evaluation metrics and comparison logic
                # ...
            else:
                print("Error: linker_smiles column is empty.")
        else:
            print("Error: summary_data_filtered DataFrame is empty.")

    # Plot the accuracy comparison
    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, accuracies, marker='o', color='b', label='Test Accuracy')
    plt.xlabel('Threshold')
    plt.ylabel('Accuracy')
    plt.title('GCN Test Accuracy Across Different Sparsification Thresholds')
    plt.grid(True)
    plt.legend()
    plt.show()


Processing edge list file: sparsified_graph_edges_blackhole_0.1.csv
Unique node labels: 8810
Filtered summary data:
 8810


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_data_filtered['PLD_category'] = pd.cut(


Epoch 1/40
[1m176/177[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 24ms/step - accuracy: 0.4562 - loss: 4.0593
Epoch 1: val_loss improved from inf to 1.38494, saving model to best_gcn_model.keras
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - accuracy: 0.4571 - loss: 4.0437 - val_accuracy: 0.6376 - val_loss: 1.3849
Epoch 2/40
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6494 - loss: 1.3133
Epoch 2: val_loss improved from 1.38494 to 1.01015, saving model to best_gcn_model.keras
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.6495 - loss: 1.3128 - val_accuracy: 0.6631 - val_loss: 1.0101
Epoch 3/40
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.6805 - loss: 1.0059
Epoch 3: val_loss improved from 1.01015 to 0.86158, saving model to best_gcn_model.keras
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms

FileNotFoundError: [Errno 2] No such file or directory: 'sparsified_graph_edges_blackhole_0.2.csv'

**Evaluation on Cintinius PLD**

In [10]:
import pandas as pd
import numpy as np
import networkx as nx
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Dropout, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from rdkit import RDLogger

# Suppress specific deprecation warnings and RDKit warnings globally
warnings.filterwarnings("ignore", category=DeprecationWarning)
RDLogger.DisableLog('rdApp.*')

def generate_fingerprint(smiles):
    """Generates a molecular fingerprint given a SMILES string."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros((1024,), dtype=float)  # Return an array of zeros if molecule can't be parsed
        return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024), dtype=float)
    except Exception as e:
        print(f"SMILES Parse Error: {e}")
        return np.zeros((1024,), dtype=float)  # Return an array of zeros in case of an error

def preprocess_graph(graph, features):
    """Preprocesses graph data into adjacency and feature matrices."""
    feature_dimension = features.shape[1]
    adjacency_matrix = nx.adjacency_matrix(graph).toarray()
    feature_vectors = []

    node_to_index = {node: index for index, node in enumerate(graph.nodes())}
    for node in graph.nodes():
        node_index = node_to_index[node]
        if node_index < len(features):
            feature_vectors.append(features[node_index])
        else:
            feature_vectors.append(np.zeros((feature_dimension,)))

    feature_matrix = np.array(feature_vectors)
    return adjacency_matrix, feature_matrix

def build_gcn_model(input_shape_feature, input_shape_adjacency):
    """Builds a Graph Convolutional Network model for regression."""
    x_inp_feature = Input(shape=(input_shape_feature,))
    x_inp_adjacency = Input(shape=(input_shape_adjacency,))

    x_feature = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(x_inp_feature)
    x_feature = Dropout(0.5)(x_feature)
    x_feature = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x_feature)
    x_feature = Dropout(0.5)(x_feature)

    x_adjacency = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(x_inp_adjacency)
    x_adjacency = Dropout(0.5)(x_adjacency)
    x_adjacency = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x_adjacency)
    x_adjacency = Dropout(0.5)(x_adjacency)

    x = concatenate([x_feature, x_adjacency])
    output = Dense(1)(x)  # Single output node for regression

    model = Model(inputs=[x_inp_feature, x_inp_adjacency], outputs=output)
    optimizer = Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer, loss=MeanSquaredError(), metrics=[RootMeanSquaredError()])
    return model

def train_gcn_model(model, adjacency_matrix, feature_matrix, labels, epochs, batch_size):
    """Trains the GCN model and returns the training history."""
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint('best_gcn_model.keras', monitor='val_loss', save_best_only=True, verbose=1)

    history = model.fit([feature_matrix, adjacency_matrix], labels,
                        epochs=epochs, batch_size=batch_size,
                        validation_split=0.2, callbacks=[early_stopping, model_checkpoint])
    return history

if __name__ == "__main__":
    # Load your data
    edge_list_filename = 'sparsified_graph_edges_blackhole_0.1.csv'
    summary_data_filename = '1M1L3D_summary.csv'


    edges_list = pd.read_csv(edge_list_filename, header=None, names=['source', 'target', 'weight'], delimiter=' ')
    summary_data = pd.read_csv(summary_data_filename)

    node_labels_source = edges_list['source'].astype(str).unique()
    node_labels_target = edges_list['target'].astype(str).unique()
    node_labels = np.unique(np.concatenate((node_labels_source, node_labels_target)))

    summary_data_filtered = summary_data[summary_data['refcode'].isin(node_labels)]

    linker_smiles = summary_data_filtered['linker SMILES']
    linker_features = np.stack(linker_smiles.dropna().apply(generate_fingerprint).values)

    features = linker_features
    labels = summary_data_filtered['Pore Limiting Diameter'].values

    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    graph = nx.read_weighted_edgelist(edge_list_filename)
    adjacency_matrix, feature_matrix = preprocess_graph(graph, features)

    adj_train, adj_test, feat_train, feat_test = train_test_split(adjacency_matrix, feature_matrix, test_size=0.2, random_state=42)

    model = build_gcn_model(feat_train.shape[1], adj_train.shape[1])
    history = train_gcn_model(model, adj_train, feat_train, y_train, epochs=50, batch_size=32)

    test_loss, test_rmse = model.evaluate([feat_test, adj_test], y_test, verbose=0)
    print(f'Test RMSE: {test_rmse}')


Epoch 1/50
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 23.7612 - root_mean_squared_error: 4.2479
Epoch 1: val_loss improved from inf to 13.58128, saving model to best_gcn_model.keras
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 25ms/step - loss: 23.7418 - root_mean_squared_error: 4.2460 - val_loss: 13.5813 - val_root_mean_squared_error: 3.0445
Epoch 2/50
[1m176/177[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - loss: 13.3102 - root_mean_squared_error: 3.0263
Epoch 2: val_loss improved from 13.58128 to 10.56723, saving model to best_gcn_model.keras
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - loss: 13.3053 - root_mean_squared_error: 3.0258 - val_loss: 10.5672 - val_root_mean_squared_error: 2.6190
Epoch 3/50
[1m176/177[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - loss: 11.6582 - root_mean_squared_error: 2.8310
Epoch 3: val_loss improved from 10.56723 to 

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler
import warnings
from rdkit import RDLogger

# Suppress specific deprecation warnings and RDKit warnings globally
warnings.filterwarnings("ignore", category=DeprecationWarning)
RDLogger.DisableLog('rdApp.*')

def generate_fingerprint(smiles):
    """Generates a molecular fingerprint given a SMILES string."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros((1024,), dtype=float)
        return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024), dtype=float)
    except Exception as e:
        print(f"SMILES Parse Error: {e}")
        return np.zeros((1024,), dtype=float)

def load_and_preprocess_data(filename):
    """Load and preprocess data."""
    summary_data = pd.read_csv(filename)
    summary_data_filtered = summary_data.dropna(subset=['linker SMILES', 'Pore Limiting Diameter'])

    linker_smiles = summary_data_filtered['linker SMILES']
    features = np.stack(linker_smiles.apply(generate_fingerprint).values)
    labels = summary_data_filtered['Pore Limiting Diameter'].values

    return features, labels

def train_and_evaluate_model(features, labels, model):
    """Splits data, trains model, and evaluates it."""
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Feature scaling for KNN
    if isinstance(model, KNeighborsRegressor):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse

if __name__ == "__main__":
    features, labels = load_and_preprocess_data('filtered_summary_data.csv')

    # K-Nearest Neighbors Regressor
    knn_model = KNeighborsRegressor(n_neighbors=5)
    knn_rmse = train_and_evaluate_model(features, labels, knn_model)
    print(f'KNN RMSE: {knn_rmse}')

    # Gradient Boosting Trees Regressor
    gbtree_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    gbtree_rmse = train_and_evaluate_model(features, labels, gbtree_model)
    print(f'GBTree RMSE: {gbtree_rmse}')


KNN RMSE: 2.2503007682787586
GBTree RMSE: 2.1923753956354797


**Data Filtering Description**

Objective: Filter entries from the dataset 1M1L3D_summary.csv based on matching refcode values found in either the source or target columns of the edge list sparsified_graph_edges_blackhole_0.1.csv. The edge list file is formatted without headers and uses spaces as delimiters.



Read both datasets; the summary data with headers, and the edge list as a headerless space-delimited file.
Extract and combine unique refcode values from both source and target columns of the edge list.
Filter the summary data to retain only rows whose refcode is in the list of extracted unique codes.
Save the filtered data to a new CSV for further analysis.
Outcome: This process ensures the summary dataset only contains records relevant to the connections defined in the edge list, facilitating targeted data analysis.

In [12]:
import pandas as pd

# Load the data from the files
summary_data = pd.read_csv('1M1L3D_summary.csv')
edge_list = pd.read_csv('sparsified_graph_edges_blackhole_0.1.csv', delim_whitespace=True, header=None, names=['source', 'target', 'weight'])

# Find unique refcodes in both 'source' and 'target' columns of the edge list
unique_refcodes = pd.concat([edge_list['source'], edge_list['target']]).unique()

# Filter the summary data to keep only rows where 'refcode' is in the list of unique refcodes
filtered_summary_data = summary_data[summary_data['refcode'].isin(unique_refcodes)]

# Save the filtered data to a new CSV file
filtered_summary_data.to_csv('filtered_summary_data.csv', index=False)

