
# **Generate negative samples**
Generates negative (non-interacting) drug-drug pairs with proper exclusions.

## Parameters
df : pandas.DataFrame
    Input dataframe with columns:
*  'Drug1_ID'
*  'Drug2_ID'
*  'Label'  (1 = positive / interacting, 0 = negative / non-interacting)

excluded_pairs : set[tuple]
    Set of (Drug1_ID, Drug2_ID) pairs that were already used as negative samples
    in previous runs and must NOT be reused here.

all_positive_pairs : set[tuple]
    Set of ALL known positive (interacting) pairs (Drug1_ID, Drug2_ID)
    from the entire dataset. These are excluded to avoid generating
    false negatives.

num_negative_samples : int | None, optional
   *  Number of negative samples to generate:

* If an integer is provided, generates exactly that many pairs (if possible).
* If None, generates the same number of negatives as positive samples in `df`.

random_state : int | None, optional
   * Random seed for reproducibility.

# **Note: use a different seed for negative sampling than the one used to split the positive data set. For example, if the positive split used seed 32, use another value such as 45 or 35 for the negatives.**

## Returns

neg_df : pandas.DataFrame
    Dataframe containing only negative samples with:
        - 'Drug1_ID'
        - 'Drug2_ID'
        - 'Label' = 0

generated_pairs : set[tuple]
    Set of (Drug1_ID, Drug2_ID) pairs that were generated as negatives
    in this call, stored in both orientations, (a, b) and (b, a). Can be
    merged into `excluded_pairs` in subsequent runs.



In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.utils import shuffle
import random
import os
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only); the CSV paths used
# further down in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

def generate_negative_samples_fixed(df, excluded_pairs=None, all_positive_pairs=None,
                                   num_negative_samples=None, random_state=34):
    """Sample negative (non-interacting) drug pairs from the drugs present in ``df``.

    Candidate pairs are all unordered combinations of the drug IDs found in
    ``df['Drug1_ID']`` and ``df['Drug2_ID']``, minus every pair that appears
    (in either orientation) in ``all_positive_pairs`` or ``excluded_pairs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'Drug1_ID' and 'Drug2_ID'.
    excluded_pairs : collection of tuple, optional
        Pairs already used as negatives elsewhere; never sampled again.
    all_positive_pairs : collection of tuple, optional
        Known interacting pairs; never sampled as negatives.
    num_negative_samples : int or None, optional
        Number of pairs to draw. ``None`` means ``len(df)``. If fewer
        candidates exist, a warning is printed and all candidates are used.
    random_state : int or None, optional
        Seed applied to both ``random`` and ``numpy.random``.

    Returns
    -------
    negative_df : pandas.DataFrame
        Columns 'Drug1_ID', 'Drug2_ID' and 'Label' (always 0).
    negative_pairs_set : set[tuple]
        Every sampled pair in both orientations, ready to be merged into
        ``excluded_pairs`` for the next split.

    Raises
    ------
    ValueError
        If ``num_negative_samples`` is negative.
    """
    # The global RNGs are still seeded so code that relied on this side
    # effect keeps behaving the same way.
    np.random.seed(random_state)
    random.seed(random_state)

    if excluded_pairs is None:
        excluded_pairs = set()
    if all_positive_pairs is None:
        all_positive_pairs = set()

    # Sort the drug IDs: iterating a raw set of strings depends on the
    # per-process hash seed, which made the sample irreproducible across
    # runs even with a fixed random_state. key=str keeps mixed ID types
    # sortable.
    unique_drugs = sorted(
        set(df['Drug1_ID'].unique()) | set(df['Drug2_ID'].unique()),
        key=str,
    )

    # Stream the combinations instead of materialising the full list first;
    # both orientations are checked because interactions are symmetric.
    negative_candidates = [
        (drug1, drug2)
        for drug1, drug2 in combinations(unique_drugs, 2)
        if (drug1, drug2) not in all_positive_pairs
        and (drug2, drug1) not in all_positive_pairs
        and (drug1, drug2) not in excluded_pairs
        and (drug2, drug1) not in excluded_pairs
    ]

    if num_negative_samples is None:
        num_negative_samples = len(df)
    if num_negative_samples < 0:
        raise ValueError(
            f"num_negative_samples must be non-negative, got {num_negative_samples}"
        )

    # Fall back to every available candidate when the request cannot be met.
    if len(negative_candidates) < num_negative_samples:
        print(f"Warning: Only {len(negative_candidates)} negative candidates available, "
              f"but {num_negative_samples} requested.")
        num_negative_samples = len(negative_candidates)

    selected_negative_pairs = random.sample(negative_candidates, num_negative_samples)

    negative_df = pd.DataFrame(selected_negative_pairs,
                               columns=['Drug1_ID', 'Drug2_ID'])
    negative_df['Label'] = 0  # 0 indicates no interaction

    # Record both orientations so later membership tests are order-agnostic.
    negative_pairs_set = set()
    for drug1, drug2 in selected_negative_pairs:
        negative_pairs_set.add((drug1, drug2))
        negative_pairs_set.add((drug2, drug1))

    return negative_df, negative_pairs_set


In [3]:

def collect_all_positive_pairs(train_path, val_path, test_path):
    """Collect every drug pair listed in the train, validation and test CSVs.

    Each file is read with ``pandas.read_csv`` and must provide the columns
    'Drug1_ID' and 'Drug2_ID'.

    Parameters
    ----------
    train_path, val_path, test_path : str
        Paths of the three CSV files.

    Returns
    -------
    set[tuple]
        Every pair found in the files, stored in both orientations
        ``(a, b)`` and ``(b, a)`` because interactions are symmetric.
    """
    print(" Collecting all positive interactions from all splits...")

    all_positive_pairs = set()

    for path in (train_path, val_path, test_path):
        split_df = pd.read_csv(path)
        # Zip the two ID columns instead of using iterrows(): iterrows builds
        # a Series per row (slow) and upcasts integer IDs to float whenever
        # the frame also holds a float column.
        for drug1, drug2 in zip(split_df['Drug1_ID'], split_df['Drug2_ID']):
            all_positive_pairs.add((drug1, drug2))
            all_positive_pairs.add((drug2, drug1))

    print(f" Collected {len(all_positive_pairs) // 2} unique positive interactions")
    return all_positive_pairs


In [None]:

def process_sequential_negative_sampling_fixed(train_path, val_path, test_path,
                                             output_dir, random_state=35):
    """Generate and save negative samples for the train, val and test splits in turn.

    All positive pairs from the three files are collected first and excluded
    from every split's negatives. Splits are then processed in the order
    train, validation, test; negatives chosen for earlier splits are excluded
    from later ones. Results are written to ``train_negatives.csv``,
    ``val_negatives.csv`` and ``test_negatives.csv`` inside ``output_dir``.

    Parameters
    ----------
    train_path, val_path, test_path : str
        CSV files holding the positive pairs of each split.
    output_dir : str
        Directory for the generated CSVs; created if missing.
    random_state : int or None, optional
        Base seed. Split ``i`` (0 = train, 1 = val, 2 = test) is sampled with
        ``random_state + i``; ``None`` is forwarded unchanged (unseeded).

    Returns
    -------
    None
    """
    os.makedirs(output_dir, exist_ok=True)

    print("="*60)
    print("FIXED SEQUENTIAL NEGATIVE SAMPLING FOR DDI DATASET")
    print("="*60)

    # STEP 0: positives from the whole dataset, so that no split can receive
    # a "negative" that is a known interaction in another split.
    all_positive_pairs = collect_all_positive_pairs(train_path, val_path, test_path)

    # Negatives handed out so far (both orientations), across all splits.
    all_used_negative_pairs = set()

    splits = (
        ("TRAINING", train_path, 'train_negatives.csv'),
        ("VALIDATION", val_path, 'val_negatives.csv'),
        ("TEST", test_path, 'test_negatives.csv'),
    )
    negative_counts = []

    for offset, (title, split_path, output_name) in enumerate(splits):
        print(f"\n{offset + 1}. Processing {title} split...")
        split_df = pd.read_csv(split_path)
        print(f"   Original positive samples: {len(split_df)}")
        drug_count = len(set(split_df['Drug1_ID'].unique()) | set(split_df['Drug2_ID'].unique()))
        print(f"   Unique drugs: {drug_count}")

        # Each split gets its own seed, derived from the base seed.
        split_seed = None if random_state is None else random_state + offset

        negative_df, negative_pairs = generate_negative_samples_fixed(
            split_df,
            excluded_pairs=all_used_negative_pairs,
            all_positive_pairs=all_positive_pairs,
            random_state=split_seed
        )

        print(f"   Generated negative samples: {len(negative_df)}")

        # Record this split's negatives. The test split is included too, so
        # the "Total negative pairs used" figure below covers all three
        # splits (it previously omitted the test negatives).
        all_used_negative_pairs.update(negative_pairs)
        negative_counts.append(len(negative_df))

        output_path = os.path.join(output_dir, output_name)
        negative_df.to_csv(output_path, index=False)
        print(f"   Saved: {output_path}")

    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Training negative samples:   {negative_counts[0]}")
    print(f"Validation negative samples: {negative_counts[1]}")
    print(f"Test negative samples:       {negative_counts[2]}")
    print(f"\nTotal positive pairs excluded: {len(all_positive_pairs) // 2}")
    print(f"Total negative pairs used: {len(all_used_negative_pairs) // 2}")
    print("\nFixed sequential negative sampling completed successfully!")
    print(" NO positive-negative conflicts possible!")
    print(" NO negative-negative leakage between splits!")

# Example usage: generate negatives for the seed-32 partition stored on the
# mounted Google Drive.
if __name__ == "__main__":
    # Positive-pair CSVs of the three splits
    train_path = '/content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed32/train.csv'
    val_path = '/content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed32/val.csv'
    test_path = '/content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed32/test.csv'

    # Directory that receives train/val/test_negatives.csv
    output_dir = '/content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed32/NegativeDS'

    # Run fixed sequential negative sampling; the seed (45) is deliberately
    # different from the 32 that appears in the partition directory name.
    process_sequential_negative_sampling_fixed(
        train_path=train_path,
        val_path=val_path,
        test_path=test_path,
        output_dir=output_dir,
        random_state=45
    )

FIXED SEQUENTIAL NEGATIVE SAMPLING FOR DDI DATASET
 Collecting all positive interactions from all splits...
 Collected 191877 unique positive interactions

1. Processing TRAINING split...
   Original positive samples: 153501
   Unique drugs: 1709
   Generated negative samples: 153501
   Saved: /content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed32/NegativeDS/train_negatives.csv

2. Processing VALIDATION split...
   Original positive samples: 19187
   Unique drugs: 1534
   Generated negative samples: 19187
   Saved: /content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed32/NegativeDS/val_negatives.csv

3. Processing TEST split...
   Original positive samples: 19189
   Unique drugs: 1517
   Generated negative samples: 19189
   Saved: /content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed32/NegativeDS/test_negatives.csv

SUMMARY
Training negative samples:   153501
Validation negative samples: 19187
Test negative samples:       19189

Total positive pairs exclud

In [None]:
def check_data_leakage_comprehensive(seed=32, base_path=None):
    """Check the positive/negative split files of one partition for leakage.

    Reads ``train.csv``, ``val.csv`` and ``test.csv`` from ``base_path`` and
    ``train_negatives.csv``, ``val_negatives.csv`` and ``test_negatives.csv``
    from ``base_path/NegativeDS``. Every file must have the columns
    'Drug1_ID' and 'Drug2_ID'. Pairs are compared order-insensitively. A
    report is printed for five tests: positive overlap across splits,
    negative overlap across splits, positive/negative conflicts within a
    split, positive/negative conflicts across all splits, and a restatement
    of the negative-overlap result.

    Parameters
    ----------
    seed : int, optional
        Partition seed; used in the report header and, when ``base_path`` is
        not given, to build the default Google Drive directory.
    base_path : str or None, optional
        Directory holding the split files. ``None`` keeps the original
        default, ``/content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed{seed}``.

    Returns
    -------
    dict
        Boolean flags 'positive_leakage', 'negative_leakage',
        'internal_conflicts', 'global_conflicts', plus 'total_issues', the
        number of flags that are True.
    """

    def create_interaction_pairs(df):
        """Return the pairs of ``df`` in both orientations (interactions are symmetric)."""
        pairs = set()
        # Column zip instead of iterrows(): faster and no dtype upcasting.
        for drug1, drug2 in zip(df['Drug1_ID'], df['Drug2_ID']):
            pairs.add((drug1, drug2))
            pairs.add((drug2, drug1))
        return pairs

    def print_banner(title):
        """Print a section header framed by 60-character rules."""
        print("\n" + "="*60)
        print(title)
        print("="*60)

    def print_samples(label, pairs, limit=5):
        """Print up to ``limit`` example pairs when ``pairs`` is non-empty."""
        if pairs:
            sample_pairs = list(pairs)[:limit]
            print(f"   Sample {label}: {sample_pairs}")

    print("="*80)
    print(f"COMPREHENSIVE DATA LEAKAGE CHECK - SEED {seed}")
    print("="*80)

    if base_path is None:
        base_path = f'/content/drive/MyDrive/MLHygnn/DB/Partition-Dataset4/train-seed{seed}'

    # Define file paths
    train_pos_path = f'{base_path}/train.csv'
    val_pos_path = f'{base_path}/val.csv'
    test_pos_path = f'{base_path}/test.csv'

    train_neg_path = f'{base_path}/NegativeDS/train_negatives.csv'
    val_neg_path = f'{base_path}/NegativeDS/val_negatives.csv'
    test_neg_path = f'{base_path}/NegativeDS/test_negatives.csv'

    print(" Loading all files...")

    # Load positive files
    train_pos = pd.read_csv(train_pos_path)
    val_pos = pd.read_csv(val_pos_path)
    test_pos = pd.read_csv(test_pos_path)

    # Load negative files
    train_neg = pd.read_csv(train_neg_path)
    val_neg = pd.read_csv(val_neg_path)
    test_neg = pd.read_csv(test_neg_path)

    print(f" Loaded positive files: Train({len(train_pos)}), Val({len(val_pos)}), Test({len(test_pos)})")
    print(f" Loaded negative files: Train({len(train_neg)}), Val({len(val_neg)}), Test({len(test_neg)})")

    print("\n Creating interaction pair sets...")

    train_pos_pairs = create_interaction_pairs(train_pos)
    val_pos_pairs = create_interaction_pairs(val_pos)
    test_pos_pairs = create_interaction_pairs(test_pos)

    train_neg_pairs = create_interaction_pairs(train_neg)
    val_neg_pairs = create_interaction_pairs(val_neg)
    test_neg_pairs = create_interaction_pairs(test_neg)

    print(" Created pair sets (including both directions)")

    # TEST 1: Check for overlapping positive interactions across splits
    print_banner("TEST 1: POSITIVE INTERACTION OVERLAP ACROSS SPLITS")

    train_val_pos_overlap = train_pos_pairs & val_pos_pairs
    train_test_pos_overlap = train_pos_pairs & test_pos_pairs
    val_test_pos_overlap = val_pos_pairs & test_pos_pairs
    positive_leakage = bool(train_val_pos_overlap or train_test_pos_overlap or val_test_pos_overlap)

    # Sets hold both orientations, hence the division by two.
    print(f"Train ∩ Validation (Positive): {len(train_val_pos_overlap) // 2} interactions")
    print(f"Train ∩ Test (Positive): {len(train_test_pos_overlap) // 2} interactions")
    print(f"Validation ∩ Test (Positive): {len(val_test_pos_overlap) // 2} interactions")

    if positive_leakage:
        print(" POSITIVE LEAKAGE DETECTED!")
        print_samples("Train-Val overlaps", train_val_pos_overlap)
        print_samples("Train-Test overlaps", train_test_pos_overlap)
        print_samples("Val-Test overlaps", val_test_pos_overlap)
    else:
        print(" NO POSITIVE LEAKAGE - All positive interactions are unique across splits")

    # TEST 2: Check for overlapping negative interactions across splits
    print_banner("TEST 2: NEGATIVE INTERACTION OVERLAP ACROSS SPLITS")

    train_val_neg_overlap = train_neg_pairs & val_neg_pairs
    train_test_neg_overlap = train_neg_pairs & test_neg_pairs
    val_test_neg_overlap = val_neg_pairs & test_neg_pairs
    negative_leakage = bool(train_val_neg_overlap or train_test_neg_overlap or val_test_neg_overlap)

    print(f"Train ∩ Validation (Negative): {len(train_val_neg_overlap) // 2} interactions")
    print(f"Train ∩ Test (Negative): {len(train_test_neg_overlap) // 2} interactions")
    print(f"Validation ∩ Test (Negative): {len(val_test_neg_overlap) // 2} interactions")

    if negative_leakage:
        print(" NEGATIVE LEAKAGE DETECTED!")
        print_samples("Train-Val overlaps", train_val_neg_overlap)
        print_samples("Train-Test overlaps", train_test_neg_overlap)
        print_samples("Val-Test overlaps", val_test_neg_overlap)
    else:
        print(" NO NEGATIVE LEAKAGE - All negative interactions are unique across splits")

    # TEST 3: Check for positive-negative conflicts within each split
    print_banner("TEST 3: POSITIVE vs NEGATIVE CONFLICTS WITHIN SPLITS")

    train_pos_neg_conflict = train_pos_pairs & train_neg_pairs
    val_pos_neg_conflict = val_pos_pairs & val_neg_pairs
    test_pos_neg_conflict = test_pos_pairs & test_neg_pairs
    internal_conflicts = bool(train_pos_neg_conflict or val_pos_neg_conflict or test_pos_neg_conflict)

    print(f"Training: Positive ∩ Negative = {len(train_pos_neg_conflict) // 2} conflicts")
    print(f"Validation: Positive ∩ Negative = {len(val_pos_neg_conflict) // 2} conflicts")
    print(f"Test: Positive ∩ Negative = {len(test_pos_neg_conflict) // 2} conflicts")

    if internal_conflicts:
        print(" POSITIVE-NEGATIVE CONFLICTS DETECTED!")
        print_samples("Training conflicts", train_pos_neg_conflict)
        print_samples("Validation conflicts", val_pos_neg_conflict)
        print_samples("Test conflicts", test_pos_neg_conflict)
    else:
        print(" NO CONFLICTS - No interaction is both positive and negative within any split")

    # TEST 4: Check for positive-negative conflicts across all splits
    print_banner("TEST 4: POSITIVE vs NEGATIVE CONFLICTS ACROSS ALL SPLITS")

    all_pos_pairs = train_pos_pairs | val_pos_pairs | test_pos_pairs
    all_neg_pairs = train_neg_pairs | val_neg_pairs | test_neg_pairs

    global_pos_neg_conflict = all_pos_pairs & all_neg_pairs
    global_conflicts = bool(global_pos_neg_conflict)

    print(f"Total unique positive interactions: {len(all_pos_pairs) // 2}")
    print(f"Total unique negative interactions: {len(all_neg_pairs) // 2}")
    print(f"Global Positive ∩ Negative conflicts: {len(global_pos_neg_conflict) // 2}")

    if global_conflicts:
        print(" GLOBAL CONFLICTS DETECTED!")
        print_samples("global conflicts", global_pos_neg_conflict, limit=10)
    else:
        print(" NO GLOBAL CONFLICTS - No interaction is both positive and negative anywhere")

    # TEST 5: Verification of sequential negative sampling
    print_banner("TEST 5: SEQUENTIAL NEGATIVE SAMPLING VERIFICATION")

    print("Expected behavior:")
    print("- Training negatives should not appear in validation or test negatives")
    print("- Training + validation negatives should not appear in test negatives")
    print("- All negative pairs should be mutually exclusive")

    # Redundant with Test 2, but gives a clear verification message.
    total_negative_overlaps = (len(train_val_neg_overlap) + len(train_test_neg_overlap) +
                               len(val_test_neg_overlap)) // 2

    # Decide on the same flag as Test 2 so the two tests can never disagree
    # (the halved count alone could round an odd-sized overlap down to 0).
    if not negative_leakage:
        print(" SEQUENTIAL SAMPLING VERIFIED - Perfect exclusion maintained")
    else:
        print(f" SEQUENTIAL SAMPLING FAILED - {total_negative_overlaps} overlapping negative pairs")

    # FINAL SUMMARY
    print_banner("FINAL LEAKAGE CHECK SUMMARY")

    # (flag, message when the problem exists, message when it does not)
    summary_rows = (
        (positive_leakage,
         " Positive interaction leakage across splits",
         " No positive interaction leakage"),
        (negative_leakage,
         " Negative interaction leakage across splits",
         " No negative interaction leakage"),
        (internal_conflicts,
         " Positive-negative conflicts within splits",
         " No positive-negative conflicts within splits"),
        (global_conflicts,
         " Global positive-negative conflicts",
         "NO global positive-negative conflicts"),
    )

    total_issues = 0
    for flagged, problem_message, clean_message in summary_rows:
        if flagged:
            print(problem_message)
            total_issues += 1
        else:
            print(clean_message)

    print("\n FINAL VERDICT:")
    if total_issues == 0:
        print(" PERFECT - No data leakage detected anywhere!")
        print("Dataset is clean and ready for training.")
    else:
        print(f" ISSUES FOUND - {total_issues} types of data leakage detected")
        print("Need investigate and fix the issues before training.")

    return {
        'positive_leakage': positive_leakage,
        'negative_leakage': negative_leakage,
        'internal_conflicts': internal_conflicts,
        'global_conflicts': global_conflicts,
        'total_issues': total_issues
    }


# Run the leakage check on the seed-32 partition; the function prints its
# own detailed report and returns a dict of flags.
print(" Starting comprehensive data leakage verification...")
results = check_data_leakage_comprehensive(seed=32)

# Echo every entry of the returned dict, one per line.
print(f"\n RESULTS SUMMARY:")
for key, value in results.items():
    print(f"   {key}: {value}")

print(f"\n Verification complete!")

 Starting comprehensive data leakage verification...
COMPREHENSIVE DATA LEAKAGE CHECK - SEED 32
 Loading all files...
 Loaded positive files: Train(153501), Val(19187), Test(19189)
 Loaded negative files: Train(153501), Val(19187), Test(19189)

 Creating interaction pair sets...
 Created pair sets (including both directions)

TEST 1: POSITIVE INTERACTION OVERLAP ACROSS SPLITS
Train ∩ Validation (Positive): 0 interactions
Train ∩ Test (Positive): 0 interactions
Validation ∩ Test (Positive): 0 interactions
 NO POSITIVE LEAKAGE - All positive interactions are unique across splits

TEST 2: NEGATIVE INTERACTION OVERLAP ACROSS SPLITS
Train ∩ Validation (Negative): 0 interactions
Train ∩ Test (Negative): 0 interactions
Validation ∩ Test (Negative): 0 interactions
 NO NEGATIVE LEAKAGE - All negative interactions are unique across splits

TEST 3: POSITIVE vs NEGATIVE CONFLICTS WITHIN SPLITS
Training: Positive ∩ Negative = 0 conflicts
Validation: Positive ∩ Negative = 0 conflicts
Test: Positive 