# Preprocessing: remove redundant interactions (self-loops and reversed duplicate pairs)

In [None]:
# Make Google Drive visible inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import os
import pandas as pd

# File paths
INPUT_CSV = '/content/drive/MyDrive/MLHygnn/DB/DeepDDI-DrunkBunk.csv'
OUT_DIR = '/content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing'


def clean_interactions(raw):
    """Validate an interaction table and normalise its text columns.

    Args:
        raw: DataFrame that must contain 'Drug1' and 'Drug2' columns.
            A 'Label' column is optional and is added (all missing) if absent.

    Returns:
        A copy of ``raw`` whose 'Drug1', 'Drug2' and 'Label' values are
        whitespace-stripped strings. Missing or blank values stay missing
        (NaN) instead of becoming the literal text 'nan' / '<NA>'.

    Raises:
        ValueError: if 'Drug1' or 'Drug2' is not a column of ``raw``.
    """
    if not all(col in raw.columns for col in ['Drug1', 'Drug2']):
        raise ValueError("Missing required columns: Drug1, Drug2")

    cleaned = raw.copy()
    if 'Label' not in cleaned.columns:
        cleaned['Label'] = pd.NA

    for col in ['Drug1', 'Drug2', 'Label']:
        values = cleaned[col]
        stripped = values.astype(str).str.strip()
        # astype(str) renders missing values as text; mask them back to NaN so
        # a missing drug is never treated as a drug called 'nan'.
        cleaned[col] = stripped.where(values.notna() & (stripped != ''))
    return cleaned


def unique_undirected_pairs(cleaned):
    """Reduce cleaned interactions to one row per unordered drug pair.

    Args:
        cleaned: DataFrame as returned by ``clean_interactions``.

    Returns:
        A tuple ``(unique_pairs, self_loops, incomplete)``:
        ``unique_pairs`` has columns 'DrugA', 'DrugB', 'Label' with
        DrugA < DrugB and a fresh 0..n-1 index; ``self_loops`` holds the rows
        where Drug1 == Drug2; ``incomplete`` holds the rows where Drug1 or
        Drug2 is missing.
    """
    has_both = cleaned['Drug1'].notna() & cleaned['Drug2'].notna()
    incomplete = cleaned[~has_both].copy()
    complete = cleaned[has_both]

    is_self_loop = complete['Drug1'] == complete['Drug2']
    self_loops = complete[is_self_loop].copy()
    pairs = complete[~is_self_loop].copy()

    # Canonical orientation: the smaller string (plain, case-sensitive string
    # comparison) goes to DrugA, so (X, Y) and (Y, X) share the same key.
    first_is_smaller = pairs['Drug1'] <= pairs['Drug2']
    pairs['DrugA'] = pairs['Drug1'].where(first_is_smaller, pairs['Drug2'])
    pairs['DrugB'] = pairs['Drug2'].where(first_is_smaller, pairs['Drug1'])

    # NOTE(review): keep='first' silently keeps the first Label when the same
    # pair occurs with different labels; confirm that is acceptable.
    unique_pairs = pairs.drop_duplicates(subset=['DrugA', 'DrugB'], keep='first')
    unique_pairs = unique_pairs[['DrugA', 'DrugB', 'Label']].reset_index(drop=True)
    return unique_pairs, self_loops, incomplete


# Notebook cells and scripts both run with __name__ == "__main__", so the
# pipeline still executes there; the guard only keeps an import side-effect free.
if __name__ == "__main__":
    os.makedirs(OUT_DIR, exist_ok=True)

    # Load and validate data
    df = clean_interactions(pd.read_csv(INPUT_CSV, dtype=str))

    # Count original stats (missing drug names are not counted as drugs)
    original_count = len(df)
    unique_drugs = set(df['Drug1'].dropna()) | set(df['Drug2'].dropna())
    num_unique_drugs = len(unique_drugs)

    print(f"Original dataset: {original_count} interactions")
    print(f"Number of unique drugs: {num_unique_drugs}")

    # Remove incomplete rows, self-loops and reversed/repeated pairs
    unique_df, self_loops, incomplete_rows = unique_undirected_pairs(df)

    # Calculate removal stats
    num_self_loops = len(self_loops)
    num_incomplete = len(incomplete_rows)
    num_removed_duplicates = (
        original_count - num_incomplete - num_self_loops - len(unique_df)
    )
    total_removed = original_count - len(unique_df)
    num_final_drugs = pd.concat([unique_df['DrugA'], unique_df['DrugB']]).nunique()

    # Save results
    unique_out = os.path.join(OUT_DIR, 'unique_interactions2.csv')
    unique_df.to_csv(unique_out, index=False)

    # Print summary statistics
    print("\n--- SUMMARY ---")
    print(f"Original interactions: {original_count}")
    print(f"Rows with missing drug removed: {num_incomplete}")
    print(f"Self-loops removed: {num_self_loops}")
    print(f"Duplicate pairs removed: {num_removed_duplicates}")
    print(f"Total removed: {total_removed}")
    print(f"Final unique pairs: {len(unique_df)}")
    print(f"Unique drugs involved: {num_final_drugs}")
    print(f"\nSaved to: {unique_out}")

    # Optional: Save removed interactions for audit
    if num_self_loops > 0 or num_incomplete > 0:
        # assign() builds new frames, so no column is written onto a slice.
        removed = pd.concat([
            self_loops.assign(reason='self_loop'),
            incomplete_rows.assign(reason='missing_drug'),
        ])
        removed_out = os.path.join(OUT_DIR, 'removed_interactions.csv')
        removed[['Drug1', 'Drug2', 'Label', 'reason']].to_csv(removed_out, index=False)
        print(f"Removed interactions saved to: {removed_out}")





Original dataset: 192283 interactions
Number of unique drugs: 1709

--- SUMMARY ---
Original interactions: 192283
Self-loops removed: 0
Duplicate pairs removed: 406
Total removed: 406
Final unique pairs: 191877
Unique drugs involved: 1709

Saved to: /content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing/unique_interactions2.csv
