This code groups each drug with all the interaction types it appears in.
For every row in the dataset, the interaction label is added to both drugs in the pair.
Then the interaction types for each drug are combined, duplicates are removed, and the final list is sorted and saved.
The result is one row per drug showing all interaction types linked to that drug.

In [None]:
import pandas as pd
from collections import defaultdict

# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Build, for every drug, the set of interaction labels it takes part in.
# A label counts for a drug whether the drug is listed in the Drug1_ID or
# in the Drug2_ID column of a row.
file_path = '/content/drive/MyDrive/MLHygnn/DB/DDI_unique_interactionsAnalysis.csv'
df = pd.read_csv(file_path)

drug1_interactions = defaultdict(set)
drug2_interactions = defaultdict(set)

# Walk the three columns in lockstep instead of using df.iterrows():
# iterrows() builds a Series object for every row, which is slow on a large
# table and may change the dtype of the values it hands back.
for drug1, drug2, label in zip(df['Drug1_ID'], df['Drug2_ID'], df['Label']):
    drug1_interactions[drug1].add(label)
    drug2_interactions[drug2].add(label)

# Merge both per-column views into one mapping: drug -> all of its labels.
# Drug1_ID entries are merged first so the key order matches a first-column,
# then second-column traversal.
all_drug_interactions = defaultdict(set)
for per_column in (drug1_interactions, drug2_interactions):
    for drug, interactions in per_column.items():
        all_drug_interactions[drug].update(interactions)

 # Create the final DataFrame with proper format
# One [drug, "label;label;..."] record per drug. Labels are turned into text
# before sorting, so the order inside each string is lexicographic
# (for example "12" comes before "4"), not numeric.
result_data = [
    [drug_id, ';'.join(sorted(str(label) for label in interaction_types))]
    for drug_id, interaction_types in all_drug_interactions.items()
]

# Tabulate the records and order the rows by drug identifier.
result_df = (
    pd.DataFrame(result_data, columns=['Drug_ID', 'Interaction_Types'])
    .sort_values('Drug_ID')
    .reset_index(drop=True)
)



# Save to CSV
# csv.QUOTE_ALL (numeric value 1) makes the writer quote every field; the
# named constant is used so the intent is readable at the call site.
import csv

output_path = '/content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing/drug_interaction_types.csv'
result_df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)

# Report what was written and preview the first rows of the table.
print("Processing complete!")
print(f"Total unique drugs: {len(result_df)}")
print(f"Output saved to: {output_path}")
print("\nFirst 10 rows:")
print(result_df.head(10))

# Number of ';'-separated labels stored for each drug.
type_lists = result_df['Interaction_Types'].str.split(';')
interaction_counts = type_lists.str.len()
print("\nInteraction type statistics:")
print(f"Min interaction types per drug: {interaction_counts.min()}")
print(f"Max interaction types per drug: {interaction_counts.max()}")
print(f"Average interaction types per drug: {interaction_counts.mean():.2f}")

# Pick the first row (in table order) that carries more than one label;
# None when no such row exists.
has_several = interaction_counts > 1
multi_interaction_drug = result_df[has_several].iloc[0] if has_several.any() else None
if multi_interaction_drug is not None:
    print("\nExample drug with multiple interaction types:")
    print(f"Drug: {multi_interaction_drug['Drug_ID']}")
    print(f"Interaction types: {multi_interaction_drug['Interaction_Types']}")

Processing complete!
Total unique drugs: 1709
Output saved to: /content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing/drug_interaction_types.csv

First 10 rows:
   Drug_ID                                  Interaction_Types
0  DB00006                           12;4;47;49;6;66;70;73;75
1  DB00014                                     15;20;49;58;70
2  DB00027                                           70;73;77
3  DB00035                                              49;70
4  DB00080                                              49;70
5  DB00091  11;13;18;21;34;35;4;42;47;49;57;67;68;70;72;73...
6  DB00104                    15;20;25;47;49;54;58;70;73;75;9
7  DB00115                                              70;75
8  DB00120                                              34;49
9  DB00122                                              70;73

Interaction type statistics:
Min interaction types per drug: 1
Max interaction types per drug: 25
Average interaction types per drug: 7.89

Example drug with 

# DATA VALIDATION

In [None]:
import pandas as pd
from collections import defaultdict, Counter

# Read the raw pair list and the per-drug table that was derived from it.
original_file = '/content/drive/MyDrive/MLHygnn/DB/DDI_unique_interactionsAnalysis.csv'
processed_file ='/content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing/drug_interaction_types.csv'
original_df = pd.read_csv(original_file)
processed_df = pd.read_csv(processed_file)

print("=== DATA VALIDATION REPORT ===\n")

# 1. Row counts and distinct identifiers per column of the original data.
first_column_ids = original_df['Drug1_ID']
second_column_ids = original_df['Drug2_ID']
print("1. BASIC STATISTICS:")
print(f"Original file rows: {len(original_df)}")
print(f"Original unique Drug1_IDs: {first_column_ids.nunique()}")
print(f"Original unique Drug2_IDs: {second_column_ids.nunique()}")

# A drug counts as present if it occurs in either column.
original_drugs = set(first_column_ids) | set(second_column_ids)
print(f"Total unique drugs in original: {len(original_drugs)}")

# Same figures for the processed table, which has a single Drug_ID column.
print(f"Processed file rows: {len(processed_df)}")
processed_drugs = set(processed_df['Drug_ID'])
print(f"Total unique drugs in processed: {len(processed_drugs)}")

# 2. Drug coverage: set differences in both directions give the drugs that
# were lost and the drugs that appear only in the processed file.
missing_drugs = original_drugs.difference(processed_drugs)
extra_drugs = processed_drugs.difference(original_drugs)

print("\n2. DRUG COVERAGE:")
if not missing_drugs:
    print("All drugs from original data are present in processed file")
else:
    print(f"Missing drugs in processed file: {len(missing_drugs)}")
    print(f"First 5 missing: {list(missing_drugs)[:5]}")

if not extra_drugs:
    print("No unexpected drugs in processed file")
else:
    print(f" Extra drugs in processed file: {len(extra_drugs)}")
    print(f"First 5 extra: {list(extra_drugs)[:5]}")

# 3. Rebuild the drug-interaction mapping from original data
# The columns are iterated in lockstep rather than with iterrows(), which
# creates a Series per row and is slow on a large frame.
original_drug_interactions = defaultdict(set)
for drug1, drug2, label in zip(
    original_df['Drug1_ID'], original_df['Drug2_ID'], original_df['Label']
):
    # A label belongs to both members of the pair.
    original_drug_interactions[drug1].add(label)
    original_drug_interactions[drug2].add(label)

# 4. Parse processed data back to sets for comparison
processed_drug_interactions = {}
for drug_id, raw_types in zip(processed_df['Drug_ID'], processed_df['Interaction_Types']):
    interactions_str = str(raw_types)

    if pd.isna(raw_types) or interactions_str == 'nan':
        # A missing cell means the drug has no recorded interaction types.
        interactions_set = set()
    else:
        # Convert to integers for proper comparison
        interactions_set = set(map(int, interactions_str.split(';')))

    processed_drug_interactions[drug_id] = interactions_set

# 5. Compare interaction assignments for each drug
# For every drug of the original data, the label set rebuilt from the
# original rows must equal the set parsed from the processed file.
print("\n3. INTERACTION ASSIGNMENT VALIDATION:")
mismatches = []
total_interactions_original = 0
total_interactions_processed = 0

for drug in original_drugs:
    original_set = original_drug_interactions[drug]
    # A drug absent from the processed file is compared as an empty set.
    processed_set = processed_drug_interactions.get(drug, set())

    total_interactions_original += len(original_set)
    total_interactions_processed += len(processed_set)

    if original_set != processed_set:
        mismatches.append({
            'drug': drug,
            'missing': original_set - processed_set,
            'extra': processed_set - original_set,
            'original_count': len(original_set),
            'processed_count': len(processed_set)
        })

if mismatches:
    print(f"  Found {len(mismatches)} drugs with interaction mismatches")
    print("First 5 mismatches:")
    for mismatch in mismatches[:5]:
        print(f"  Drug {mismatch['drug']}:")
        # 'original_count' / 'processed_count' already hold integers, so they
        # are printed directly; calling len() on them raises TypeError.
        print(f"    Original: {mismatch['original_count']} interactions")
        print(f"    Processed: {mismatch['processed_count']} interactions")
        if mismatch['missing']:
            print(f"    Missing: {mismatch['missing']}")
        if mismatch['extra']:
            print(f"    Extra: {mismatch['extra']}")
else:
    print(" All drug-interaction assignments match perfectly")

# 6. Interaction type coverage
# .tolist() turns the NumPy scalars returned by unique() into native Python
# values, so the report prints plain numbers instead of np.int64(...) reprs.
original_labels = set(original_df['Label'].unique().tolist())

# Every label that occurs for at least one drug in the processed file.
all_processed_labels = set()
for interactions in processed_drug_interactions.values():
    all_processed_labels.update(interactions)

print("\n4. INTERACTION TYPES COVERAGE:")
print(f"Original interaction types: {len(original_labels)} ({sorted(original_labels)})")
print(f"Processed interaction types: {len(all_processed_labels)} ({sorted(all_processed_labels)})")

missing_labels = original_labels - all_processed_labels
extra_labels = all_processed_labels - original_labels

if missing_labels:
    print(f"  Missing interaction types: {missing_labels}")
else:
    print(" All interaction types preserved")

if extra_labels:
    print(f"  Extra interaction types: {extra_labels}")
else:
    print(" No unexpected interaction types")

# 7. Sample verification
# Spot-check up to three drugs (taken in the set's iteration order) by
# printing both sorted label lists next to each other.
print("\n5. SAMPLE VERIFICATION:")
sample_drugs = list(original_drugs)[:3]
for drug in sample_drugs:
    orig_interactions = sorted(original_drug_interactions[drug])
    proc_interactions = sorted(processed_drug_interactions.get(drug, set()))
    verdict = 'Yes' if orig_interactions == proc_interactions else 'No'
    print(f"Drug {drug}:")
    print(f"  Original: {orig_interactions}")
    print(f"  Processed: {proc_interactions}")
    print(f"  Match: {verdict}")

# 8. Summary
# The run passes only when none of the earlier checks recorded a problem.
print("\n=== SUMMARY ===")
label_problems = bool(missing_labels or extra_labels)
if missing_drugs or extra_drugs or mismatches or label_problems:
    print(" VALIDATION ISSUES DETECTED:")
    if missing_drugs:
        print(f"   - {len(missing_drugs)} drugs missing")
    if extra_drugs:
        print(f"   - {len(extra_drugs)} unexpected drugs")
    if mismatches:
        print(f"   - {len(mismatches)} drugs with incorrect interactions")
    if label_problems:
        print("   - Interaction type inconsistencies")
else:
    print("VALIDATION PASSED: Data transformation is perfect!")
    print("   - All drugs preserved")
    print("   - All interactions correctly assigned")
    print("   - All interaction types maintained")

=== DATA VALIDATION REPORT ===

1. BASIC STATISTICS:
Original file rows: 191877
Original unique Drug1_IDs: 1599
Original unique Drug2_IDs: 1639
Total unique drugs in original: 1709
Processed file rows: 1709
Total unique drugs in processed: 1709

2. DRUG COVERAGE:
✅ All drugs from original data are present in processed file
✅ No unexpected drugs in processed file

3. INTERACTION ASSIGNMENT VALIDATION:
✅ All drug-interaction assignments match perfectly

4. INTERACTION TYPES COVERAGE:
Original interaction types: 86 ([np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int6

# Summary per interaction type

In [None]:

# Read the raw pair list and the per-drug table.
original_file = '/content/drive/MyDrive/MLHygnn/DB/DDI_unique_interactionsAnalysis.csv'
processed_file = '/content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing/drug_interaction_types.csv'
original_df = pd.read_csv(original_file)
processed_df = pd.read_csv(processed_file)

# Original data: number of rows per interaction label, ordered by label.
label_column = original_df['Label']
interaction_counts = label_column.value_counts().sort_index()
total_interactions = len(original_df)

# Processed data: for each interaction label, the set of drugs that list it.
interaction_drug_count = defaultdict(set)
for _, row in processed_df.iterrows():
    raw_types = row['Interaction_Types']
    interactions_str = str(raw_types)
    # Rows without any stored label contribute nothing.
    if pd.isna(raw_types) or interactions_str == 'nan':
        continue
    for interaction_type in [int(token) for token in interactions_str.split(';')]:
        interaction_drug_count[interaction_type].add(row['Drug_ID'])

# Collapse each drug set to its size.
drug_counts_per_type = {
    label: len(drugs) for label, drugs in interaction_drug_count.items()
}

# Main analysis CSV
def _summarize_type(interaction_type):
    """Return the summary record for one interaction label.

    The record holds the label's row count in the original data, the number
    of drugs listing it in the processed data, their ratio (0 when no drug
    lists the label) and the label's share of all rows in percent; the last
    two are rounded to two decimals.
    """
    interactions = interaction_counts[interaction_type]
    unique_drugs = drug_counts_per_type.get(interaction_type, 0)
    if unique_drugs > 0:
        avg_per_drug = interactions / unique_drugs
    else:
        avg_per_drug = 0
    percentage = (interactions / total_interactions) * 100
    return {
        'Interaction_Type_ID': interaction_type,
        'Total_Interactions': interactions,
        'Unique_Drugs': unique_drugs,
        'Avg_Interactions_Per_Drug': round(avg_per_drug, 2),
        'Percentage_of_Total': round(percentage, 2)
    }


# One record per label, in ascending label order.
analysis_data = [_summarize_type(label) for label in sorted(interaction_counts.index)]

analysis_df = pd.DataFrame(analysis_data)
analysis_output = '/content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing/summarytyperinter.csv'
analysis_df.to_csv(analysis_output, index=False)

print(f"Analysis saved to: {analysis_output}")
print(f"Rows saved: {len(analysis_df)}")

Analysis saved to: /content/drive/MyDrive/MLHygnn/DB/OutPutPreprosseing/summarytyperinter.csv
Rows saved: 86
