<a href="https://colab.research.google.com/github/Joy-Dorcas/Drug-Drug-Interaction-Prediction/blob/main/Drug_interaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install PyTDC



In [4]:
from google.colab import files
import pandas as pd
import json

# Upload files
uploaded = files.upload()  # Select all four files at once

# Load CSV files
comprehensive_ddi_dataset = pd.read_csv("comprehensive_ddi_dataset.csv")
drug_smiles_mapping = pd.read_csv("drug_smiles_mapping.csv")
drugbank_id_to_name = pd.read_csv("drugbank_id_to_name.csv")

# Load JSON file
with open("twosides_side_effect_labels.json", "r") as f:
    twosides_side_effect_labels = json.load(f)

# Optional: convert JSON to DataFrame if structured as a table
twosides_df = pd.json_normalize(twosides_side_effect_labels)

# Check loaded data
print(comprehensive_ddi_dataset.head())
print(drug_smiles_mapping.head())
print(drugbank_id_to_name.head())
print(twosides_df.head())


       Drug1_ID                                              Drug1  \
0  CID000002173  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...   
1  CID000002173  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...   
2  CID000002173  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...   
3  CID000002173  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...   
4  CID000002173  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...   

       Drug2_ID                                         Drug2    Y  \
0  CID000003345  CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3  767   
1  CID000003345  CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3   25   
2  CID000003345  CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3   85   
3  CID000003345  CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3  735   
4  CID000003345  CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3  959   

   Drug1_Name  Drug2_Name  
0         NaN         NaN  
1         NaN         NaN  
2         NaN         NaN  
3         NaN         NaN  
4         NaN     

In [5]:
comprehensive_ddi_dataset = comprehensive_ddi_dataset.merge(
    drugbank_id_to_name.rename(columns={'drugbank_id':'Drug1_ID','name':'Drug1_Name'}),
    on='Drug1_ID', how='left'
)
comprehensive_ddi_dataset = comprehensive_ddi_dataset.merge(
    drugbank_id_to_name.rename(columns={'drugbank_id':'Drug2_ID','name':'Drug2_Name'}),
    on='Drug2_ID', how='left'
)


In [6]:
side_effect_mapping = {col: twosides_df[col][0] for col in twosides_df.columns}
comprehensive_ddi_dataset['SideEffect'] = comprehensive_ddi_dataset['Y'].map(side_effect_mapping)


In [7]:
print(comprehensive_ddi_dataset.columns)


Index(['Drug1_ID', 'Drug1', 'Drug2_ID', 'Drug2', 'Y', 'Drug1_Name_x',
       'Drug2_Name_x', 'Drug1_Name_y', 'Drug2_Name_y', 'SideEffect'],
      dtype='object')


In [8]:
# Keep the correct drug name columns and drop duplicates
comprehensive_ddi_dataset.rename(
    columns={'Drug1_Name_y':'Drug1_Name', 'Drug2_Name_y':'Drug2_Name'}, inplace=True
)

# Optionally drop the extra columns
comprehensive_ddi_dataset.drop(columns=['Drug1_Name_x','Drug2_Name_x'], inplace=True)

# Check that it worked
print(comprehensive_ddi_dataset[['Drug1_ID', 'Drug1_Name', 'Drug2_ID', 'Drug2_Name', 'Y', 'SideEffect']].head(10))

# Check for missing values
print("Missing Drug1 names:", comprehensive_ddi_dataset['Drug1_Name'].isna().sum())
print("Missing Drug2 names:", comprehensive_ddi_dataset['Drug2_Name'].isna().sum())
print("Missing side effect mappings:", comprehensive_ddi_dataset['SideEffect'].isna().sum())


       Drug1_ID Drug1_Name      Drug2_ID Drug2_Name     Y SideEffect
0  CID000002173        NaN  CID000003345        NaN   767        NaN
1  CID000002173        NaN  CID000003345        NaN    25        NaN
2  CID000002173        NaN  CID000003345        NaN    85        NaN
3  CID000002173        NaN  CID000003345        NaN   735        NaN
4  CID000002173        NaN  CID000003345        NaN   959        NaN
5  CID000002173        NaN  CID000003345        NaN   255        NaN
6  CID000002173        NaN  CID000003345        NaN   740        NaN
7  CID000002173        NaN  CID000003345        NaN   815        NaN
8  CID000002173        NaN  CID000003345        NaN    56        NaN
9  CID000002173        NaN  CID000003345        NaN  1149        NaN
Missing Drug1 names: 3254609
Missing Drug2 names: 3254609
Missing side effect mappings: 3254609


In [9]:
print(f"✓ DDI Dataset: {len(comprehensive_ddi_dataset):,} records")
print(f"✓ SMILES Mapping: {len(drug_smiles_mapping):,} drugs")

print("\nSample of drug_smiles_mapping:")
print(drug_smiles_mapping.head())

✓ DDI Dataset: 3,254,609 records
✓ SMILES Mapping: 645 drugs

Sample of drug_smiles_mapping:
    drugbank_id                                             smiles
0  CID000002173  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
1  CID000005206                           C(OC(C(F)(F)F)C(F)(F)F)F
2  CID000003929      CC(=O)NCC1CN(C(=O)O1)C2=CC(=C(C=C2)N3CCOCC3)F
3  CID000001302                CC(C1=CC2=C(C=C1)C=C(C=C2)OC)C(=O)O
4  CID000005267  CC(=O)SC1CC2=CC(=O)CCC2(C3C1C4CCC5(C4(CC3)C)CC...


In [10]:
# STEP 2: Create CID to SMILES Dictionary
print("\n[STEP 2] Creating CID to SMILES mapping...")

# Create dictionary mapping CID to SMILES
cid_to_smiles = dict(zip(
    drug_smiles_mapping['drugbank_id'],  # Actually CIDs like CID000002173
    drug_smiles_mapping['smiles']
))

print(f"✓ Created mapping for {len(cid_to_smiles):,} CID-SMILES pairs")

# Show a few examples
print("\nSample CID → SMILES mappings:")
for i, (cid, smiles) in enumerate(list(cid_to_smiles.items())[:3]):
    print(f"{i+1}. {cid}")
    print(f"   SMILES: {smiles[:60]}...")
    print()

# Verify the CIDs in your DDI dataset exist in this mapping
sample_drug1_ids = comprehensive_ddi_dataset['Drug1_ID'].head(5).tolist()
sample_drug2_ids = comprehensive_ddi_dataset['Drug2_ID'].head(5).tolist()

print("Checking if sample CIDs from DDI dataset exist in SMILES mapping:")
print(f"\nSample Drug1 IDs: {sample_drug1_ids}")
print(f"Sample Drug2 IDs: {sample_drug2_ids}")

matches_drug1 = sum(1 for cid in sample_drug1_ids if cid in cid_to_smiles)
matches_drug2 = sum(1 for cid in sample_drug2_ids if cid in cid_to_smiles)

print(f"\n✓ Drug1 matches: {matches_drug1}/5")
print(f"✓ Drug2 matches: {matches_drug2}/5")

# Check overall coverage
all_cids_in_ddi = set(comprehensive_ddi_dataset['Drug1_ID'].unique()) | set(comprehensive_ddi_dataset['Drug2_ID'].unique())
cids_with_smiles = set(cid_to_smiles.keys())
coverage = len(all_cids_in_ddi & cids_with_smiles) / len(all_cids_in_ddi) * 100


print(f"  - Unique CIDs in DDI dataset: {len(all_cids_in_ddi):,}")
print(f"  - CIDs with SMILES available: {len(cids_with_smiles):,}")
print(f"  - Coverage: {coverage:.1f}%")

if coverage < 50:
    print("  WARNING: Low coverage! Only {:.1f}% of CIDs have SMILES.".format(coverage))
    print("   This means most drug names won't be mapped.")
else:
    print(f"\n✓ Good coverage! {coverage:.1f}% of CIDs have SMILES available.")


[STEP 2] Creating CID to SMILES mapping...
✓ Created mapping for 645 CID-SMILES pairs

Sample CID → SMILES mappings:
1. CID000002173
   SMILES: CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)[O-])C...

2. CID000005206
   SMILES: C(OC(C(F)(F)F)C(F)(F)F)F...

3. CID000003929
   SMILES: CC(=O)NCC1CN(C(=O)O1)C2=CC(=C(C=C2)N3CCOCC3)F...

Checking if sample CIDs from DDI dataset exist in SMILES mapping:

Sample Drug1 IDs: ['CID000002173', 'CID000002173', 'CID000002173', 'CID000002173', 'CID000002173']
Sample Drug2 IDs: ['CID000003345', 'CID000003345', 'CID000003345', 'CID000003345', 'CID000003345']

✓ Drug1 matches: 5/5
✓ Drug2 matches: 5/5
  - Unique CIDs in DDI dataset: 645
  - CIDs with SMILES available: 645
  - Coverage: 100.0%

✓ Good coverage! 100.0% of CIDs have SMILES available.


In [11]:
# STEP 3: Function to Get Drug Name from SMILES using PubChem

import requests
import time

print("\n[STEP 3] Setting up PubChem lookup function...")

def get_drug_name_from_smiles(smiles, max_retries=3):
    """
    Fetch drug name from PubChem using SMILES string

    Args:
        smiles: SMILES string representation of the molecule
        max_retries: Number of retry attempts if request fails

    Returns:
        Drug name (string) or None if not found
    """
    for attempt in range(max_retries):
        try:
            # Clean SMILES (remove any whitespace)
            smiles_clean = smiles.strip()

            # PubChem API endpoint
            url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles_clean}/property/Title/JSON"

            response = requests.get(url, timeout=10)

            if response.status_code == 200:
                data = response.json()
                name = data['PropertyTable']['Properties'][0]['Title']
                return name
            elif response.status_code == 404:
                # SMILES not found in PubChem
                return None
            else:
                # Other error, retry
                if attempt < max_retries - 1:
                    time.sleep(1)
                    continue
                return None

        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(1)
                continue
            return None

    return None

print("✓ Lookup function ready")

# Test the function with a sample SMILES
print("\nTesting function with sample SMILES...")
test_cid = 'CID000002173'
test_smiles = cid_to_smiles[test_cid]

print(f"\nTest CID: {test_cid}")
print(f"SMILES: {test_smiles[:60]}...")

print("\nFetching drug name from PubChem...")
test_name = get_drug_name_from_smiles(test_smiles)

if test_name:
    print(f"✓ SUCCESS! Drug name: {test_name}")
    print("\nThe function works correctly!")
else:
    print("✗ Could not retrieve drug name")
    print("This SMILES might not be in PubChem database")

print("Ready to fetch all drug names!")
print(f"We have {len(cid_to_smiles)} drugs to process")
print("Estimated time: ~3-5 minutes (with rate limiting)")


[STEP 3] Setting up PubChem lookup function...
✓ Lookup function ready

Testing function with sample SMILES...

Test CID: CID000002173
SMILES: CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)[O-])C...

Fetching drug name from PubChem...
✓ SUCCESS! Drug name: 6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-oxo-4-thia-1-azabicyclo[3.2.0]heptane-2-carboxylate

The function works correctly!
Ready to fetch all drug names!
We have 645 drugs to process
Estimated time: ~3-5 minutes (with rate limiting)


In [12]:

# STEP 4: Fetch Drug Names for All CIDs


from tqdm.notebook import tqdm
import json

print("\n[STEP 4] Fetching drug names from PubChem...")
print("⏱️  Estimated time: 3-5 minutes")
print("📊 Progress bar will show status\n")

# Get unique CIDs we need to map
unique_cids = list(cid_to_smiles.keys())
print(f"Total CIDs to process: {len(unique_cids):,}")

# Create mapping dictionary
cid_to_name = {}
failed_cids = []
success_count = 0
fail_count = 0

# Process with progress bar
for cid in tqdm(unique_cids, desc="Fetching drug names"):
    smiles = cid_to_smiles.get(cid)

    if smiles:
        name = get_drug_name_from_smiles(smiles)

        if name:
            cid_to_name[cid] = name
            success_count += 1
        else:
            failed_cids.append(cid)
            fail_count += 1

    # Rate limiting - be nice to PubChem servers
    time.sleep(0.3)

    # Save progress every 50 drugs (in case of interruption)
    if len(cid_to_name) % 50 == 0 and len(cid_to_name) > 0:
        with open('cid_to_name_progress.json', 'w') as f:
            json.dump(cid_to_name, f, indent=2)

print("FETCHING COMPLETE!")
print(f"✓ Successfully mapped: {success_count:,} CIDs ({success_count/len(unique_cids)*100:.1f}%)")
print(f"✗ Failed to map: {fail_count:,} CIDs ({fail_count/len(unique_cids)*100:.1f}%)")

if fail_count > 0:
    print(f"\nFailed CIDs (first 10):")
    for i, cid in enumerate(failed_cids[:10], 1):
        print(f"  {i}. {cid}")

# Show sample of successful mappings
print("SAMPLE MAPPINGS")
sample_mappings = list(cid_to_name.items())[:10]
for i, (cid, name) in enumerate(sample_mappings, 1):
    print(f"{i}. {cid} → {name}")

print("\n✓ All drug names fetched successfully!")


[STEP 4] Fetching drug names from PubChem...
⏱️  Estimated time: 3-5 minutes
📊 Progress bar will show status

Total CIDs to process: 645


Fetching drug names:   0%|          | 0/645 [00:00<?, ?it/s]

FETCHING COMPLETE!
✓ Successfully mapped: 619 CIDs (96.0%)
✗ Failed to map: 26 CIDs (4.0%)

Failed CIDs (first 10):
  1. CID000002520
  2. CID000002756
  3. CID000002891
  4. CID000002771
  5. CID000003203
  6. CID000004634
  7. CID000002187
  8. CID000004536
  9. CID000003902
  10. CID000005402
SAMPLE MAPPINGS
1. CID000002173 → 6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-oxo-4-thia-1-azabicyclo[3.2.0]heptane-2-carboxylate
2. CID000005206 → Sevoflurane
3. CID000003929 → rac-Linezolid
4. CID000001302 → 2-(6-Methoxy-2-naphthyl)propionic acid
5. CID000005267 → Lacalmin
6. CID000004601 → Orphenadrine
7. CID000005090 → Rofecoxib
8. CID000004946 → Propranolol
9. CID000005391 → Temazepam
10. CID000002802 → Clonazepam

✓ All drug names fetched successfully!


In [13]:
# STEP 5: Save the CID to Drug Name Mapping

print("\n[STEP 5] Saving CID to drug name mapping...")

# Save as DataFrame (CSV format)
cid_name_df = pd.DataFrame(list(cid_to_name.items()),
                            columns=['CID', 'Drug_Name'])
cid_name_df.to_csv('cid_to_drugname_mapping.csv', index=False)
print("✓ Saved to 'cid_to_drugname_mapping.csv'")

# Save as JSON for quick loading later
with open('cid_to_drugname_mapping.json', 'w') as f:
    json.dump(cid_to_name, f, indent=2)
print("✓ Saved to 'cid_to_drugname_mapping.json'")

# Save failed CIDs for reference
if failed_cids:
    failed_df = pd.DataFrame({'Failed_CID': failed_cids})
    failed_df.to_csv('failed_cids.csv', index=False)
    print(f"✓ Saved {len(failed_cids)} failed CIDs to 'failed_cids.csv'")

# Display the mapping DataFrame
print(f"\n✓ Total mappings saved: {len(cid_name_df):,}")
print("\nFirst 10 mappings in the file:")
print(cid_name_df.head(10))

print("\nLast 10 mappings in the file:")
print(cid_name_df.tail(10))

# Statistics
print("\n" + "="*70)
print("MAPPING STATISTICS")
print("="*70)
print(f"Total CIDs processed: {len(unique_cids):,}")
print(f"Successfully mapped: {len(cid_to_name):,} ({len(cid_to_name)/len(unique_cids)*100:.1f}%)")
print(f"Failed to map: {len(failed_cids):,} ({len(failed_cids)/len(unique_cids)*100:.1f}%)")

# Check drug name lengths (some might be very long chemical names)
name_lengths = cid_name_df['Drug_Name'].str.len()
print(f"\nDrug name lengths:")
print(f"  - Shortest: {name_lengths.min()} characters")
print(f"  - Longest: {name_lengths.max()} characters")
print(f"  - Average: {name_lengths.mean():.1f} characters")

# Find the longest name
longest_idx = name_lengths.idxmax()
print(f"\nLongest drug name:")
print(f"  CID: {cid_name_df.loc[longest_idx, 'CID']}")
print(f"  Name: {cid_name_df.loc[longest_idx, 'Drug_Name']}")

print("\n✓ Mapping files saved successfully!")
print("\nFiles created:")
print("  1. cid_to_drugname_mapping.csv")
print("  2. cid_to_drugname_mapping.json")
print("  3. failed_cids.csv")


[STEP 5] Saving CID to drug name mapping...
✓ Saved to 'cid_to_drugname_mapping.csv'
✓ Saved to 'cid_to_drugname_mapping.json'
✓ Saved 26 failed CIDs to 'failed_cids.csv'

✓ Total mappings saved: 619

First 10 mappings in the file:
            CID                                          Drug_Name
0  CID000002173  6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-...
1  CID000005206                                        Sevoflurane
2  CID000003929                                      rac-Linezolid
3  CID000001302             2-(6-Methoxy-2-naphthyl)propionic acid
4  CID000005267                                           Lacalmin
5  CID000004601                                       Orphenadrine
6  CID000005090                                          Rofecoxib
7  CID000004946                                        Propranolol
8  CID000005391                                          Temazepam
9  CID000002802                                         Clonazepam

Last 10 mappings in the file:

In [15]:
# STEP 6: Apply CID to Drug Name Mapping to DDI Dataset

print("\n[STEP 6] Applying drug name mappings to comprehensive_ddi_dataset...")

# Apply mapping to Drug1
print("\nMapping Drug1 names...")
comprehensive_ddi_dataset['Drug1_Name'] = comprehensive_ddi_dataset['Drug1_ID'].map(cid_to_name)
drug1_mapped = comprehensive_ddi_dataset['Drug1_Name'].notna().sum()
total_rows = len(comprehensive_ddi_dataset)
print(f"✓ Drug1 names mapped: {drug1_mapped:,} / {total_rows:,} ({drug1_mapped/total_rows*100:.1f}%)")

# Apply mapping to Drug2
print("\nMapping Drug2 names...")
comprehensive_ddi_dataset['Drug2_Name'] = comprehensive_ddi_dataset['Drug2_ID'].map(cid_to_name)
drug2_mapped = comprehensive_ddi_dataset['Drug2_Name'].notna().sum()
print(f"✓ Drug2 names mapped: {drug2_mapped:,} / {total_rows:,} ({drug2_mapped/total_rows*100:.1f}%)")

# Map side effects from Y labels
print("\nMapping side effect labels...")

# Load side effect labels (should already be loaded, but just in case)
with open("twosides_side_effect_labels.json", "r") as f:
    side_effect_labels = json.load(f)

# Convert keys to integers for proper mapping
side_effect_labels = {int(k): v for k, v in side_effect_labels.items()}

# Apply side effect mapping
comprehensive_ddi_dataset['SideEffect'] = comprehensive_ddi_dataset['Y'].map(side_effect_labels)
side_effects_mapped = comprehensive_ddi_dataset['SideEffect'].notna().sum()
print(f"✓ Side effects mapped: {side_effects_mapped:,} / {total_rows:,} ({side_effects_mapped/total_rows*100:.1f}%)")

# Calculate rows with complete information
complete_rows = comprehensive_ddi_dataset[
    comprehensive_ddi_dataset['Drug1_Name'].notna() &
    comprehensive_ddi_dataset['Drug2_Name'].notna() &
    comprehensive_ddi_dataset['SideEffect'].notna()
]

print("MAPPING RESULTS")
print(f"Total DDI records: {total_rows:,}")
print(f"Complete records (all fields mapped): {len(complete_rows):,}")
print(f"Percentage complete: {len(complete_rows)/total_rows*100:.1f}%")

# Show sample of mapped data
print("SAMPLE OF MAPPED DATA")
sample_cols = ['Drug1_ID', 'Drug1_Name', 'Drug2_ID', 'Drug2_Name', 'Y', 'SideEffect']
print("\nFirst 10 rows:")
print(comprehensive_ddi_dataset[sample_cols].head(10))

print("\nRandom sample of 10 rows:")
print(comprehensive_ddi_dataset[sample_cols].sample(10))

# Check for any remaining issues
print("DATA QUALITY CHECK")

# Rows with at least one missing drug name
missing_drug_names = comprehensive_ddi_dataset[
    comprehensive_ddi_dataset['Drug1_Name'].isna() |
    comprehensive_ddi_dataset['Drug2_Name'].isna()
]
print(f"Rows with missing drug name(s): {len(missing_drug_names):,} ({len(missing_drug_names)/total_rows*100:.1f}%)")

# Rows with missing side effect
missing_side_effects = comprehensive_ddi_dataset[comprehensive_ddi_dataset['SideEffect'].isna()]
print(f"Rows with missing side effect: {len(missing_side_effects):,} ({len(missing_side_effects)/total_rows*100:.1f}%)")

print("\n✓ Mapping complete!")


[STEP 6] Applying drug name mappings to comprehensive_ddi_dataset...

Mapping Drug1 names...
✓ Drug1 names mapped: 3,143,306 / 3,254,609 (96.6%)

Mapping Drug2 names...
✓ Drug2 names mapped: 3,162,036 / 3,254,609 (97.2%)

Mapping side effect labels...
✓ Side effects mapped: 3,254,609 / 3,254,609 (100.0%)
MAPPING RESULTS
Total DDI records: 3,254,609
Complete records (all fields mapped): 3,053,271
Percentage complete: 93.8%
SAMPLE OF MAPPED DATA

First 10 rows:
       Drug1_ID                                         Drug1_Name  \
0  CID000002173  6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-...   
1  CID000002173  6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-...   
2  CID000002173  6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-...   
3  CID000002173  6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-...   
4  CID000002173  6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-...   
5  CID000002173  6-{[Amino(phenyl)acetyl]amino}-3,3-dimethyl-7-...   
6  CID000002173  6-{[Amino(phenyl)acetyl]amin

In [16]:

# STEP 7: Save Complete and Clean Datasets

# Save full dataset with all mappings (including incomplete rows)
print("\n1. Saving FULL dataset (all rows)...")
comprehensive_ddi_dataset.to_csv('twosides_ddi_with_names_full.csv', index=False)
print(f"✓ Saved {len(comprehensive_ddi_dataset):,} rows to 'twosides_ddi_with_names_full.csv'")

# Save CLEAN dataset (only complete rows - ready for ML)
print("\n2. Saving CLEAN dataset (only complete rows)...")
clean_dataset = comprehensive_ddi_dataset[
    comprehensive_ddi_dataset['Drug1_Name'].notna() &
    comprehensive_ddi_dataset['Drug2_Name'].notna() &
    comprehensive_ddi_dataset['SideEffect'].notna()
].copy()

clean_dataset.to_csv('twosides_ddi_clean.csv', index=False)
print(f"✓ Saved {len(clean_dataset):,} rows to 'twosides_ddi_clean.csv'")
print(f"  This is your MAIN dataset for training!")

# Save a compact version (only essential columns for modeling)
print("\n3. Saving COMPACT dataset (for ML training)...")
compact_cols = ['Drug1_ID', 'Drug1_Name', 'Drug2_ID', 'Drug2_Name', 'Y', 'SideEffect']
clean_dataset[compact_cols].to_csv('twosides_ddi_compact.csv', index=False)
print(f"✓ Saved compact version to 'twosides_ddi_compact.csv'")

# Statistics about the clean dataset
print("CLEAN DATASET STATISTICS")

print(f"\nTotal records: {len(clean_dataset):,}")
print(f"\nUnique drugs:")
print(f"  - Drug1: {clean_dataset['Drug1_Name'].nunique():,}")
print(f"  - Drug2: {clean_dataset['Drug2_Name'].nunique():,}")
print(f"  - Total unique drugs: {pd.concat([clean_dataset['Drug1_Name'], clean_dataset['Drug2_Name']]).nunique():,}")

print(f"\nUnique side effects: {clean_dataset['SideEffect'].nunique():,}")

# Top 20 most common side effects
print("\nTop 20 Most Common Side Effects:")
top_20_side_effects = clean_dataset['SideEffect'].value_counts().head(20)
for idx, (side_effect, count) in enumerate(top_20_side_effects.items(), 1):
    percentage = (count / len(clean_dataset)) * 100
    print(f"  {idx:2d}. {side_effect:<40s} {count:>7,} ({percentage:>5.2f}%)")

# Save side effect distribution
print("\n4. Saving side effect distribution...")
side_effect_dist = clean_dataset['SideEffect'].value_counts().reset_index()
side_effect_dist.columns = ['SideEffect', 'Count']
side_effect_dist['Percentage'] = (side_effect_dist['Count'] / len(clean_dataset) * 100).round(2)
side_effect_dist.to_csv('side_effect_distribution.csv', index=False)
print(f"✓ Saved distribution of {len(side_effect_dist)} side effects to 'side_effect_distribution.csv'")

# Save drug frequency (for deployment autocomplete)
print("\n5. Saving drug frequency list...")
all_drugs = pd.concat([
    clean_dataset[['Drug1_ID', 'Drug1_Name']].rename(columns={'Drug1_ID': 'CID', 'Drug1_Name': 'Drug_Name'}),
    clean_dataset[['Drug2_ID', 'Drug2_Name']].rename(columns={'Drug2_ID': 'CID', 'Drug2_Name': 'Drug_Name'})
])
drug_frequency = all_drugs.groupby(['CID', 'Drug_Name']).size().reset_index(name='Frequency')
drug_frequency = drug_frequency.sort_values('Frequency', ascending=False)
drug_frequency.to_csv('drug_frequency_list.csv', index=False)
print(f"✓ Saved {len(drug_frequency):,} drugs to 'drug_frequency_list.csv'")
print("  (This will be useful for autocomplete in your app)")

# Summary of data distribution
print("DATA DISTRIBUTION ANALYSIS")

# Check class imbalance
max_count = clean_dataset['SideEffect'].value_counts().max()
min_count = clean_dataset['SideEffect'].value_counts().min()
imbalance_ratio = max_count / min_count

print(f"\nClass Imbalance:")
print(f"  - Most common side effect: {max_count:,} occurrences")
print(f"  - Least common side effect: {min_count:,} occurrences")
print(f"  - Imbalance ratio: {imbalance_ratio:.2f}:1")

if imbalance_ratio > 100:
    print(f"  ⚠️  SEVERE class imbalance detected!")
    print(f"     Recommendation: Focus on top 50-100 side effects for modeling")

# Side effects by frequency category
side_effect_counts = clean_dataset['SideEffect'].value_counts()
very_rare = (side_effect_counts < 10).sum()
rare = ((side_effect_counts >= 10) & (side_effect_counts < 100)).sum()
uncommon = ((side_effect_counts >= 100) & (side_effect_counts < 1000)).sum()
common = ((side_effect_counts >= 1000) & (side_effect_counts < 10000)).sum()
very_common = (side_effect_counts >= 10000).sum()

print(f"\nSide Effect Frequency Distribution:")
print(f"  - Very Common (≥10,000): {very_common}")
print(f"  - Common (1,000-9,999): {common}")
print(f"  - Uncommon (100-999): {uncommon}")
print(f"  - Rare (10-99): {rare}")
print(f"  - Very Rare (<10): {very_rare}")

print("FILES CREATED")
print("✓ twosides_ddi_with_names_full.csv (all 3.2M rows)")
print("✓ twosides_ddi_clean.csv (3.05M complete rows) ← USE THIS FOR TRAINING")
print("✓ twosides_ddi_compact.csv (compact version)")
print("✓ side_effect_distribution.csv")
print("✓ drug_frequency_list.csv")
print("\n✓ All files saved successfully!")



1. Saving FULL dataset (all rows)...
✓ Saved 3,254,609 rows to 'twosides_ddi_with_names_full.csv'

2. Saving CLEAN dataset (only complete rows)...
✓ Saved 3,053,271 rows to 'twosides_ddi_clean.csv'
  This is your MAIN dataset for training!

3. Saving COMPACT dataset (for ML training)...
✓ Saved compact version to 'twosides_ddi_compact.csv'
CLEAN DATASET STATISTICS

Total records: 3,053,271

Unique drugs:
  - Drug1: 591
  - Drug2: 607
  - Total unique drugs: 619

Unique side effects: 1,317

Top 20 Most Common Side Effects:
   1. arterial pressure NOS decreased           18,674 ( 0.61%)
   2. anaemia                                   17,636 ( 0.58%)
   3. Difficulty breathing                      16,930 ( 0.55%)
   4. nausea                                    16,469 ( 0.54%)
   5. neumonia                                  16,225 ( 0.53%)
   6. Fatigue                                   15,779 ( 0.52%)
   7. diarrhea                                  15,602 ( 0.51%)
   8. Pain             