# The code has been optimized for easy viewing and understanding by claude.ai

In [1]:
# ============================================================
# Find Excluded Drug Pairs with Different Reaction Types
# ============================================================

import pandas as pd
from google.colab import files

# Step 1: read the original export and the de-duplicated ("processed") export.
print("=" * 60, "Step 1: Loading datasets...", "=" * 60, sep="\n")

df_original = pd.read_csv('/content/DeepDDI-DrunkBunk_Original.csv')
df_processed = pd.read_csv('/content/DDI_unique_interactions.csv')

# Row counts of both files and how many rows the processing step dropped.
print(
    f"\nOriginal dataset: {len(df_original):,} rows",
    f"Processed dataset: {len(df_processed):,} rows",
    f"Removed rows: {len(df_original) - len(df_processed):,}",
    sep="\n",
)

# Step 2: give the original file the same drug-column names as the processed
# file so the same key-building code works on both frames.
print("\n" + "=" * 60, "Step 2: Standardizing column names...", "=" * 60, sep="\n")

column_aliases = {'Drug1': 'Drug1_ID', 'Drug2': 'Drug2_ID'}
df_original = df_original.rename(columns=column_aliases)

print("✓ Columns standardized")

# Column names shared by every later step of this cell.
drug1_col, drug2_col, type_col = 'Drug1_ID', 'Drug2_ID', 'Label'

# Step 3: banner only; the pair-key helper and its use follow.
print("\n" + "=" * 60, "Step 3: Finding excluded pairs...", "=" * 60, sep="\n")

def create_pair_key(row):
    """Return an order-independent key for the drug pair stored in ``row``.

    The two identifiers are looked up through the module-level ``drug1_col``
    and ``drug2_col`` names, converted to strings, sorted, and joined with an
    underscore, so (A, B) and (B, A) produce the same key.
    """
    first, second = sorted((str(row[drug1_col]), str(row[drug2_col])))
    return first + "_" + second

# Tag every row of both frames with its order-independent pair key.
df_original['pair_key'] = df_original.apply(create_pair_key, axis=1)
df_processed['pair_key'] = df_processed.apply(create_pair_key, axis=1)

# Count the distinct labels recorded for each pair in the original data.
print("\nFinding drug pairs with multiple reaction types...")

pair_type_counts = (
    df_original.groupby('pair_key')[type_col]
    .nunique()
    .reset_index(name='num_types')
)

# Pairs carrying more than one distinct label.
has_several_types = pair_type_counts['num_types'] > 1
multi_type_pairs = pair_type_counts.loc[has_several_types, 'pair_key'].tolist()

print(f"\n✓ Drug pairs with multiple reaction types: {len(multi_type_pairs)}")

# Identify (pair, label) combinations that exist in the original data but not
# in the processed data (the banner below labels this "Step 4").
print("\n" + "=" * 60, "Step 4: Extracting excluded reactions...", "=" * 60, sep="\n")

# A row key is the pair key plus the label, so one pair can own several keys.
df_original['row_key'] = df_original['pair_key'] + '_' + df_original[type_col].astype(str)
df_processed['row_key'] = df_processed['pair_key'] + '_' + df_processed[type_col].astype(str)

excluded_row_keys = set(df_original['row_key']).difference(df_processed['row_key'])
is_excluded = df_original['row_key'].isin(excluded_row_keys)
df_excluded = df_original.loc[is_excluded].copy()

print(f"\n✓ Total excluded reactions: {len(df_excluded)}")

# Build the comparison table (the banner below labels this "Step 5"): one row
# per label that the original data records for a multi-label pair but that is
# absent from the processed data for that pair.
print("\n" + "=" * 60)
print("Step 5: Creating comparison table...")
print("=" * 60)

# Fixed column list so the table (and the CSV written from it) keeps its
# header even when no excluded label is found; a column-less frame would make
# later lookups such as df_comparison['Excluded_Label'] raise KeyError.
comparison_columns = [
    'Drug1_ID', 'Drug2_ID', 'Kept_Label', 'Excluded_Label', 'All_Original_Labels',
]
comparison_data = []

# Group each frame by pair once instead of re-filtering every row of both
# frames for every pair. groupby keeps the original row order inside a group,
# so "first row" and label order are the same as with per-pair filtering.
original_multi = df_original[df_original['pair_key'].isin(multi_type_pairs)]
processed_multi = df_processed[df_processed['pair_key'].isin(multi_type_pairs)]

original_rows_by_pair = {
    key: rows for key, rows in original_multi.groupby('pair_key', sort=False)
}
processed_types_by_pair = {
    key: rows[type_col].tolist()
    for key, rows in processed_multi.groupby('pair_key', sort=False)
}

for pair in multi_type_pairs:
    original_rows = original_rows_by_pair[pair]
    original_types = original_rows[type_col].tolist()
    # A pair missing from the processed data simply has no kept labels.
    processed_types = processed_types_by_pair.get(pair, [])

    # Report the drug IDs in the orientation of the pair's first original row.
    pair_row = original_rows.iloc[0]
    drug1 = pair_row[drug1_col]
    drug2 = pair_row[drug2_col]

    kept_label = processed_types[0] if processed_types else 'None'
    all_labels = str(list(set(original_types)))

    for excluded_type in original_types:
        if excluded_type in processed_types:
            continue
        comparison_data.append({
            'Drug1_ID': drug1,
            'Drug2_ID': drug2,
            'Kept_Label': kept_label,
            'Excluded_Label': excluded_type,
            'All_Original_Labels': all_labels,
        })

df_comparison = pd.DataFrame(comparison_data, columns=comparison_columns)

print(f"\n✓ Comparison table: {len(df_comparison)} excluded reactions")

# Show the comparison table built above (the banner below labels this
# "Step 6"). `display` is the notebook's rich-output helper.
print("\n" + "=" * 60)
print("Step 6: Results - Excluded Reactions")
print("=" * 60)

print("\n Excluded reactions (can be used for validation):\n")
display(df_comparison)

# Summary: row counts of both datasets, their difference, the number of
# multi-label pairs, and the number of rows in the comparison table.
print("\n" + "=" * 60)
print("Summary Statistics")
print("=" * 60)

print(f"""
 Summary:
   ├── Original dataset:           {len(df_original):,} rows
   ├── Processed dataset:          {len(df_processed):,} rows
   ├── Total removed:              {len(df_original) - len(df_processed):,} rows
   ├── Pairs with multiple types:  {len(multi_type_pairs)}
   └── Excluded reactions:         {len(df_comparison)}
""")

# How often each label occurs among the excluded rows.
print("\nDistribution of Excluded Labels:")
display(df_comparison['Excluded_Label'].value_counts())

# Write the comparison table to CSV (without the index column) and trigger a
# browser download through the Colab `files` helper.
print("\n" + "=" * 60)
print("Saving results...")
print("=" * 60)

df_comparison.to_csv('/content/excluded_pairs_comparison.csv', index=False)
print("✓ Saved: excluded_pairs_comparison.csv")

files.download('/content/excluded_pairs_comparison.csv')

# Closing banner and usage note.
print("\n" + "=" * 60)
print("COMPLETE!")
print("=" * 60)
print("""
Use 'excluded_pairs_comparison.csv' for your Case Study section.
These are real DDI interactions your model never saw during training!
""")

Step 1: Loading datasets...

Original dataset: 192,283 rows
Processed dataset: 191,877 rows
Removed rows: 406

Step 2: Standardizing column names...
✓ Columns standardized

Step 3: Finding excluded pairs...

Finding drug pairs with multiple reaction types...

✓ Drug pairs with multiple reaction types: 406

Step 4: Extracting excluded reactions...

✓ Total excluded reactions: 406

Step 5: Creating comparison table...

✓ Comparison table: 406 excluded reactions

Step 6: Results - Excluded Reactions

 Excluded reactions (can be used for validation):



Unnamed: 0,Drug1_ID,Drug2_ID,Kept_Label,Excluded_Label,All_Original_Labels
0,DB00541,DB00091,47,75,"[75, 47]"
1,DB00091,DB00997,57,73,"[73, 57]"
2,DB09027,DB00091,73,75,"[73, 75]"
3,DB00997,DB00169,47,73,"[73, 47]"
4,DB00541,DB00176,47,73,"[73, 47]"
...,...,...,...,...,...
401,DB06204,DB09241,49,60,"[49, 60]"
402,DB08899,DB06697,11,75,"[75, 11]"
403,DB09280,DB06697,11,75,"[75, 11]"
404,DB09280,DB06708,4,75,"[75, 4]"



Summary Statistics

 Summary:
   ├── Original dataset:           192,283 rows
   ├── Processed dataset:          191,877 rows
   ├── Total removed:              406 rows
   ├── Pairs with multiple types:  406
   └── Excluded reactions:         406


Distribution of Excluded Labels:


Unnamed: 0_level_0,count
Excluded_Label,Unnamed: 1_level_1
73,239
75,113
49,26
64,7
11,7
69,4
60,4
71,2
68,2
47,1



Saving results...
✓ Saved: excluded_pairs_comparison.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


COMPLETE!

Use 'excluded_pairs_comparison.csv' for your Case Study section.
These are real DDI interactions your model never saw during training!



In [3]:
# ============================================================
# Check Column Names for Each File
# ============================================================

import pandas as pd

# File 1: the original export. Print its row count, column names, and head.
print("=" * 60, "FILE 1: DeepDDI-DrunkBunk_Original.csv", "=" * 60, sep="\n")

df_original = pd.read_csv('/content/DeepDDI-DrunkBunk_Original.csv')

print(f"\nRows: {len(df_original):,}")
print(f"\nColumn Names:")
for position, column_name in enumerate(df_original.columns):
    print(f"   {position}: '{column_name}'")

print("\nSample data:")
display(df_original.head())

# Spacer between the two file reports.
print("\n\n")

# File 2: the processed export, reported the same way.
print("=" * 60, "FILE 2: DDI_unique_interactions.csv", "=" * 60, sep="\n")

df_processed = pd.read_csv('/content/DDI_unique_interactions.csv')

print(f"\nRows: {len(df_processed):,}")
print(f"\nColumn Names:")
for position, column_name in enumerate(df_processed.columns):
    print(f"   {position}: '{column_name}'")

print("\nSample data:")
display(df_processed.head())

FILE 1: DeepDDI-DrunkBunk_Original.csv

Rows: 192,283

Column Names:
   0: 'Drug1'
   1: 'Drug2'
   2: 'Label'

Sample data:


Unnamed: 0,Drug1,Drug2,Label
0,DB04571,DB00460,1
1,DB00855,DB00460,1
2,DB09536,DB00460,1
3,DB01600,DB00460,1
4,DB09000,DB00460,1





FILE 2: DDI_unique_interactions.csv

Rows: 191,877

Column Names:
   0: 'Drug1_ID'
   1: 'Drug2_ID'
   2: 'Label'

Sample data:


Unnamed: 0,Drug1_ID,Drug2_ID,Label
0,DB00460,DB04571,1
1,DB00460,DB00855,1
2,DB00460,DB09536,1
3,DB00460,DB01600,1
4,DB00460,DB09000,1




# 'TOTAL Found in Top 3' is the key metric in the results below 👇👇

In [5]:
# ============================================================
# Validate Model Predictions on Excluded Reactions
# ============================================================

import pandas as pd
from google.colab import files

# Step 1: read the excluded-label table and the model's top-3 predictions.
print("=" * 60, "Step 1: Loading files...", "=" * 60, sep="\n")

df_excluded = pd.read_csv('/content/excluded_pairs_comparison.csv')
df_predictions = pd.read_csv('/content/test_predictions_top3.csv')  # Change filename if needed

print(
    f"\nExcluded pairs: {len(df_excluded)} rows",
    f"Predictions: {len(df_predictions)} rows",
    sep="\n",
)

# Preview the first rows of each input.
print("\nExcluded pairs sample:")
display(df_excluded.head())

print("\nPredictions sample:")
display(df_predictions.head())

# Step 2: banner only; the key helper and its use follow.
print("\n" + "=" * 60, "Step 2: Creating pair keys for matching...", "=" * 60, sep="\n")

def create_pair_key(drug1, drug2):
    """Return an order-independent key for a drug pair.

    Both identifiers are converted to strings, sorted, and joined with an
    underscore, so swapping the arguments yields the same key.
    """
    return "_".join(sorted((str(drug1), str(drug2))))

# Attach the order-independent pair key to both frames so a pair matches
# regardless of which drug is listed first.
df_excluded['pair_key'] = df_excluded.apply(
    lambda rec: create_pair_key(rec['Drug1_ID'], rec['Drug2_ID']),
    axis=1,
)

df_predictions['pair_key'] = df_predictions.apply(
    lambda rec: create_pair_key(rec['Drug1_ID'], rec['Drug2_ID']),
    axis=1,
)
print("Pair keys created")

# Step 3: keep only pairs present in both frames. Columns that exist in both
# inputs receive the '_excluded' / '_pred' suffixes.
print("\n" + "=" * 60, "Step 3: Matching excluded pairs with predictions...", "=" * 60, sep="\n")

df_merged = pd.merge(
    df_excluded,
    df_predictions,
    how='inner',
    on='pair_key',
    suffixes=('_excluded', '_pred'),
)

print(f"\n✓ Matched pairs: {len(df_merged)}")

# Step 4: banner only; the ranking helper and its use follow.
print("\n" + "=" * 60, "Step 4: Checking if excluded labels appear in Top 3...", "=" * 60, sep="\n")

def check_excluded_in_top3(row):
    """Report where the excluded label sits among the three ranked predictions.

    Args:
        row: Mapping-like record providing 'Excluded_Label' and
            'Predicted_Type_1' .. 'Predicted_Type_3'.

    Returns:
        'Top1', 'Top2' or 'Top3' for the first rank whose prediction equals
        the excluded label, otherwise 'Not_Found'.
    """
    target = row['Excluded_Label']
    # Read all three predictions up front, in rank order.
    ranked_predictions = (
        ('Top1', row['Predicted_Type_1']),
        ('Top2', row['Predicted_Type_2']),
        ('Top3', row['Predicted_Type_3']),
    )

    for rank_name, predicted in ranked_predictions:
        if target == predicted:
            return rank_name
    return 'Not_Found'

# Classify every matched row as Top1 / Top2 / Top3 / Not_Found.
df_merged['Excluded_Found_In'] = df_merged.apply(check_excluded_in_top3, axis=1)

# Step 5: Results Analysis
print("\n" + "=" * 60)
print("Step 5: Results Analysis")
print("=" * 60)

# Number of matched rows per outcome.
found_counts = df_merged['Excluded_Found_In'].value_counts()
total = len(df_merged)

print("\nWhere Excluded Labels Were Found:\n")
print(found_counts)

# Outcomes that never occurred are absent from value_counts, hence .get(..., 0).
top1_count = found_counts.get('Top1', 0)
top2_count = found_counts.get('Top2', 0)
top3_count = found_counts.get('Top3', 0)
not_found_count = found_counts.get('Not_Found', 0)

# Headline metric: the excluded label appears anywhere in the top-3 list.
in_top3_total = top1_count + top2_count + top3_count

# With zero matched pairs every count is 0, and dividing by `total` would
# raise ZeroDivisionError. Use 1 as the denominator in that case so the box
# reports 0.00% everywhere instead of crashing.
pct_base = total if total else 1
if total == 0:
    print("\n⚠️ No excluded pairs matched any prediction row; percentages below are all 0.")

# The trailing padding of every row is identical so the right border lines up.
print(f"""
╔══════════════════════════════════════════════════════════╗
║                 VALIDATION RESULTS                       ║
╠══════════════════════════════════════════════════════════╣
║  Total excluded pairs matched:     {total:>6}                ║
╠══════════════════════════════════════════════════════════╣
║  Found in Top 1 (Primary):         {top1_count:>6}  ({top1_count/pct_base*100:>5.2f}%)     ║
║  Found in Top 2 (Secondary):       {top2_count:>6}  ({top2_count/pct_base*100:>5.2f}%)     ║
║  Found in Top 3 (Tertiary):        {top3_count:>6}  ({top3_count/pct_base*100:>5.2f}%)     ║
╠══════════════════════════════════════════════════════════╣
║  ✓ TOTAL Found in Top 3:           {in_top3_total:>6}  ({in_top3_total/pct_base*100:>5.2f}%)     ║
║  ✗ Not Found in Top 3:             {not_found_count:>6}  ({not_found_count/pct_base*100:>5.2f}%)     ║
╚══════════════════════════════════════════════════════════╝
""")

# Step 6: build the per-pair results table from the merged frame.
print("\n" + "=" * 60, "Step 6: Detailed Results Table", "=" * 60, sep="\n")

# Merged-frame column -> name used in the final table, in output order.
result_column_names = {
    'Drug1_ID_excluded': 'Drug1_ID',
    'Drug2_ID_excluded': 'Drug2_ID',
    'Kept_Label': 'Kept_Label',
    'Excluded_Label': 'Excluded_Label',
    'Predicted_Type_1': 'Pred_Top1',
    'Type_1_Score': 'Score_Top1',
    'Predicted_Type_2': 'Pred_Top2',
    'Type_2_Score': 'Score_Top2',
    'Predicted_Type_3': 'Pred_Top3',
    'Type_3_Score': 'Score_Top3',
    'Excluded_Found_In': 'Excluded_Found_In',
}

# Select the relevant columns, then rename them for readability.
df_results = df_merged[list(result_column_names)].rename(columns=result_column_names)

print("\nDetailed Results:\n")
display(df_results)

# Step 7: rows whose excluded label appeared anywhere in the top-3 list.
print("\n" + "=" * 60, "Step 7: Successful Predictions (Excluded Found in Top 3)", "=" * 60, sep="\n")

df_success = df_results.loc[df_results['Excluded_Found_In'] != 'Not_Found']
print(f"\n✓ {len(df_success)} excluded labels were found in Top 3 predictions:\n")
display(df_success)

# Step 8: the same rows split by the rank at which the label was found.
# Each section shows its table, or "None" when no row has that rank.
print("\n" + "=" * 60, "Step 8: Breakdown by Position", "=" * 60, sep="\n")

print("\nFound in Top 1 (Primary Prediction):")
df_top1 = df_results.loc[df_results['Excluded_Found_In'] == 'Top1']
if len(df_top1) == 0:
    print("   None")
else:
    display(df_top1)

print("\nFound in Top 2 (Secondary Prediction):")
df_top2 = df_results.loc[df_results['Excluded_Found_In'] == 'Top2']
if len(df_top2) == 0:
    print("   None")
else:
    display(df_top2)

print("\nFound in Top 3 (Tertiary Prediction):")
df_top3 = df_results.loc[df_results['Excluded_Found_In'] == 'Top3']
if len(df_top3) == 0:
    print("   None")
else:
    display(df_top3)

# Step 9: write both tables to CSV (without the index column), then hand
# them to the browser through the Colab `files` helper.
print("\n" + "=" * 60, "Step 9: Saving results...", "=" * 60, sep="\n")

# Full results table.
df_results.to_csv('/content/validation_excluded_labels.csv', index=False)
print("✓ Saved: validation_excluded_labels.csv")

# Only the rows where the excluded label was found in the top 3.
df_success.to_csv('/content/successful_predictions_excluded.csv', index=False)
print("✓ Saved: successful_predictions_excluded.csv")

files.download('/content/validation_excluded_labels.csv')
files.download('/content/successful_predictions_excluded.csv')

print("\n" + "=" * 60, "COMPLETE!", "=" * 60, sep="\n")


Step 1: Loading files...

Excluded pairs: 406 rows
Predictions: 19200 rows

Excluded pairs sample:


Unnamed: 0,Drug1_ID,Drug2_ID,Kept_Label,Excluded_Label,All_Original_Labels
0,DB00541,DB00091,47,75,"[75, 47]"
1,DB00091,DB00997,57,73,"[73, 57]"
2,DB09027,DB00091,73,75,"[73, 75]"
3,DB00997,DB00169,47,73,"[73, 47]"
4,DB00541,DB00176,47,73,"[73, 47]"



Predictions sample:


Unnamed: 0,Drug1_ID,Drug1_Name,Drug2_ID,Drug2_Name,Predicted_Type_1,Type_1_Score,Predicted_Type_2,Type_2_Score,Predicted_Type_3,Type_3_Score,True_Type_Index,True_Type_Name,Match_Top1,In_Top3,Type_1_Translation,Type_2_Translation,Type_3_Translation,True_Translation
0,DB00177,Valsartan,DB00266,Dicoumarol,47,0.835144,73,0.15063,49,0.004789,47,Type_47,Yes,Yes,The metabolism of Dicoumarol can be decreased ...,The serum concentration of Dicoumarol can be i...,The risk or severity of adverse effects can be...,The metabolism of Dicoumarol can be decreased ...
1,DB00270,Isradipine,DB00426,Famciclovir,47,0.981288,4,0.008613,73,0.008186,47,Type_47,Yes,Yes,The metabolism of Famciclovir can be decreased...,The metabolism of Famciclovir can be increased...,The serum concentration of Famciclovir can be ...,The metabolism of Famciclovir can be decreased...
2,DB00218,Moxifloxacin,DB01075,Diphenhydramine,20,0.998851,33,0.000559,82,0.000453,20,Type_20,Yes,Yes,Moxifloxacin may increase the QTc-prolonging a...,The risk or severity of QTc prolongation can b...,Moxifloxacin may increase the arrhythmogenic a...,Moxifloxacin may increase the QTc-prolonging a...
3,DB00655,Estrone,DB01250,Olsalazine,49,0.964823,73,0.017456,70,0.005702,49,Type_49,Yes,Yes,The risk or severity of adverse effects can be...,The serum concentration of Olsalazine can be i...,The therapeutic efficacy of Olsalazine can be ...,The risk or severity of adverse effects can be...
4,DB01590,Everolimus,DB08933,Luliconazole,73,0.992203,75,0.00707,49,0.000312,73,Type_73,Yes,Yes,The serum concentration of Luliconazole can be...,The serum concentration of Luliconazole can be...,The risk or severity of adverse effects can be...,The serum concentration of Luliconazole can be...



Step 2: Creating pair keys for matching...
Pair keys created

Step 3: Matching excluded pairs with predictions...

✓ Matched pairs: 38

Step 4: Checking if excluded labels appear in Top 3...

Step 5: Results Analysis

Where Excluded Labels Were Found:

Excluded_Found_In
Top2         22
Top1          9
Top3          5
Not_Found     2
Name: count, dtype: int64

╔══════════════════════════════════════════════════════════╗
║                 VALIDATION RESULTS                       ║
╠══════════════════════════════════════════════════════════╣
║  Total excluded pairs matched:         38                ║
╠══════════════════════════════════════════════════════════╣
║  Found in Top 1 (Primary):              9  (23.68%)     ║
║  Found in Top 2 (Secondary):           22  (57.89%)     ║
║  Found in Top 3 (Tertiary):             5  (13.16%)     ║
╠══════════════════════════════════════════════════════════╣
║  ✓ TOTAL Found in Top 3:               36  (94.74%)     ║
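║  ✗ Not Found in Top 3:                  2  ( 5.26%)     ║
╚══════════════════════════════════════════════════════════╝

Step 6: Detailed Results Table

Detailed Results: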

Unnamed: 0,Drug1_ID,Drug2_ID,Kept_Label,Excluded_Label,Pred_Top1,Score_Top1,Pred_Top2,Score_Top2,Pred_Top3,Score_Top3,Excluded_Found_In
0,DB00182,DB00956,49,69,69,0.929948,49,0.068907,41,0.000757,Top1
1,DB00238,DB00997,4,73,4,0.696234,47,0.247736,73,0.053151,Top3
2,DB00312,DB00243,4,75,4,0.59592,47,0.148596,73,0.143834,Not_Found
3,DB01201,DB00243,4,75,4,0.696462,75,0.204739,73,0.074771,Top2
4,DB00252,DB06697,11,73,75,0.548079,73,0.217317,4,0.111382,Top2
5,DB00252,DB09123,70,75,70,0.924841,75,0.061503,4,0.010662,Top2
6,DB00541,DB00285,47,73,73,0.587779,47,0.365197,11,0.02295,Top1
7,DB00541,DB00305,49,73,73,0.889574,49,0.080061,47,0.015753,Top1
8,DB00312,DB00541,4,75,4,0.532205,47,0.182958,75,0.131363,Top3
9,DB00327,DB00752,47,49,49,0.740517,8,0.127832,41,0.047255,Top1



Step 7: Successful Predictions (Excluded Found in Top 3)

✓ 36 excluded labels were found in Top 3 predictions:



Unnamed: 0,Drug1_ID,Drug2_ID,Kept_Label,Excluded_Label,Pred_Top1,Score_Top1,Pred_Top2,Score_Top2,Pred_Top3,Score_Top3,Excluded_Found_In
0,DB00182,DB00956,49,69,69,0.929948,49,0.068907,41,0.000757,Top1
1,DB00238,DB00997,4,73,4,0.696234,47,0.247736,73,0.053151,Top3
3,DB01201,DB00243,4,75,4,0.696462,75,0.204739,73,0.074771,Top2
4,DB00252,DB06697,11,73,75,0.548079,73,0.217317,4,0.111382,Top2
5,DB00252,DB09123,70,75,70,0.924841,75,0.061503,4,0.010662,Top2
6,DB00541,DB00285,47,73,73,0.587779,47,0.365197,11,0.02295,Top1
7,DB00541,DB00305,49,73,73,0.889574,49,0.080061,47,0.015753,Top1
8,DB00312,DB00541,4,75,4,0.532205,47,0.182958,75,0.131363,Top3
9,DB00327,DB00752,47,49,49,0.740517,8,0.127832,41,0.047255,Top1
10,DB00997,DB00328,72,73,72,0.976405,73,0.010495,49,0.00939,Top2



Step 8: Breakdown by Position

Found in Top 1 (Primary Prediction):


Unnamed: 0,Drug1_ID,Drug2_ID,Kept_Label,Excluded_Label,Pred_Top1,Score_Top1,Pred_Top2,Score_Top2,Pred_Top3,Score_Top3,Excluded_Found_In
0,DB00182,DB00956,49,69,69,0.929948,49,0.068907,41,0.000757,Top1
6,DB00541,DB00285,47,73,73,0.587779,47,0.365197,11,0.02295,Top1
7,DB00541,DB00305,49,73,73,0.889574,49,0.080061,47,0.015753,Top1
9,DB00327,DB00752,47,49,49,0.740517,8,0.127832,41,0.047255,Top1
16,DB00541,DB01232,47,73,73,0.522243,47,0.390057,75,0.045219,Top1
17,DB00541,DB08873,47,73,73,0.617976,47,0.301613,75,0.045055,Top1
32,DB01229,DB01072,47,73,73,0.609677,47,0.296273,49,0.066745,Top1
35,DB01224,DB01233,20,49,49,0.501783,20,0.465319,82,0.017977,Top1
37,DB09280,DB06708,4,75,75,0.706389,73,0.245007,4,0.042275,Top1



Found in Top 2 (Secondary Prediction):


Unnamed: 0,Drug1_ID,Drug2_ID,Kept_Label,Excluded_Label,Pred_Top1,Score_Top1,Pred_Top2,Score_Top2,Pred_Top3,Score_Top3,Excluded_Found_In
3,DB01201,DB00243,4,75,4,0.696462,75,0.204739,73,0.074771,Top2
4,DB00252,DB06697,11,73,75,0.548079,73,0.217317,4,0.111382,Top2
5,DB00252,DB09123,70,75,70,0.924841,75,0.061503,4,0.010662,Top2
10,DB00997,DB00328,72,73,72,0.976405,73,0.010495,49,0.00939,Top2
11,DB00420,DB01233,20,49,20,0.687767,49,0.247326,47,0.025494,Top2
12,DB00502,DB01233,20,49,20,0.651354,49,0.295812,47,0.031929,Top2
14,DB00794,DB00541,4,75,4,0.540584,75,0.365346,73,0.069971,Top2
15,DB01229,DB00541,49,73,49,0.905301,73,0.058211,47,0.025525,Top2
18,DB00564,DB01115,4,75,4,0.693713,75,0.213188,47,0.032707,Top2
19,DB00613,DB01685,47,73,47,0.996369,73,0.00358,4,1.8e-05,Top2



Found in Top 3 (Tertiary Prediction):


Unnamed: 0,Drug1_ID,Drug2_ID,Kept_Label,Excluded_Label,Pred_Top1,Score_Top1,Pred_Top2,Score_Top2,Pred_Top3,Score_Top3,Excluded_Found_In
1,DB00238,DB00997,4,73,4,0.696234,47,0.247736,73,0.053151,Top3
8,DB00312,DB00541,4,75,4,0.532205,47,0.182958,75,0.131363,Top3
13,DB00531,DB00997,15,73,15,0.943557,49,0.054773,73,0.001056,Top3
26,DB09241,DB00956,49,60,49,0.778765,41,0.15933,60,0.050542,Top3
33,DB01174,DB01115,4,75,4,0.672874,60,0.191292,75,0.086014,Top3



Step 9: Saving results...
✓ Saved: validation_excluded_labels.csv
✓ Saved: successful_predictions_excluded.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


COMPLETE!


In [6]:
# ============================================================
# Verify Data Cleaning - Check for Other Issues
# ============================================================

import pandas as pd

# Read both exports again so this check does not depend on earlier cells.
df_original = pd.read_csv('/content/DeepDDI-DrunkBunk_Original.csv')
df_processed = pd.read_csv('/content/DDI_unique_interactions.csv')

# Give the original file the processed file's drug-column names.
drug_column_aliases = {'Drug1': 'Drug1_ID', 'Drug2': 'Drug2_ID'}
df_original = df_original.rename(columns=drug_column_aliases)

print("=" * 60, "DATA QUALITY CHECK", "=" * 60, sep="\n")

# 1. Rows that repeat an earlier row in every column.
is_exact_repeat = df_original.duplicated()
exact_duplicates = is_exact_repeat.sum()
print(f"\n1. Exact duplicate rows in original: {exact_duplicates}")

# 2. Create sorted pair key to check inverse duplicates
def create_pair_key(row):
    """Return an order-independent key for the drug pair stored in ``row``.

    Reads 'Drug1_ID' and 'Drug2_ID', converts both to strings, sorts them,
    and joins them with an underscore, so (A, B) and (B, A) share one key.
    """
    first, second = sorted((str(row['Drug1_ID']), str(row['Drug2_ID'])))
    return first + "_" + second

# Order-independent pair key, and a (pair, label) key built on top of it.
df_original['pair_key'] = df_original.apply(create_pair_key, axis=1)
df_original['pair_label_key'] = df_original['pair_key'] + '_' + df_original['Label'].astype(str)

# 2. Inverse duplicates (A+B and B+A with the same label).
# duplicated() on the (pair, label) key also flags exact duplicate rows, which
# are already reported separately, so subtract them to avoid counting the same
# row in two categories.
same_pair_label_repeats = df_original['pair_label_key'].duplicated().sum()
inverse_duplicates = same_pair_label_repeats - exact_duplicates
print(f"2. Inverse duplicates (A+B = B+A, same label): {inverse_duplicates}")

# 3. Pairs that carry more than one distinct label.
pair_label_counts = df_original.groupby('pair_key')['Label'].nunique()
multi_label_counts = pair_label_counts[pair_label_counts > 1]
multi_label_pairs = len(multi_label_counts)
print(f"3. Pairs with multiple labels: {multi_label_pairs}")

# Raw number of rows belonging to multi-label pairs (kept for inspection;
# it includes any duplicate rows and is therefore not used for accounting).
total_multi_label_rows = df_original[df_original['pair_key'].isin(
    multi_label_counts.index
)].shape[0]

# 4. Extra rows from multi-label pairs: every distinct label beyond the first
# of each pair. Counting distinct labels (not rows) keeps duplicate rows out
# of this category, since they are already counted above.
rows_to_remove = (multi_label_counts - 1).sum()
print(f"4. Extra rows from multi-label pairs: {rows_to_remove}")

# Summary: compare the observed row difference with the explained total.
difference = len(df_original) - len(df_processed)
total_explained = exact_duplicates + inverse_duplicates + rows_to_remove

print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"""
Original rows:                    {len(df_original):,}
Processed rows:                   {len(df_processed):,}
Difference:                       {difference}

Rows removed due to:
  - Exact duplicates:             {exact_duplicates}
  - Inverse duplicates:           {inverse_duplicates}
  - Multiple labels (secondary):  {rows_to_remove}

TOTAL explained:                  {total_explained}
""")

# "Multiple reaction types only" holds only if the difference is fully
# explained AND neither kind of duplicate contributed to it.
only_multi_label = (
    difference == total_explained
    and exact_duplicates == 0
    and inverse_duplicates == 0
)
if only_multi_label:
    print("✅ CONFIRMED: All removed rows are due to multiple reaction types only!")
    print("✅ Your original data was clean!")
else:
    print("⚠️ There may be other factors - check the numbers above")

DATA QUALITY CHECK

1. Exact duplicate rows in original: 0
2. Inverse duplicates (A+B = B+A, same label): 0
3. Pairs with multiple labels: 406
4. Extra rows from multi-label pairs: 406

SUMMARY

Original rows:                    192,283
Processed rows:                   191,877
Difference:                       406

Rows removed due to:
  - Exact duplicates:             0
  - Inverse duplicates:           0
  - Multiple labels (secondary):  406

TOTAL explained:                  406

✅ CONFIRMED: All removed rows are due to multiple reaction types only!
✅ Your original data was clean!
