In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the CSV paths used
# in the cells below (under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import os

In [None]:

def _detect_columns(columns):
    """Pick the drug-ID and SMILES column labels from a sequence of column labels.

    The SMILES column is resolved first (first label containing 'smile') and is
    excluded from the drug-ID search, so a label such as 'Drug_SMILES' can never
    be selected as the ID column. Returns ``(drug_id_col, smile_col)``.

    Raises:
        ValueError: if fewer than two columns are available.
    """
    columns = list(columns)
    if len(columns) < 2:
        raise ValueError(
            f"Expected at least two columns (drug ID and SMILES), got: {columns}"
        )

    def first_match(candidates, keywords):
        # str() so that non-string column labels do not break .lower()
        for col in candidates:
            name = str(col).lower()
            if any(keyword in name for keyword in keywords):
                return col
        return None

    smile_col = first_match(columns, ('smile',))

    id_candidates = [col for col in columns if col != smile_col]
    drug_id_col = first_match(id_candidates, ('drug', 'id'))
    if drug_id_col is None:
        drug_id_col = id_candidates[0]  # Fallback: first column that is not SMILES

    if smile_col is None:
        # Fallback: first column that is not the ID column (the second column
        # when the ID is the first one).
        smile_col = next(col for col in columns if col != drug_id_col)

    return drug_id_col, smile_col


def _extract_kmers(smile, k):
    """Return every length-``k`` substring of ``smile`` (sliding window, stride 1).

    Missing values (NaN/None), non-string values, and strings shorter than
    ``k`` yield an empty list.
    """
    if not isinstance(smile, str) or len(smile) < k:
        return []
    return [smile[i:i + k] for i in range(len(smile) - k + 1)]


def process_drug_kmer(input_path, output_dir, k_values=(3, 6, 9, 12, 15)):
    """Split each drug's SMILES string into overlapping k-mers and save one CSV per k.

    For every ``k`` in ``k_values`` a file ``drugbank_kmers_k{k}.csv`` is written
    to ``output_dir`` with two fully quoted columns: ``Drug_ID`` and
    ``Segmented_SMILE`` (the drug's k-mers joined with ';', or an empty string
    when the SMILES is missing or shorter than ``k``). Progress, a short sample
    and summary statistics are printed for each ``k``.

    Args:
        input_path: Path of the input CSV. The SMILES column is the first one
            whose name contains 'smile'; the ID column is the first other
            column whose name contains 'drug' or 'id'. Positional fallbacks are
            used when no name matches.
        output_dir: Directory for the result files. It is created if missing;
            an existing directory is reused, so the function can be re-run.
        k_values: Iterable of positive integer k-mer lengths.

    Raises:
        ValueError: if a value in ``k_values`` is not a positive integer, or the
            input has fewer than two columns.
        FileNotFoundError: if ``input_path`` does not exist (from pandas).

    Note:
        ';' is used as the k-mer separator in the output. The function does not
        check whether a SMILES string itself contains ';'.
    """
    # Materialise once: k_values is iterated twice (processing + final listing).
    k_values = list(k_values)
    for k in k_values:
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")

    print(f"Reading data from: {input_path}")

    # Read the CSV file
    df = pd.read_csv(input_path)

    # Auto-detect column names
    print("Available columns:", df.columns.tolist())
    drug_id_col, smile_col = _detect_columns(df.columns)

    print(f"Using columns: Drug ID = '{drug_id_col}', SMILE = '{smile_col}'")
    print(f"Dataset contains {len(df)} drugs")

    # Create output directory; exist_ok=True keeps re-runs from failing.
    os.makedirs(output_dir, exist_ok=True)

    # Pull the two columns out once instead of walking df.iterrows() per k.
    drug_ids = df[drug_id_col].tolist()
    smiles_values = df[smile_col].tolist()

    for k in k_values:
        print(f"\n{'='*50}")
        print(f"PROCESSING WITH K={k}")
        print(f"{'='*50}")

        records = []
        kmers_per_drug = []   # k-mer count per drug, tracked directly
        unique_kmers = set()  # distinct k-mers across all drugs
        samples = []          # (drug_id, kmers) for the first 3 drugs

        for drug_id, smile in zip(drug_ids, smiles_values):
            kmers = _extract_kmers(smile, k)
            unique_kmers.update(kmers)
            kmers_per_drug.append(len(kmers))
            if len(samples) < 3:
                samples.append((drug_id, kmers))
            records.append({
                'Drug_ID': drug_id,
                'Segmented_SMILE': ';'.join(kmers)
            })

        # Explicit columns keep the header present even for an empty input.
        results_df = pd.DataFrame(records, columns=['Drug_ID', 'Segmented_SMILE'])
        output_file = os.path.join(output_dir, f'drugbank_kmers_k{k}.csv')
        results_df.to_csv(output_file, index=False, quoting=1)  # 1 == csv.QUOTE_ALL

        print(f"Results saved to: {output_file}")
        print(f"  Total drugs processed: {len(results_df)}")

        # Display sample results
        if samples:
            print("\nSample of first 3 results:")
            for drug_id, kmers in samples:
                if kmers:
                    # Show first few k-mers
                    sample_kmers = ';'.join(kmers[:5])
                    if len(kmers) > 5:
                        sample_kmers += '...'
                    print(f"  {drug_id}: {len(kmers)} k-mers")
                    print(f"    Sample: {sample_kmers}")
                else:
                    print(f"  {drug_id}: No k-mers (SMILE too short or empty)")

        # Summary statistics (drugs without k-mers count as 0)
        avg_kmers = np.mean(kmers_per_drug) if kmers_per_drug else 0
        min_kmers = min(kmers_per_drug) if kmers_per_drug else 0
        max_kmers = max(kmers_per_drug) if kmers_per_drug else 0

        print(f"\nStatistics for k={k}:")
        print(f"  Total unique k-mers: {len(unique_kmers)}")
        print(f"  Average k-mers per drug: {avg_kmers:.2f}")
        print(f"  Min k-mers per drug: {min_kmers}")
        print(f"  Max k-mers per drug: {max_kmers}")

    print("\n All processing completed!")
    print(f"Results saved in: {output_dir}")
    print("\nGenerated files:")
    for k in k_values:
        print(f"  - drugbank_kmers_k{k}.csv")



In [None]:
# Example usage: run the k-mer segmentation on the DrugBank SMILES table and
# report success or a short, readable error message.
if __name__ == "__main__":
    # Update these paths according to your setup
    input_file = '/content/drive/MyDrive/MLHygnn/DB/Drugs_With_SMILES.csv' # can change to your link
    output_dir = '/content/drive/MyDrive/MLHygnn/DB/kmer_results_simple1709drugs'

    # Process with different k values
    k_values = [3, 6, 9, 12, 15]

    try:
        process_drug_kmer(input_file, output_dir, k_values)
        # Status markers below were mis-encoded (UTF-8 read as Mac Roman); restored.
        print("\n✅ Processing completed successfully!")

    except FileNotFoundError:
        print("\n❌ Error: Could not find the input file.")
        print(f"Please ensure the file exists at: {input_file}")

    except Exception as e:
        # Deliberate catch-all so the notebook cell shows a short message
        # instead of a traceback.
        print(f"\n❌ An error occurred: {e}")
        print("Please check your file paths and data format.")

Reading data from: /content/drive/MyDrive/MLHygnn/DB/Unique_drugs_with_smiles1709drug.csv
Available columns: ['DrugBank_ID', 'SMILES']
Using columns: Drug ID = 'DrugBank_ID', SMILE = 'SMILES'
Dataset contains 1709 drugs

PROCESSING WITH K=3
✅ Results saved to: /content/drive/MyDrive/MLHygnn/DB/kmer_results_simple1709drugs/drugbank_kmers_k3.csv
  Total drugs processed: 1709

Sample of first 3 results:
  DB00006: 355 k-mers
    Sample: CC[;C[C;[C@;C@H;@H]...
  DB00014: 207 k-mers
    Sample: CC(;C(C;(C);C)C;)C[...
  DB00027: 306 k-mers
    Sample: CC(;C(C;(C);C)C;)C[...

Statistics for k=3:
  Total unique k-mers: 1298
  Average k-mers per drug: 62.51
  Min k-mers per drug: 1
  Max k-mers per drug: 1173

PROCESSING WITH K=6
✅ Results saved to: /content/drive/MyDrive/MLHygnn/DB/kmer_results_simple1709drugs/drugbank_kmers_k6.csv
  Total drugs processed: 1709

Sample of first 3 results:
  DB00006: 352 k-mers
    Sample: CC[C@H;C[C@H];[C@H](;C@H](C;@H](C)...
  DB00014: 204 k-mers
    Samp