<a href="https://colab.research.google.com/github/MrEktirir/Antibiotics-Resistance-Prediction/blob/feature%2Fdata-preparation/Notebooks/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

from google.colab import drive
drive.mount('/content/drive')

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# RTAB dosya yolları
rtab_files = {
    "azm_sr": "/content/drive/MyDrive/Dataset/azm_sr_gwas_filtered_unitigs.Rtab",
    "cfx_sr": "/content/drive/MyDrive/Dataset/cfx_sr_gwas_filtered_unitigs.Rtab",
    "cip_sr": "/content/drive/MyDrive/Dataset/cip_sr_gwas_filtered_unitigs.Rtab",
}

# Metadata'yı oku
metadata_df = pd.read_csv("/content/drive/MyDrive/Dataset/metadata.csv")

# Başlangıç: metadata ile
full_df = metadata_df.copy()

# Her RTAB dosyasını işleyip birleşik DataFrame'e ekle
for label, filepath in rtab_files.items():
    print(f"🔄 {label} dosyası işleniyor...")

    # RTAB oku
    rtab_df = pd.read_csv(filepath, sep=r"\s+", header=None)
    rtab_df.columns = ["unitig"] + list(rtab_df.iloc[0, 1:])
    rtab_df = rtab_df[1:].reset_index(drop=True)

    # Transpoz
    rtab_df.set_index("unitig", inplace=True)
    unitig_matrix = rtab_df.T
    unitig_matrix.index.name = "Sample_ID"

    # Sütun adlarını benzersiz hale getir (örnek: azm_ABC123)
    unitig_matrix.columns = [f"{label}_{col}" for col in unitig_matrix.columns]

    # Metadata ile Sample_ID üzerinden birleştir
    full_df = full_df.merge(unitig_matrix, on="Sample_ID", how="left")

    print(f"✅ {label} unitig'leri eklendi → full_df.shape: {full_df.shape}")

# Kontrol: hedef sütunlar mevcut mu
print("\n🎯 Birleşmiş verisetinde mevcut sütunlar:", full_df.columns.tolist()[:15], "...")

# Özellikler ve hedef ayrımı
target_cols = list(rtab_files.keys())  # ['azm_sr', 'cfx_sr', 'cip_sr']
X = full_df.drop(columns=target_cols)
y = full_df[target_cols]

print(f"\n✅ Nihai veri hazır! X: {X.shape}, y: {y.shape}")


🔄 azm_sr dosyası işleniyor...


  rtab_df = pd.read_csv(filepath, sep=r"\s+", header=None)


✅ azm_sr unitig'leri eklendi → full_df.shape: (3786, 546)
🔄 cfx_sr dosyası işleniyor...


  rtab_df = pd.read_csv(filepath, sep=r"\s+", header=None)


✅ cfx_sr unitig'leri eklendi → full_df.shape: (3786, 930)
🔄 cip_sr dosyası işleniyor...


  rtab_df = pd.read_csv(filepath, sep=r"\s+", header=None)


✅ cip_sr unitig'leri eklendi → full_df.shape: (3786, 9803)

🎯 Birleşmiş verisetinde mevcut sütunlar: ['Sample_ID', 'Year', 'Country', 'Continent', 'Beta.lactamase', 'Azithromycin', 'Ciprofloxacin', 'Ceftriaxone', 'Cefixime', 'Tetracycline', 'Penicillin', 'NG_MAST', 'Group', 'azm_mic', 'cip_mic'] ...

✅ Nihai veri hazır! X: (3786, 9800), y: (3786, 3)


In [26]:
dataset = full_df.copy()

Index(['Sample_ID', 'Year', 'Country', 'Continent', 'Beta.lactamase',
       'Azithromycin', 'Ciprofloxacin', 'Ceftriaxone', 'Cefixime',
       'Tetracycline',
       ...
       'cip_sr_CGCCTATGTGCTGCAACTTTTGGACACCGCGCTCGC',
       'cip_sr_GGAATCAGGACGATAAACAAAATGCTGCCGGCGAG',
       'cip_sr_CGCTACATAAGGAGAACCCTAAAATGCCGCA',
       'cip_sr_ACATCCTGCCATCCGACAAGCGGCGGCAACCGCCAAAAAC',
       'cip_sr_TCTTGGGCGCCGTTTGTGGGTTTGTTTATCGCGCG',
       'cip_sr_AAGCAGCGCACGCTGTGAATCACATCTTTCGCC',
       'cip_sr_ACCGCGCCGGACAGGGTGCGCGTAAACGGCAGTTG',
       'cip_sr_AACGACAATACACACACCCTTCCCCCGCGCC',
       'cip_sr_AAATTCATCAAAGAGCAAAGCGATTGGAAACCTCTGCCTGCTAATGCCGGTGAGTTGACCC',
       'cip_sr_GCCGCCTTGTCCTGATTTTTGTTAATCCGCCA,AACCATAAGTCGGCAACTTCGTAAACCAGGTGGGACGGGTCTTTGTCTTTGGATGC,AGAAGCTCGGAATCGTTCAGCAATACGCCTGCG'],
      dtype='object', length=9803)

In [28]:
dataset.head(10)

Unnamed: 0,Sample_ID,Year,Country,Continent,Beta.lactamase,Azithromycin,Ciprofloxacin,Ceftriaxone,Cefixime,Tetracycline,...,cip_sr_CGCCTATGTGCTGCAACTTTTGGACACCGCGCTCGC,cip_sr_GGAATCAGGACGATAAACAAAATGCTGCCGGCGAG,cip_sr_CGCTACATAAGGAGAACCCTAAAATGCCGCA,cip_sr_ACATCCTGCCATCCGACAAGCGGCGGCAACCGCCAAAAAC,cip_sr_TCTTGGGCGCCGTTTGTGGGTTTGTTTATCGCGCG,cip_sr_AAGCAGCGCACGCTGTGAATCACATCTTTCGCC,cip_sr_ACCGCGCCGGACAGGGTGCGCGTAAACGGCAGTTG,cip_sr_AACGACAATACACACACCCTTCCCCCGCGCC,cip_sr_AAATTCATCAAAGAGCAAAGCGATTGGAAACCTCTGCCTGCTAATGCCGGTGAGTTGACCC,"cip_sr_GCCGCCTTGTCCTGATTTTTGTTAATCCGCCA,AACCATAAGTCGGCAACTTCGTAAACCAGGTGGGACGGGTCTTTGTCTTTGGATGC,AGAAGCTCGGAATCGTTCAGCAATACGCCTGCG"
0,ERR1549286,2015.0,UK,Europe,,>256,,0.016,,,...,1,1,1,1,1,1,1,1,1,1
1,ERR1549290,2015.0,UK,Europe,,>256,,0.004,,,...,1,1,1,1,1,1,1,1,1,1
2,ERR1549291,2015.0,UK,Europe,,>256,,0.006,,,...,1,1,1,1,1,1,1,1,1,1
3,ERR1549287,2015.0,UK,Europe,,>256,,0.006,,,...,1,1,1,1,1,1,1,1,1,1
4,ERR1549288,2015.0,UK,Europe,,>256,,0.008,,,...,1,1,1,1,1,1,1,1,1,1
5,ERR1549299,2015.0,UK,Europe,,>256,,0.012,,,...,1,1,1,1,1,1,1,1,1,1
6,ERR1549292,2015.0,UK,Europe,,>256,,0.023,,,...,1,1,1,1,1,1,1,1,1,1
7,ERR1549298,2015.0,UK,Europe,,0.5,,0.094,,,...,1,1,1,1,1,1,1,1,1,1
8,ERR1549296,2015.0,UK,Europe,,0.5,,0.094,,,...,1,1,1,1,1,1,1,1,1,1
9,ERR1549300,2015.0,UK,Europe,,>256,,0.008,,,...,1,1,1,1,1,1,1,1,1,1


In [27]:
for col in dataset.columns:
    print(col)

Sample_ID
Year
Country
Continent
Beta.lactamase
Azithromycin
Ciprofloxacin
Ceftriaxone
Cefixime
Tetracycline
Penicillin
NG_MAST
Group
azm_mic
cip_mic
cro_mic
cfx_mic
tet_mic
pen_mic
log2_azm_mic
log2_cip_mic
log2_cro_mic
log2_cfx_mic
log2_tet_mic
log2_pen_mic
azm_sr
cip_sr
cro_sr
cfx_sr
tet_sr
pen_sr
azm_sr_CTTAACATATTTGCCTTTGATTTTTGAAGAAGCTGCCACGCCGGCAG
azm_sr_TACCGTAACCGGCAATGCGGATATTACGGTC
azm_sr_CAGACGGCATTTTTTTTGCGTTTTTCGGGAGG
azm_sr_AACGGGTTTTCAGACGGCATTCGATATCGGGACG
azm_sr_CCAAAAATTACCCGCGTTGACGTAGCTAAAGA
azm_sr_CGGACCGGTATTCCGTCGAAATCACCGCCGTCAACCGCCCC
azm_sr_TGAAATTGTCCATCTCGTATGCCGTCTTCTGCTTG
azm_sr_TACGGTATTGTCCGCATTATTAAACTCAAAACC,AGAAGACGGCATACGAGATGGACAATTTCATCC
azm_sr_GGCATTTTTTTTGCGTTTTTCGGGAGGGGGCGGC
azm_sr_TATATAAGGGGTTGCCGTTCCGCAGTTGGGCGGCAGCATAC
azm_sr_TGGTAATGCCGGGTGAGAACGTAACCATTACTGTAGAACTGATTGCG
azm_sr_ACGCTTTGAACATATTTGCCTTTGATTTCGG
azm_sr_TTATGAACAAACCATTGGTGAATCAGGCTGCTATGGT
azm_sr_ACGGCGACGGCAGCGGCGACGGCGACGGCAACGGCA
azm_sr_CGCATGGGCAAGCAGGTCGAGATATTCGCCG
az