# **DATA PREPARATION**

In [None]:
import pandas as pd
import numpy as np

In [None]:
import zipfile

# Unpack the Giessen archive stored on Drive into the current working directory.
with zipfile.ZipFile(
    '/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/Giessen_dataset.zip',
    mode='r',
) as giessen_archive:
    giessen_archive.extractall(path='.')

In [None]:
# Unpack the pan-genome sequences archive into the public raw-data folder on Drive.
with zipfile.ZipFile('/content/pan_genome_sequences.zip', mode='r') as pan_genome_archive:
    pan_genome_archive.extractall(
        path='/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/public'
    )

## **Process Giessen Dataset**

In [None]:
# Load the Giessen feature matrix and the matching phenotype labels.
data = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_multi_data.csv") #809 samples
pheno = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/giessen/cip_ctx_ctz_gen_pheno.csv")   # 809 labels

# Per-antibiotic sample counts reported in the paper's Table 1.
paper_samples = {"CIP": 900, "CTX": 930, "CTZ": 841, "GEN": 926}

print(f"Giessen samples: {len(data)}")
print(f"Features: {data.shape[1]}")
# Count resistance profiles over the antibiotic columns only. Counting over the
# whole frame would include the per-sample ID column, making every row unique
# (809 rows, each with count 1) instead of showing a distribution.
print(f"Phenotype distribution:\n{pheno[list(paper_samples)].value_counts()}")

# Number of labelled (non-missing) samples we have for each antibiotic.
# Cast to plain int so the report below does not contain numpy scalar reprs
# such as `np.int64(809)`.
our_samples = {ab: int(pheno[ab].notna().sum()) for ab in paper_samples}

# Document the discrepancy between the paper's sample counts and ours.
with open("/content/drive/MyDrive/ML-iAMR_Recreation/00_project_documentation/data_discrepancy.md", "w") as f:
    f.write("# Data Availability Report\n\n")
    f.write(f"Paper reported: {paper_samples}\n")
    f.write(f"We obtained: {our_samples}\n")
    # Report coverage for every antibiotic, not just CIP.
    for ab, paper_count in paper_samples.items():
        f.write(f"Coverage ({ab}): {our_samples[ab]/paper_count*100:.1f}%\n")

Giessen samples: 809
Features: 60937
Phenotype distribution:
prename             CIP  CTX  CTZ  GEN
wwRBL-17-004-1_S44  1    1    1    1      1
H100_S2_L001        0    1    0    0      1
H105_S3_L001        1    1    0    0      1
H108_S5_L001        0    1    0    0      1
H109_S2_L001        0    1    0    0      1
                                         ..
H153_S1_L001        1    1    1    1      1
H148_S4_L001        1    1    1    1      1
H141_S6_L001        0    1    1    0      1
H136_S7_L001        1    1    0    0      1
H130_S2_L001        0    1    1    1      1
Name: count, Length: 809, dtype: int64


In [None]:
# Tally resistant vs. susceptible isolates for each of the four antibiotics.
antibiotics = ['CIP', 'CTX', 'CTZ', 'GEN']
for ab in antibiotics:
    # Label encoding is an assumption: 1 = Resistant, 0 = Susceptible.
    resistant = pheno[ab].eq(1).sum()
    susceptible = pheno[ab].eq(0).sum()
    print(f"{ab}: {resistant}R / {susceptible}S")

CIP: 366R / 443S
CTX: 358R / 451S
CTZ: 276R / 533S
GEN: 188R / 621S


## **Process Public Dataset**

In [None]:
# Pan-genome matrix from gene_presence_absence.csv.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The recorded run emitted a warning pointing at
# this read_csv call (presumably pandas' mixed-type DtypeWarning - confirm on
# re-run).
pan_genome = pd.read_csv(
    "/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/public/gene_presence_absence.csv",
    low_memory=False,
)

# Number of leading non-sample (metadata) columns in the pan-genome table.
# NOTE(review): 14 is carried over from the original cell; it matches the
# layout of Roary's gene_presence_absence.csv if that is the source - verify.
N_METADATA_COLS = 14

print(f"Genes: {len(pan_genome)}")  #should be 44,958 as per paper
print(f"Samples: {pan_genome.shape[1] - N_METADATA_COLS}")  #exclude metadata columns

# ENA metadata from S1_Table of the public data paper.
metadata = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/public/pcbi.1006258.s010 (S1_Table_ENA_metadata).csv")
print(f"Total samples: {len(metadata)}")

# Tally resistant ('R') vs. susceptible ('S') calls for the four antibiotics.
antibiotics = ['CIP', 'CTX', 'CTZ', 'GEN']
for ab in antibiotics:
    resistant = (metadata[ab] == 'R').sum()
    susceptible = (metadata[ab] == 'S').sum()
    print(f"{ab}: {resistant}R / {susceptible}S")

  pan_genome = pd.read_csv("/content/drive/MyDrive/ML-iAMR_Recreation/01_data/raw/public/gene_presence_absence.csv")


Genes: 44957
Samples: 1094
Total samples: 1936
CIP: 429R / 1483S
CTX: 383R / 1472S
CTZ: 265R / 1593S
GEN: 284R / 1629S


- We obtained **809 samples**, whereas the paper used **987 samples**, so we may observe a small (roughly 1–2%) decrease in performance.