In [9]:
### Script to convert multiple FASTA files into CSV format with each base in its own column
from Bio import SeqIO
import pandas as pd
import glob
import os

# Folder containing your fasta files
fasta_folder = r"C:\Users\Noah Legall\Downloads\fasta_files"
output_folder = r"C:\Users\Noah Legall\Downloads\fasta_files\output_csv"

# Make sure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Get all fasta files in the folder
fasta_files = glob.glob(os.path.join(fasta_folder, "*.fasta"))

for fasta_file in fasta_files:
    gene_name = os.path.splitext(os.path.basename(fasta_file))[0]  # e.g., geneA
    rows = []

    # Parse all sequences in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        seq = str(record.seq)
        
        # Create column names (geneA_1, geneA_2, etc.)
        cols = [f"{gene_name}_{i+1}" for i in range(len(seq))]
        
        # Create a row dictionary mapping each position to a base
        row_data = dict(zip(cols, seq))
        row_data["sequence_id"] = record.id  # keep track of sequence ID
        
        rows.append(row_data)

    # Convert to DataFrame
    df = pd.DataFrame(rows)
    
    # Output file path
    output_path = os.path.join(output_folder, f"{gene_name}.csv")
    
    # Save CSV
    df.to_csv(output_path, index=False)
    print(f"âœ… Saved {output_path}")


SystemError: <class 'numpy.iinfo'> returned a result with an exception set

In [12]:
### use the master resistance table to find the resistant and susceptible isolates
import pandas as pd 
master_df = pd.read_csv(r"C:\Users\Noah Legall\LegallLab\SHAP-mTB-AMR\resistance_dataset\master_table_resistance.csv")
rifampicin_full = master_df[(master_df['RIFAMPICIN'] == 'S') | (master_df['RIFAMPICIN'] == 'R')]
print(len(rifampicin_full))

### for each gene csv only keep the isolates that are in the resistance table. annotate them as resistant or susceptible
gene_df = pd.read_csv(r"C:\Users\Noah Legall\Downloads\fasta_files\output_csv\acpM-kasA_20201206.csv")

gene_df["sample_id"] = gene_df['sequence_id'].str.extract(r'genomic_data/([^/]+)/pilon')

filtered_gene_df = gene_df[gene_df["sample_id"].isin(rifampicin_full["Isolate"])]

annotated_filtered_gene_df = filtered_gene_df.merge(rifampicin_full[["Isolate", "RIFAMPICIN"]], left_on="sample_id", right_on="Isolate", how="left")
print(annotated_filtered_gene_df['RIFAMPICIN'].head())

### for one drug resistance, e.g. rifampicin, compute the mutual information of the columns with the resistance label in each gene. keep the top 100 columns with highest mutual information

  master_df = pd.read_csv(r"C:\Users\Noah Legall\LegallLab\SHAP-mTB-AMR\resistance_dataset\master_table_resistance.csv")


22616
0    R
1    R
2    R
3    S
4    R
Name: RIFAMPICIN, dtype: object


In [15]:
len(annotated_filtered_gene_df.columns)

1683

In [None]:
### for one drug resistance, e.g. rifampicin, compute the mutual information of the columns with the resistance label in each gene. keep the top 100 columns with highest mutual information


# Separate features and target
X = pd.get_dummies(annotated_filtered_gene_df.drop(columns=["RIFAMPICIN","sample_id","sequence_id"]))  # encode categorical variables
y = annotated_filtered_gene_df["RIFAMPICIN"]

print("data prepared")

data prepared
acpM-kasA_20201206_439_C    0.021549
Isolate_SAMN03648444        0.014483
Isolate_SAMN08795284        0.013316
Isolate_SAMN08912738        0.013257
acpM-kasA_20201206_439_T    0.013161
                              ...   
Isolate_TDR24S169L002       0.000000
Isolate_TDR37combined       0.000000
Isolate_TDR47S171L002       0.000000
Isolate_TDR48S90L002        0.000000
Isolate_SAMN09492289        0.000000
Length: 19215, dtype: float64


In [None]:
numeric_var = X.var(numeric_only=True).sort_values(ascending=False)
top_100_cols = numeric_var.head(100).index
X_top_100 = X[top_100_cols]
print("Top 100 columns are found")

In [None]:
### adapt the code to do this for each fasta file in the folder
import glob 
import os
import pandas as pd

# first 5 are highly represented - other 5 are lowly represented
drugs = ["RIFAMPICIN","ISONIAZID", "ETHAMBUTOL", "PYRAZINAMIDE", "STREPTOMYCIN", "PROTHIONAMIDE", "LEVOFLOXACIN", "CIPROFLOXACIN", "AMOXICILLIN", "CYCLOSERINE"]
master_df = pd.read_csv(r"C:\Users\Noah Legall\LegallLab\SHAP-mTB-AMR\resistance_dataset\master_table_resistance.csv")
output_folder = r"C:\Users\Noah Legall\Downloads\fasta_files\output_csv"

# Get all fasta files in the folder
gene_csv_files = glob.glob(os.path.join(output_folder, "*.csv"))
print(gene_csv_files)
for drug in drugs:
    gene_csvs = []

    for gene in gene_csv_files:
        ### for each gene csv only keep the isolates that are in the resistance table. annotate them as resistant or susceptible

        drug_full = master_df[(master_df[drug] == 'S') | (master_df[drug] == 'R')]
        
        gene_df = pd.read_csv(gene)

        gene_df["sample_id"] = gene_df['sequence_id'].str.extract(r'genomic_data/([^/]+)/pilon')

        filtered_gene_df = gene_df[gene_df["sample_id"].isin(drug_full["Isolate"])]

        annotated_filtered_gene_df = filtered_gene_df.merge(drug_full[["Isolate", drug]], left_on="sample_id", right_on="Isolate", how="left")

        # Separate features and target
        isolates = annotated_filtered_gene_df["Isolate"]
        X = pd.get_dummies(annotated_filtered_gene_df.drop(columns=[drug,"sample_id","sequence_id","Isolate"]))  # encode categorical variables
        y = annotated_filtered_gene_df[drug]

        numeric_var = X.var(numeric_only=True).sort_values(ascending=False)
        top_100_cols = numeric_var.head(100).index
        X_top_100 = X[top_100_cols]
        gene_csvs.append(X_top_100)
        

    ### take the dataframes collected and merge them together for the final dataset
    final_data_output = pd.concat(gene_csvs, axis=1)
    final_data_output["Resistance"] = y
    final_data_output["Isolate"] = isolates
    final_data_output.to_csv(fr"C:\Users\Noah Legall\LegallLab\SHAP-mTB-AMR\resistance_dataset\{drug}_data.csv", index=False)
    print(f"Completed for {gene}. Dataset created")

  master_df = pd.read_csv(r"C:\Users\Noah Legall\LegallLab\SHAP-mTB-AMR\resistance_dataset\master_table_resistance.csv")


['C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\acpM-kasA_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\aftB-ubiA_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\clpC_20201213.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\eis_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\embCAB_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\ethAR_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\FabG1-inhA_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\gid_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\gyrBA_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\KatG_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\oxyR-ahpC_20201206.csv', 'C:\\Users\\Noah Legall\\Downloads\\fasta_files\\output_csv\\panD_20201213.csv', 'C:\

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

Completed for C:\Users\Noah Legall\Downloads\fasta_files\output_csv\tlyA_20201206.csv. Dataset created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

Completed for C:\Users\Noah Legall\Downloads\fasta_files\output_csv\tlyA_20201206.csv. Dataset created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

Completed for C:\Users\Noah Legall\Downloads\fasta_files\output_csv\tlyA_20201206.csv. Dataset created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

Completed for C:\Users\Noah Legall\Downloads\fasta_files\output_csv\tlyA_20201206.csv. Dataset created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

Completed for C:\Users\Noah Legall\Downloads\fasta_files\output_csv\tlyA_20201206.csv. Dataset created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

Completed for C:\Users\Noah Legall\Downloads\fasta_files\output_csv\tlyA_20201206.csv. Dataset created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

Completed for C:\Users\Noah Legall\Downloads\fasta_files\output_csv\tlyA_20201206.csv. Dataset created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolates.values  # add back the isolate column for merging later
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_top_100["Isolate"] = isolat

In [42]:
### take the dataframes collected and merge them together for the final dataset
final_data_output = pd.concat(gene_csvs, axis=1)
final_data_output["Resistance"] = y
final_data_output.to_csv(fr"C:\Users\Noah Legall\LegallLab\SHAP-mTB-AMR\resistance_dataset\{drug}_data.csv", index=False)
