In [1]:
import pandas as pd
import os
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import load_model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc




In [2]:
import random

def replace_ambiguous_bases(sequence):
    """Return `sequence` with each IUPAC ambiguity code swapped for a random concrete base.

    Upper-case ambiguity symbols (R, Y, S, W, K, M, B, D, H, V, N) are replaced
    by one of the bases they stand for, picked with ``random.choice`` from the
    module-level generator. Every other character is copied through unchanged.

    Note: this function is defined a second time in a later cell (In [9]); on a
    top-to-bottom run that later definition is the one in effect.
    """
    # Candidate bases for every ambiguity code.
    candidates = {
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'S': ['G', 'C'],
        'W': ['A', 'T'],
        'K': ['G', 'T'],
        'M': ['A', 'C'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'T', 'C', 'G'],
    }

    def resolve_ambiguity(code):
        options = candidates.get(code)
        if options is None:
            # Not an ambiguity code: keep the symbol as it is.
            return code
        return random.choice(options)

    return ''.join(map(resolve_ambiguity, sequence))

In [3]:
def reverse_complement(sequence):
    """Return the reverse complement of a DNA string made of A, T, C and G.

    Any other symbol (ambiguity codes, lower case, gaps) raises ``KeyError``.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Walk the sequence back to front, swapping each base for its partner.
    return ''.join(pairing[base] for base in reversed(sequence))

def reverse_complement_list(sequence_list):
    """Return a new list holding the reverse complement of every sequence, in order."""
    return [reverse_complement(seq) for seq in sequence_list]

In [4]:
#This function gets the cgr for the sequences
def easy_cgr(dna_sequence):
    """Compute the Chaos Game Representation trajectory of a DNA sequence.

    Starting from the origin, each base moves the current point halfway towards
    that base's corner of the square [-1, 1] x [-1, 1]
    (A: top-right, T: bottom-left, C: bottom-right, G: top-left).

    Returns a list with one (x, y) tuple per base, in sequence order.
    Symbols other than A, T, C, G raise ``KeyError``.
    """
    corners = {
        "A": (1, 1),
        "T": (-1, -1),
        "C": (1, -1),
        "G": (-1, 1),
    }
    x, y = 0, 0
    trajectory = []
    for base in dna_sequence:
        corner_x, corner_y = corners[base]
        # Midpoint between the current point and the corner of this base.
        x, y = (x + corner_x) / 2, (y + corner_y) / 2
        trajectory.append((x, y))
    return trajectory

In [5]:
#This function has input:
#sequence to plot that is the easy_cgr(sequence)
# plot id that is the name
# resolution that is the resolution
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
    """Display a CGR density image for one sequence.

    sequence_to_plot: list of (x, y) points, e.g. the output of ``easy_cgr``.
    id_to_plot:       string appended to the figure title.
    resolution:       number of histogram bins along each axis.

    Note: an identical definition appears again in a later cell (In [8]) and
    replaces this one on a top-to-bottom run.
    """
    # The grid is square, so both axes share the same bin edges over [-1, 1].
    edges = np.linspace(-1, 1, resolution + 1)
    xs = [pt[0] for pt in sequence_to_plot]
    ys = [pt[1] for pt in sequence_to_plot]
    counts, _, _ = np.histogram2d(xs, ys, bins=[edges, edges])

    # Colormap running from white (empty cell) to dark gray (densest cell).
    shades = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    plt.figure()
    plt.imshow(counts.T, cmap=shades, extent=[-1, 1, -1, 1], origin='lower', interpolation='nearest', aspect='auto')
    plt.xticks([])
    plt.yticks([])

    # Nucleotide letters sit just outside the four corners of the grid.
    x_margin, y_margin = 0.05, 0.05
    corner_labels = [
        ("A", 1 + x_margin, 1 + y_margin),
        ("T", -1 - x_margin, -1 - y_margin),
        ("C", 1 + x_margin, -1 - y_margin),
        ("G", -1 - x_margin, 1 + y_margin),
    ]
    for letter, x, y in corner_labels:
        plt.text(x, y, letter, ha='center', va='center', fontsize=15, color='black')

    plt.title('CGR for '+id_to_plot)
    plt.grid(False)  # no grid lines over the image
    plt.show()

In [6]:
# This function reads the FASTA file of antibiotic resistance genes
import re

def read_antibiotic_resistance_genes(fasta_file_path):
    """Read a FASTA file and return its sequences with the bracketed term of each header.

    Parameters
    ----------
    fasta_file_path : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    (sequences, terms_inside_brackets) : tuple of two lists of equal length
        ``sequences[i]`` is the concatenated sequence of record ``i``;
        ``terms_inside_brackets[i]`` is the text of the first ``[...]`` group in
        that record's header, or ``None`` when the header has no such group.

    Blank lines, and any text before the first ``>`` header, are ignored.
    """
    bracket_pattern = re.compile(r'\[([^]]+)\]')

    sequences = []
    terms_inside_brackets = []
    chunks = None  # sequence lines of the record being read; None until the first header

    with open(fasta_file_path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            if line.startswith(">"):
                # Close the previous record before opening a new one.
                if chunks is not None:
                    sequences.append("".join(chunks))
                chunks = []
                # A header without brackets yields None instead of failing on
                # `.group` of a missing match, so both lists stay aligned.
                match = bracket_pattern.search(line[1:])
                terms_inside_brackets.append(match.group(1) if match else None)
            elif chunks is not None:
                chunks.append(line)

    # Close the last record of the file.
    if chunks is not None:
        sequences.append("".join(chunks))

    return sequences, terms_inside_brackets

In [7]:
from Bio import SeqIO

def read_fasta_patients(file_path):
    """Parse a FASTA file with Biopython and return ``(sequences, sequence_ids)``.

    ``sequences`` holds each record's sequence as a plain string and
    ``sequence_ids`` the matching record ids, in file order. If the file does
    not exist, a message is printed and two empty lists are returned.
    """
    try:
        with open(file_path, "r") as fasta_file:
            records = [(str(rec.seq), rec.id) for rec in SeqIO.parse(fasta_file, "fasta")]
    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")
        records = []

    sequences = [seq for seq, _ in records]
    sequence_ids = [rec_id for _, rec_id in records]
    return sequences, sequence_ids


In [8]:
#This function has input:
#sequence to plot that is the easy_cgr(sequence)
# plot id that is the name
# resolution that is the resolution
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
    """Show the CGR of one sequence as a grayscale 2-D histogram.

    sequence_to_plot: iterable of (x, y) points (the result of ``easy_cgr``).
    id_to_plot:       string used in the title ("CGR for <id>").
    resolution:       bins per axis of the histogram grid.
    """
    num_bins = resolution
    bin_edges = np.linspace(-1, 1, num_bins + 1)  # shared by the x and y axes

    x_values = [point[0] for point in sequence_to_plot]
    y_values = [point[1] for point in sequence_to_plot]
    density, _, _ = np.histogram2d(x_values, y_values, bins=[bin_edges, bin_edges])

    # White for empty cells, shading to dark gray for the most visited ones.
    gray_ramp = [(1, 1, 1), (0.2, 0.2, 0.2)]
    colormap = LinearSegmentedColormap.from_list("custom_gray", gray_ramp, N=len(sequence_to_plot) + 1)

    plt.figure()
    plt.imshow(
        density.T,
        cmap=colormap,
        extent=[-1, 1, -1, 1],
        origin='lower',
        interpolation='nearest',
        aspect='auto',
    )

    # Hide the numeric ticks; the corners are labelled with letters instead.
    plt.xticks([])
    plt.yticks([])

    x_margin, y_margin = 0.05, 0.05
    right, left = 1 + x_margin, -1 - x_margin
    top, bottom = 1 + y_margin, -1 - y_margin
    for letter, (x, y) in {"A": (right, top), "T": (left, bottom), "C": (right, bottom), "G": (left, top)}.items():
        plt.text(x, y, letter, ha='center', va='center', fontsize=15, color='black')

    plt.title('CGR for '+id_to_plot)
    plt.grid(False)
    plt.show()


In [9]:
import random

def replace_ambiguous_bases(sequence, rng=None):
    """Return `sequence` with each IUPAC ambiguity code replaced by a random concrete base.

    Parameters
    ----------
    sequence : str
        Nucleotide string. Upper-case ambiguity symbols (R, Y, S, W, K, M, B,
        D, H, V, N) are resolved; every other character is kept unchanged.
    rng : random.Random, optional
        Generator used for the draws. Pass ``random.Random(seed)`` to make the
        resolution reproducible. When omitted, the module-level ``random``
        generator is used, exactly as before.

    Returns
    -------
    str
        A sequence of the same length with no upper-case ambiguity codes left.
    """
    chooser = random if rng is None else rng

    # Candidate bases for every ambiguity code.
    candidates = {
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'S': ['G', 'C'],
        'W': ['A', 'T'],
        'K': ['G', 'T'],
        'M': ['A', 'C'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'T', 'C', 'G'],
    }

    def resolve_ambiguity(code):
        options = candidates.get(code)
        # Not an ambiguity code: return the same symbol.
        return code if options is None else chooser.choice(options)

    return ''.join(resolve_ambiguity(base) for base in sequence)

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename, image_size_px=84, dpi=100):
    """Render CGR points as a grayscale density image and write it to `output_filename`.

    Parameters
    ----------
    sequence_to_plot : list of (x, y)
        CGR trajectory, e.g. the output of ``easy_cgr``.
    kmeres : int
        The histogram grid has ``2 ** kmeres`` bins along each axis.
    output_filename : str or path-like
        Destination image path; its directory must already exist.
    image_size_px : int, optional
        Side of the square figure canvas in pixels (default 84, the previous
        hard-coded value).
    dpi : int, optional
        Resolution used for both the figure and the saved file (default 100).

    Notes
    -----
    ``bbox_inches='tight'`` crops the saved file to the drawn content, so the
    file on disk may be smaller than ``image_size_px`` per side.
    """
    num_bins = 2 ** kmeres

    # Square grid: the same bin edges are used on both axes.
    edges = np.linspace(-1, 1, num_bins + 1)
    hist, _, _ = np.histogram2d(
        [p[0] for p in sequence_to_plot],
        [p[1] for p in sequence_to_plot],
        bins=[edges, edges]
    )

    # White for empty cells, shading to dark gray for the densest ones.
    colors = [(1, 1, 1), (0.2, 0.2, 0.2)]
    custom_cmap = LinearSegmentedColormap.from_list("custom_gray", colors, N=len(sequence_to_plot) + 1)

    side_inches = image_size_px / dpi
    fig, ax = plt.subplots(figsize=(side_inches, side_inches), dpi=dpi)
    try:
        ax.imshow(
            hist.T,
            cmap=custom_cmap,
            origin='lower',
            extent=[-1, 1, -1, 1],  # data coordinates of the image corners (not a pixel size)
            interpolation='nearest'
        )
        ax.axis('off')
        fig.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    finally:
        # Always release the figure, including when saving fails; otherwise a
        # long batch run accumulates open figures.
        plt.close(fig)

# Batch helper: writes one CGR image per (sequence, id) pair using generate_cgr_and_save
def generate_and_save_cgr_images(sequences, ids, output_dir, kmeres):
    """Save one CGR image per sequence as ``<output_dir>/<index>_<id>.png``.

    sequences:  iterable of CGR trajectories (lists of (x, y) points).
    ids:        iterable of labels, paired with `sequences` by position.
    output_dir: target directory; created (with parents) if it does not exist.
    kmeres:     passed to ``generate_cgr_and_save`` (grid is 2**kmeres bins per axis).
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    for i, (sequence, label) in enumerate(zip(sequences, ids)):
        # The positional index keeps file names unique even when labels repeat.
        image_filename = os.path.join(output_dir, f"{i}_{label}.png")

        generate_cgr_and_save(sequence, kmeres, image_filename)

In [11]:

def read_fasta_ARG(file_name):
    """Read a FASTA file into a list of records.

    Each record is a list made of the header (text after ``>``) split on ``|``,
    followed by the full sequence as its last element. Records whose header is
    empty are dropped; lines before the first header are discarded.
    """
    records = []
    header = ''
    residues = []

    with open(file_name, 'r') as handle:
        for text in map(str.strip, handle):
            if not text.startswith('>'):
                residues.append(text)
                continue
            # A new header closes the record collected so far.
            if header:
                records.append([*header.split('|'), ''.join(residues)])
            header = text[1:]
            residues = []

    # Flush the final record.
    if header:
        records.append([*header.split('|'), ''.join(residues)])
    return records

###  ARG PATIENT

In [12]:
import pandas as pd

# Load the patient-2 contigs; each record is [header fields..., sequence].
file_name = "Patient2.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)
# read_fasta_ARG splits headers on '|', so the two-column layout below only
# fits if no header in this file contains '|'.
patient_2 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [13]:
# Define a function to extract the desired pattern
def extract_id_2(string, suffix='_2'):
    """Return the first ``k127_<digits>`` token found in `string`, with `suffix` appended.

    Parameters
    ----------
    string : str
        Text to search, e.g. a raw FASTA header.
    suffix : str, optional
        Tag appended to the token to mark the sample of origin (default ``'_2'``).

    Returns
    -------
    str or None
        ``'k127_<digits>' + suffix``, or ``None`` when `string` is not a string
        or contains no such token.
    """
    # Missing values (None/NaN) coming from a DataFrame column have no id.
    if not isinstance(string, str):
        return None
    match = re.search(r'(k127_\d+)', string)
    return match.group(1) + suffix if match else None

# Replace every raw header with its normalized contig id
# (entries without a k127_<digits> token become None).
patient_2['ids'] = patient_2['ids'].apply(extract_id_2)

In [14]:
# Keep the first 15000 contigs (ids and sequences, same order) as the plain patient set.
patient_ids = patient_2['ids'].iloc[:15000].tolist()
patient_list = patient_2['seq'].iloc[:15000].tolist()

In [15]:
# Hold out positional rows 15000..20067 (end-exclusive slice) as hosts for the
# synthetic samples; rows 0..14999 were already taken as patient_ids / patient_list.
# NOTE(review): 20068 - 15000 = 5068 is presumably the number of ARG reference
# sequences — confirm and derive it from the data instead of hard-coding.
pre_synthetic_rows = patient_2.iloc[15000:20068]

# Convert to DataFrame
# NOTE(review): patient_2.iloc[...] is already a DataFrame, so this wrap looks redundant — confirm before removing.
pre_synthetic_rows_df = pd.DataFrame(pre_synthetic_rows)

In [16]:
# Preview the held-out patient contigs.
pre_synthetic_rows_df

Unnamed: 0,ids,seq
15000,k127_321743_2,GTTCAGGCTCAAGCTCTCGAAAAAGGCGGATCTCTTCGGCATGGGT...
15001,k127_156623_2,GGATGAAAGCGCTCATGCCTTTAGCGCCTTTGGATTTGTCCGTGGA...
15002,k127_184179_2,GCCTAGCGGTGAAATTAAAAAGGGTACTGTCAGCAAAGCTGTGGTT...
15003,k127_46124_2,TTCATACTCCTTGTCTTCCAGACGGATTATCAGTTCCCCCTTACTT...
15004,k127_266552_2,CTGGATGATAAGAATTTTGTTTTTGAATCTTGGCTTACTCCCGATG...
...,...,...
20063,k127_248203_2,ATATACCACAAGCCCAATTAGCCGTTAACAGTATCAGTATACCCAC...
20064,k127_53160_2,CTAAAGTAGTATAAGGAATCATTTGTGTGCCTTTCTATGAATGTAA...
20065,k127_24089_2,ATGTGGGAGTTGGCGGTGGCGCTGTCCAGCAGGACGCGGTCATCCG...
20066,k127_163421_2,TAGAATGCGCCTCTACGGAGGTAGAAAATTTTGAGCGGTGCGCAGC...


In [17]:
import pandas as pd

# Load the antibiotic resistance gene (ARG) reference FASTA.
# Note: this rebinds `file_name` and `fasta_sequences`, which previously held the patient data.
file_name = "AntibioticResistanceGenes.fasta"
fasta_sequences = read_fasta_ARG(file_name)

In [18]:
# Build the ARG table: the six '|'-separated header fields plus the sequence.
# Every header must therefore contain exactly five '|' characters, otherwise
# the column count will not match.
data_antibiotic = pd.DataFrame(fasta_sequences, columns=['ID Prefix', 'ID', 'Strand', 'Location',"ARO", 'Description', 'Sequence'])

# Make the description usable as an id: literal space -> underscore.
# regex=False states the literal match explicitly, so it does not depend on
# the pandas version's default for `regex`.
data_antibiotic["Description"] = data_antibiotic["Description"].str.replace(' ', '_', regex=False)
# Reassign instead of inplace=True so the cell reads as a plain transformation.
data_antibiotic = data_antibiotic.rename(columns={'Description': 'ids', 'Sequence': 'seq'})

In [19]:
# Preview the full ARG reference table.
data_antibiotic

Unnamed: 0,ID Prefix,ID,Strand,Location,ARO,ids,seq
0,gb,GQ343019.1,+,132-1023,ARO:3002999,CblA-1_[mixed_culture_bacterium_AX_gF3SD01_15],ATGAAAGCATATTTCATCGCCATACTTACCTTATTCACTTGTATAG...
1,gb,HQ845196.1,+,0-861,ARO:3001109,SHV-52_[Klebsiella_pneumoniae],ATGCGTTATATTCGCCTGTGTATTATCTCCCTGTTAGCCGCCCTGC...
2,gb,AF028812.1,+,392-887,ARO:3002867,dfrF_[Enterococcus_faecalis],ATGATAGGTTTGATTGTTGCGAGGTCAAAGAATAATGTTATAGGCA...
3,gb,JX017365.1,+,244-1120,ARO:3001989,CTX-M-130_[Escherichia_coli],ATGGTGACAAAGAGAGTGCAACGGATGATGTTCGCGGCGGCGGCGT...
4,gb,JN967644.1,+,0-813,ARO:3002356,NDM-6_[Escherichia_coli],ATGGAATTGCCCAATATTATGCACCCGGTCGCGAAGCTGAGCACCG...
...,...,...,...,...,...,...,...
5063,gb,NC_021002.1,-,901870-904756,ARO:3004177,Mfer_23S_MAC_[Mycoplasmopsis_fermentans_PG18],CTTTTAAACCGATCGATTTATTAGTATTGGTCAGCTCAACGTATTA...
5064,gb,U00096.1,+,4166659-4168200,ARO:3003411,Ecol_16S_TET_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5065,gb,U00096.1,+,223771-225312,ARO:3003372,Ecol_16rrsH_SPT_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5066,gb,NC_005353.1,+,38549-40023,ARO:3003978,Crei_16rrnS_STR_[Chlamydomonas_reinhardtii],ATCCATGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCAT...


In [20]:
# Keep only the two columns used downstream: the id and the sequence.
df_ids_antibiotics = data_antibiotic.loc[:, ["ids", "seq"]]

In [21]:
# Preview the reduced ARG table (ids and sequences only).
df_ids_antibiotics

Unnamed: 0,ids,seq
0,CblA-1_[mixed_culture_bacterium_AX_gF3SD01_15],ATGAAAGCATATTTCATCGCCATACTTACCTTATTCACTTGTATAG...
1,SHV-52_[Klebsiella_pneumoniae],ATGCGTTATATTCGCCTGTGTATTATCTCCCTGTTAGCCGCCCTGC...
2,dfrF_[Enterococcus_faecalis],ATGATAGGTTTGATTGTTGCGAGGTCAAAGAATAATGTTATAGGCA...
3,CTX-M-130_[Escherichia_coli],ATGGTGACAAAGAGAGTGCAACGGATGATGTTCGCGGCGGCGGCGT...
4,NDM-6_[Escherichia_coli],ATGGAATTGCCCAATATTATGCACCCGGTCGCGAAGCTGAGCACCG...
...,...,...
5063,Mfer_23S_MAC_[Mycoplasmopsis_fermentans_PG18],CTTTTAAACCGATCGATTTATTAGTATTGGTCAGCTCAACGTATTA...
5064,Ecol_16S_TET_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5065,Ecol_16rrsH_SPT_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5066,Crei_16rrnS_STR_[Chlamydomonas_reinhardtii],ATCCATGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCAT...


### Create the synthetic sequences (ARG inserted into patient contigs)

In [22]:
import pandas as pd
import random

# Build synthetic contigs: splice one ARG reference sequence into each held-out
# host contig at a random position.
# Inputs: pre_synthetic_rows_df (hosts) and df_ids_antibiotics (ARG references).

# Local seeded generator: Restart & Run All reproduces the same insertion
# points, and the global `random` state is left untouched.
insertion_rng = random.Random(42)

merged_seqs = []
merged_ids = []
n_antibiotics = len(df_ids_antibiotics)

for idx, row in pre_synthetic_rows_df.iterrows():
    synthetic_seq = row['seq']

    # `idx` is the host row's index label (not its position); the modulo wraps
    # it onto the ARG table, so the label must be an integer.
    antibiotic_row = df_ids_antibiotics.iloc[idx % n_antibiotics]
    antibiotic_seq = antibiotic_row['seq']

    # randint includes both ends, so the ARG may also be placed before the
    # first base or after the last one.
    insertion_point = insertion_rng.randint(0, len(synthetic_seq))

    merged_seqs.append(synthetic_seq[:insertion_point] + antibiotic_seq + synthetic_seq[insertion_point:])
    # Combined id: <host id>_<ARG id>
    merged_ids.append(row['ids'] + '_' + antibiotic_row['ids'])

# Create DataFrame from merged sequences and ids
merged_df = pd.DataFrame({'ids': merged_ids, 'seq': merged_seqs})






In [23]:
# Preview the synthetic contigs (host contig with an ARG spliced in).
merged_df

Unnamed: 0,ids,seq
0,k127_321743_2_Mmor_gyrB_FLO_[Morganella_morgan...,GTTCAGGCTCAAGCTCTCGAAAAAGGCGGATCTCTTCGGCATGGGT...
1,k127_156623_2_Mlep_gyrA_FLO_[Mycobacterium_lep...,GGATGAAAGCGCTCATGCCTTTAGCGCCTTTGGATTTGTCCGTGGA...
2,k127_184179_2_Sser_gyrB_FLO_[Salmonella_enteri...,GCCTAGCGGTGAAATTAAAAAGGGTACTGTCAGCAAAGCTGTGGTT...
3,k127_46124_2_Kaer_Omp36_[Klebsiella_aerogenes],TTCATACTCCTTGTCTTCCAGACGGATTATCAGTTCCCCCTTACTT...
4,k127_266552_2_Sent_ramR_[Salmonella_enterica_s...,CTGGATGATAAGAATTTTGTTTTTGAATCTTGGCTTACTCCCGATG...
...,...,...
5063,k127_248203_2_Ngon_porin_[Neisseria_gonorrhoeae],ATATACCACAAGCCCAATTAGCCGTTAACAGTATCAGTATACCCAC...
5064,k127_53160_2_Mtub_embC_EMB_[Mycobacterium_tube...,CTAAAGTAGTATAAGGAATCATTTGTGTGCCTTTCTATGAATGTAA...
5065,k127_24089_2_Mlep_gyrB_FLO_[Mycobacterium_lepr...,ATGTGGGAGTTGGCGGTGGCGCTGTCCAGCAGGACGCGGTCATCCG...
5066,k127_163421_2_Mtub_gyrB_FLO_[Mycobacterium_tub...,TAGAATGCGCCTCTACGGAGGTAGAAAATTTTGAGCGGTGCGCAGC...


In [24]:
import pandas as pd


# Augment the data with the reverse-complement strand of every merged sequence.
# The table also complements the IUPAC ambiguity codes (R<->Y, K<->M, B<->V,
# D<->H; S, W and N are their own complement). With a plain ATCG table these
# codes were passed through unchanged and later resolved to the wrong bases.
complement_table = str.maketrans('ATCGRYKMBVDHSWN', 'TAGCYRMKVBHDSWN')

df_additional = pd.DataFrame({
    # '_inv' marks the reverse-complement copy of a sample.
    'ids': merged_df['ids'] + '_inv',
    # Reverse each sequence, then complement it base by base.
    'seq': merged_df['seq'].str[::-1].str.translate(complement_table),
})

# Originals first, reverse complements after, with a fresh 0..n-1 index.
df_antibiotics_and_conjugate = pd.concat([merged_df, df_additional], ignore_index=True)



In [25]:
# Preview the combined table: synthetic contigs followed by their '_inv' copies.
df_antibiotics_and_conjugate

Unnamed: 0,ids,seq
0,k127_321743_2_Mmor_gyrB_FLO_[Morganella_morgan...,GTTCAGGCTCAAGCTCTCGAAAAAGGCGGATCTCTTCGGCATGGGT...
1,k127_156623_2_Mlep_gyrA_FLO_[Mycobacterium_lep...,GGATGAAAGCGCTCATGCCTTTAGCGCCTTTGGATTTGTCCGTGGA...
2,k127_184179_2_Sser_gyrB_FLO_[Salmonella_enteri...,GCCTAGCGGTGAAATTAAAAAGGGTACTGTCAGCAAAGCTGTGGTT...
3,k127_46124_2_Kaer_Omp36_[Klebsiella_aerogenes],TTCATACTCCTTGTCTTCCAGACGGATTATCAGTTCCCCCTTACTT...
4,k127_266552_2_Sent_ramR_[Salmonella_enterica_s...,CTGGATGATAAGAATTTTGTTTTTGAATCTTGGCTTACTCCCGATG...
...,...,...
10131,k127_248203_2_Ngon_porin_[Neisseria_gonorrhoea...,GTGCTAATGTGACGGTATTGTCTTCGGTGGAAGATGTAGAATTCAT...
10132,k127_53160_2_Mtub_embC_EMB_[Mycobacterium_tube...,CTCTGTGATAAATTCTACACGACGGGACGGTACATTTTTCGCTGCT...
10133,k127_24089_2_Mlep_gyrB_FLO_[Mycobacterium_lepr...,TCCCCAGAAGATAGGATAGTAATACGCCGGGCGGACAACGCCAATC...
10134,k127_163421_2_Mtub_gyrB_FLO_[Mycobacterium_tub...,CACCCGGAATTATGCGGTAATAGTTTTATTATACCGCAATTCCGGC...


In [26]:
# Sanity check: list rows of df_antibiotics_and_conjugate whose 'ids' value
# already appeared in an earlier row. An empty `duplicates` frame means every
# id is unique.
id_is_repeated = df_antibiotics_and_conjugate.duplicated('ids')
duplicates = df_antibiotics_and_conjugate[id_is_repeated]

if duplicates.empty:
    print("There are no duplicate ids in the DataFrame.")
else:
    print("There are duplicate ids in the DataFrame:")
    print(duplicates)


There are no duplicate ids in the DataFrame.


In [27]:
# Replace "/" with "$" in the "ids" column (the ids are later used as file names,
# where "/" would be read as a directory separator).
# regex=False makes this a literal replacement regardless of the pandas
# version's default for `regex`.
df_antibiotics_and_conjugate['ids'] = df_antibiotics_and_conjugate['ids'].str.replace("/", "$", regex=False)

In [28]:
# Plain Python lists of ids and sequences, in matching row order.
antibiotic_ids = list(df_antibiotics_and_conjugate['ids'])
antibiotic_list = list(df_antibiotics_and_conjugate['seq'])


In [29]:
# Number of sequences to render (synthetic contigs plus their reverse complements).
len(antibiotic_list)


10136

In [30]:
# Must equal the number of sequences above: ids and sequences are paired by position.
len(antibiotic_ids)

10136

In [31]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename, image_size_px=100, dpi=100):
    """Render CGR points as a grayscale density image and write it to `output_filename`.

    Parameters
    ----------
    sequence_to_plot : list of (x, y)
        CGR trajectory, e.g. the output of ``easy_cgr``.
    kmeres : int
        The histogram grid has ``2 ** kmeres`` bins along each axis.
    output_filename : str or path-like
        Destination image path; its directory must already exist.
    image_size_px : int, optional
        Side of the square figure canvas in pixels (default 100, the previous
        hard-coded value).
    dpi : int, optional
        Resolution used for both the figure and the saved file (default 100).

    Notes
    -----
    ``bbox_inches='tight'`` crops the saved file to the drawn content, so the
    file on disk may be smaller than ``image_size_px`` per side.
    """
    num_bins = 2 ** kmeres

    # Square grid: the same bin edges are used on both axes.
    edges = np.linspace(-1, 1, num_bins + 1)
    hist, _, _ = np.histogram2d(
        [p[0] for p in sequence_to_plot],
        [p[1] for p in sequence_to_plot],
        bins=[edges, edges]
    )

    # White for empty cells, shading to dark gray for the densest ones.
    colors = [(1, 1, 1), (0.2, 0.2, 0.2)]
    custom_cmap = LinearSegmentedColormap.from_list("custom_gray", colors, N=len(sequence_to_plot) + 1)

    side_inches = image_size_px / dpi
    fig, ax = plt.subplots(figsize=(side_inches, side_inches), dpi=dpi)
    try:
        ax.imshow(
            hist.T,
            cmap=custom_cmap,
            origin='lower',
            extent=[-1, 1, -1, 1],  # data coordinates of the image corners (not a pixel size)
            interpolation='nearest'
        )
        ax.axis('off')
        fig.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    finally:
        # Always release the figure, including when saving fails; otherwise a
        # run over thousands of sequences accumulates open figures.
        plt.close(fig)

# Batch helper: writes one CGR image per (sequence, label) pair, named after the label
def generate_and_save_cgr_images_label(sequences, class_labels, output_dir, kmeres):
    """Save one CGR image per sequence as ``<output_dir>/<label>.png``.

    `sequences` and `class_labels` are paired by position. The directory is
    created when missing. Because the file name is the label alone, two equal
    labels write to the same file.
    """
    os.makedirs(output_dir, exist_ok=True)

    for sequence, label in zip(sequences, class_labels):
        target_path = os.path.join(output_dir, f"{label}.png")
        generate_cgr_and_save(sequence, kmeres, target_path)


In [32]:
import time
start_time = time.time()

# Settings for this run.
output_directory = "ARG_and_inv_conjugates_images_synthetic_patient_res_7"
kmeres=7

# Resolve ambiguity codes first so that every base maps to a CGR corner,
# then turn each sequence into its CGR trajectory.
all_seq = [replace_ambiguous_bases(seq) for seq in antibiotic_list]
seq_num = [easy_cgr(sequence) for sequence in all_seq]

sequences = seq_num
class_labels = antibiotic_ids

os.makedirs(output_directory, exist_ok=True)

# One image per sequence, named after its id.
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Report the wall-clock time of the whole cell.
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 553.3582837581635 seconds


###  ARG HEALTHY

In [33]:
import pandas as pd

# Load the healthy-20 contigs; each record is [header fields..., sequence].
file_name = "Healthy20.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)
# read_fasta_ARG splits headers on '|', so the two-column layout below only
# fits if no header in this file contains '|'.
healthy_20 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])






In [34]:
# Define a function to extract the desired pattern
def extract_id_20(string, suffix='_20'):
    """Return the first ``k127_<digits>`` token found in `string`, with `suffix` appended.

    Parameters
    ----------
    string : str
        Text to search, e.g. a raw FASTA header.
    suffix : str, optional
        Tag appended to the token to mark the sample of origin (default ``'_20'``).

    Returns
    -------
    str or None
        ``'k127_<digits>' + suffix``, or ``None`` when `string` is not a string
        or contains no such token.
    """
    # Missing values (None/NaN) coming from a DataFrame column have no id.
    if not isinstance(string, str):
        return None
    match = re.search(r'(k127_\d+)', string)
    return match.group(1) + suffix if match else None

# Replace every raw header with its normalized contig id
# (entries without a k127_<digits> token become None).
healthy_20['ids'] = healthy_20['ids'].apply(extract_id_20)

In [35]:
# Preview the healthy contigs with normalized ids.
healthy_20

Unnamed: 0,ids,seq
0,k127_30519_20,ATATATATGTATGCCCTATGCACTACACAGTATCATTAGAAAGACA...
1,k127_54251_20,AGATGCTTACAAGGAATTAAATCCGGCAGATGTAGCTTACTATGAT...
2,k127_91540_20,CCTTATTGTTTAACGATGAAAAGAATATTGACAATCGTATAGGAGA...
3,k127_98319_20,GTCCCACACCGGGGTCAGGCTAGTCTGCCGCTGTTCCCCGCTGCGG...
4,k127_10173_20,GGATATGATGGAGCTGGCCCAGGAGTACCGCACCAAGCTGCTGGAC...
...,...,...
14888,k127_30435_20,TCATGCAGTTTGGGCCTTCCTTACTGCATCTTCGGTCAATCATATC...
14889,k127_30436_20,CTTATCATTCGGCTGTACTCCGACTTGTTCTATATAGCTGTGCGGA...
14890,k127_30446_20,CCGATAAAACCGGACAATGTGCGTTACAGAGCAAAACGACTGTTAC...
14891,k127_30447_20,ATATATAGACATTTACTCTCTGGGAATTTGGAGCTTAGTTCTAATT...


In [36]:
# Sanity check on healthy_20: list rows whose 'seq' value already appeared in
# an earlier row. An empty `duplicates` frame means every sequence is unique.
seq_is_repeated = healthy_20.duplicated('seq')
duplicates = healthy_20[seq_is_repeated]

if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)

There are no duplicate seq in the DataFrame.


In [37]:
# Hold out positional rows 5000..10067 (end-exclusive slice) of the healthy
# contigs as hosts for the synthetic samples.
# NOTE(review): 10068 - 5000 = 5068 is presumably the number of ARG reference
# sequences — confirm and derive it from the data instead of hard-coding.
# Note: this rebinds the names that previously held the patient-2 hold-out rows.
pre_synthetic_rows = healthy_20.iloc[5000:10068]

# Convert to DataFrame
# NOTE(review): healthy_20.iloc[...] is already a DataFrame, so this wrap looks redundant — confirm before removing.
pre_synthetic_rows_df = pd.DataFrame(pre_synthetic_rows)

In [38]:
# Preview the held-out healthy contigs.
pre_synthetic_rows_df

Unnamed: 0,ids,seq
5000,k127_18120_20,GTATATGGTTGGTATGACCGGCTGGCCAGACAGCACGCTCTGGCCA...
5001,k127_45057_20,GTGTAAGACGAGACCAAGAAATTTTAATTGGAGGGAGGTGAACTCT...
5002,k127_109587_20,CAACAGGAAGCCCCTCTGTGTACTCATTCGCAGGCCAAACACCCTC...
5003,k127_92322_20,AGAATCACAGCGTATAAGAAAAGAATCTTTAAATGCATTGGATGGA...
5004,k127_112841_20,GCAAAAAGCGGGAACGACACCCCGGGATGCCCGCCATGCGGAGGCG...
...,...,...
10063,k127_100599_20,CCATTCGCCTGCCAATTTAATACGTCTTCTCATGATCCCGCTTGGA...
10064,k127_73671_20,AAACCATGGCGGACTGTATCGTAACCTGCAACTGTTGCGTGATACC...
10065,k127_86823_20,CAAACGCAATGAGTACACGTGCAACGGCGGATAACGGTTCCGACGT...
10066,k127_46183_20,AGTATAGTATTATCACAGGTTTGCCACAAGATTTAAGCTTCAAACT...


In [39]:
import pandas as pd

# Reload the ARG reference FASTA (same file as in the patient section).
# Note: this rebinds `file_name` and `fasta_sequences`, which previously held the healthy data.
file_name = "AntibioticResistanceGenes.fasta"
fasta_sequences = read_fasta_ARG(file_name)

In [40]:
# Build the ARG table: the six '|'-separated header fields plus the sequence.
# Every header must therefore contain exactly five '|' characters, otherwise
# the column count will not match.
data_antibiotic = pd.DataFrame(fasta_sequences, columns=['ID Prefix', 'ID', 'Strand', 'Location',"ARO", 'Description', 'Sequence'])

# Make the description usable as an id: literal space -> underscore.
# regex=False states the literal match explicitly, so it does not depend on
# the pandas version's default for `regex`.
data_antibiotic["Description"] = data_antibiotic["Description"].str.replace(' ', '_', regex=False)
# Reassign instead of inplace=True so the cell reads as a plain transformation.
data_antibiotic = data_antibiotic.rename(columns={'Description': 'ids', 'Sequence': 'seq'})

In [41]:
# Preview the full ARG reference table.
data_antibiotic

Unnamed: 0,ID Prefix,ID,Strand,Location,ARO,ids,seq
0,gb,GQ343019.1,+,132-1023,ARO:3002999,CblA-1_[mixed_culture_bacterium_AX_gF3SD01_15],ATGAAAGCATATTTCATCGCCATACTTACCTTATTCACTTGTATAG...
1,gb,HQ845196.1,+,0-861,ARO:3001109,SHV-52_[Klebsiella_pneumoniae],ATGCGTTATATTCGCCTGTGTATTATCTCCCTGTTAGCCGCCCTGC...
2,gb,AF028812.1,+,392-887,ARO:3002867,dfrF_[Enterococcus_faecalis],ATGATAGGTTTGATTGTTGCGAGGTCAAAGAATAATGTTATAGGCA...
3,gb,JX017365.1,+,244-1120,ARO:3001989,CTX-M-130_[Escherichia_coli],ATGGTGACAAAGAGAGTGCAACGGATGATGTTCGCGGCGGCGGCGT...
4,gb,JN967644.1,+,0-813,ARO:3002356,NDM-6_[Escherichia_coli],ATGGAATTGCCCAATATTATGCACCCGGTCGCGAAGCTGAGCACCG...
...,...,...,...,...,...,...,...
5063,gb,NC_021002.1,-,901870-904756,ARO:3004177,Mfer_23S_MAC_[Mycoplasmopsis_fermentans_PG18],CTTTTAAACCGATCGATTTATTAGTATTGGTCAGCTCAACGTATTA...
5064,gb,U00096.1,+,4166659-4168200,ARO:3003411,Ecol_16S_TET_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5065,gb,U00096.1,+,223771-225312,ARO:3003372,Ecol_16rrsH_SPT_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5066,gb,NC_005353.1,+,38549-40023,ARO:3003978,Crei_16rrnS_STR_[Chlamydomonas_reinhardtii],ATCCATGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCAT...


In [42]:
# Keep only the two columns used downstream: the id and the sequence.
df_ids_antibiotics = data_antibiotic.loc[:, ["ids", "seq"]]

In [43]:
# Preview the reduced ARG table (ids and sequences only).
df_ids_antibiotics

Unnamed: 0,ids,seq
0,CblA-1_[mixed_culture_bacterium_AX_gF3SD01_15],ATGAAAGCATATTTCATCGCCATACTTACCTTATTCACTTGTATAG...
1,SHV-52_[Klebsiella_pneumoniae],ATGCGTTATATTCGCCTGTGTATTATCTCCCTGTTAGCCGCCCTGC...
2,dfrF_[Enterococcus_faecalis],ATGATAGGTTTGATTGTTGCGAGGTCAAAGAATAATGTTATAGGCA...
3,CTX-M-130_[Escherichia_coli],ATGGTGACAAAGAGAGTGCAACGGATGATGTTCGCGGCGGCGGCGT...
4,NDM-6_[Escherichia_coli],ATGGAATTGCCCAATATTATGCACCCGGTCGCGAAGCTGAGCACCG...
...,...,...
5063,Mfer_23S_MAC_[Mycoplasmopsis_fermentans_PG18],CTTTTAAACCGATCGATTTATTAGTATTGGTCAGCTCAACGTATTA...
5064,Ecol_16S_TET_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5065,Ecol_16rrsH_SPT_[Escherichia_coli_str._K-12],AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...
5066,Crei_16rrnS_STR_[Chlamydomonas_reinhardtii],ATCCATGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCAT...


### Create the synthetic sequences (ARG inserted into healthy contigs)

In [44]:
import pandas as pd
import random

# Build synthetic contigs: splice one ARG reference sequence into each held-out
# host contig at a random position.
# Inputs: pre_synthetic_rows_df (hosts) and df_ids_antibiotics (ARG references).

# Local seeded generator: Restart & Run All reproduces the same insertion
# points, and the global `random` state is left untouched.
insertion_rng = random.Random(42)

merged_seqs = []
merged_ids = []
n_antibiotics = len(df_ids_antibiotics)

for idx, row in pre_synthetic_rows_df.iterrows():
    synthetic_seq = row['seq']

    # `idx` is the host row's index label (not its position); the modulo wraps
    # it onto the ARG table, so the label must be an integer.
    antibiotic_row = df_ids_antibiotics.iloc[idx % n_antibiotics]
    antibiotic_seq = antibiotic_row['seq']

    # randint includes both ends, so the ARG may also be placed before the
    # first base or after the last one.
    insertion_point = insertion_rng.randint(0, len(synthetic_seq))

    merged_seqs.append(synthetic_seq[:insertion_point] + antibiotic_seq + synthetic_seq[insertion_point:])
    # Combined id: <host id>_<ARG id>
    merged_ids.append(row['ids'] + '_' + antibiotic_row['ids'])

# Create DataFrame from merged sequences and ids
merged_df = pd.DataFrame({'ids': merged_ids, 'seq': merged_seqs})






In [45]:
# Preview the synthetic contigs (healthy host contig with an ARG spliced in).
merged_df

Unnamed: 0,ids,seq
0,k127_18120_20_Cpsi_16S_SPT_[Chlamydia_psittaci...,GTATATGGTTGGTATGACCGGCTGGCCAGACAGCACGCTCTGGCCA...
1,k127_45057_20_Bbur_16S_GEN_[Borreliella_burgdo...,GTGTAAGACGAGACCAAGAAATTTTAATTGGAGGGAGGTGAACTCT...
2,k127_109587_20_Mtub_16S_KAN_[Mycobacterium_tub...,CAACAGGAAGCCCCTCTGTGTACTCATTCGCAGGCCAAACACCCTC...
3,k127_92322_20_Bbur_16S_KAN_[Borreliella_burgdo...,AGAATCACAGCGTATAAGAAAAGAATCTTTAAATGCATTGGATGGA...
4,k127_112841_20_Bbur_16S_SPT_[Borreliella_burgd...,GCAAAAAGCGGGAACGACACCCCGGGATGCCCGCCATGCGGAGGCG...
...,...,...
5063,k127_100599_20_Ecol_16rrsB_PAR_[Escherichia_co...,CCATTCGCCTGCCAATTTAATACGTCTTCTCATGATCCCGCTTGGA...
5064,k127_73671_20_Ecol_16rrsB_TOB_[Escherichia_col...,AAACCATGGCGGACTGTATCGTAACCTGCAACTGTTGCGTGATACC...
5065,k127_86823_20_Ecol_16rrsB_SPT_[Escherichia_col...,CAAACGCAATGAGTACACGTGCAACGGCGGATAACGGTTCCGACGT...
5066,k127_46183_20_Mtub_16S_VIO_[Mycobacterium_tube...,AGTATAGTATTATCACAGGTTTGCCACAAGATTTAAGCTTCAAACT...


In [46]:
import pandas as pd


# Augment the data set with the reverse complement of every merged sequence.
# Each new row reuses the original id with an '_inv' suffix.

# Complement table, built once instead of once per row.
# Besides A/T/C/G it maps the IUPAC ambiguity codes to their complements
# (R<->Y, K<->M, B<->V, D<->H); S, W and N are their own complement and are
# therefore left as they are. Without these entries an ambiguity code would
# stay uncomplemented and later be resolved (replace_ambiguous_bases) to a
# base that does not pair with the forward strand.
complement_table = str.maketrans('ATCGRYKMBVDH', 'TAGCYRMKVBHD')

# One dict per original row: reverse the sequence, then complement each base.
new_rows = [
    {'ids': seq_id + '_inv', 'seq': seq[::-1].translate(complement_table)}
    for seq_id, seq in zip(merged_df['ids'], merged_df['seq'])
]

# Convert the list of dictionaries to a DataFrame
df_additional = pd.DataFrame(new_rows)

# Original rows first, reverse-complement rows after them, with a fresh 0..n-1 index
df_antibiotics_and_conjugate = pd.concat([merged_df, df_additional], ignore_index=True)



In [47]:
# Inspect the combined frame: the merged rows followed by their '_inv' rows.
df_antibiotics_and_conjugate

Unnamed: 0,ids,seq
0,k127_18120_20_Cpsi_16S_SPT_[Chlamydia_psittaci...,GTATATGGTTGGTATGACCGGCTGGCCAGACAGCACGCTCTGGCCA...
1,k127_45057_20_Bbur_16S_GEN_[Borreliella_burgdo...,GTGTAAGACGAGACCAAGAAATTTTAATTGGAGGGAGGTGAACTCT...
2,k127_109587_20_Mtub_16S_KAN_[Mycobacterium_tub...,CAACAGGAAGCCCCTCTGTGTACTCATTCGCAGGCCAAACACCCTC...
3,k127_92322_20_Bbur_16S_KAN_[Borreliella_burgdo...,AGAATCACAGCGTATAAGAAAAGAATCTTTAAATGCATTGGATGGA...
4,k127_112841_20_Bbur_16S_SPT_[Borreliella_burgd...,GCAAAAAGCGGGAACGACACCCCGGGATGCCCGCCATGCGGAGGCG...
...,...,...
10131,k127_100599_20_Ecol_16rrsB_PAR_[Escherichia_co...,TGAGAGCCAGTGCAAAGGGCGGATACATAAAATAATCGGATGGAAA...
10132,k127_73671_20_Ecol_16rrsB_TOB_[Escherichia_col...,AAATCCAGTCAACCCCGGATATTTATAACTTGCAGAAGTTATGTGG...
10133,k127_86823_20_Ecol_16rrsB_SPT_[Escherichia_col...,CTTTTATAGATTGTATTATTGATACTACTAAAAATAGTTGGAAATA...
10134,k127_46183_20_Mtub_16S_VIO_[Mycobacterium_tube...,CAATTTTCATATGAAGCTTCTTAAAAGTACTTTTGTTTGGTTAGCG...


In [48]:
# Sanity check on the "ids" column: every id is expected to be unique.
# `duplicated('ids')` marks each repeated occurrence of an id after the first one.
duplicate_mask = df_antibiotics_and_conjugate.duplicated('ids')
duplicates = df_antibiotics_and_conjugate[duplicate_mask]

# An empty selection means no id occurs more than once.
if duplicates.empty:
    print("There are no duplicate ids in the DataFrame.")
else:
    print("There are duplicate ids in the DataFrame:")
    print(duplicates)


There are no duplicate ids in the DataFrame.


In [49]:
# Replace "/" with "$" in the "ids" column.
# regex=False makes this a plain literal substitution, independent of the
# default that the installed pandas version uses for the `regex` argument.
df_antibiotics_and_conjugate['ids'] = df_antibiotics_and_conjugate['ids'].str.replace("/", "$", regex=False)

In [50]:
# Pull the ids and the sequences out of the frame as two parallel Python lists.
antibiotic_ids, antibiotic_list = (
    df_antibiotics_and_conjugate[column].tolist() for column in ('ids', 'seq')
)


In [51]:
# Number of sequences taken from the 'seq' column.
len(antibiotic_list)


10136

In [52]:
# Number of ids taken from the 'ids' column; both lists come from the same frame.
len(antibiotic_ids)

10136

In [53]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename):
    """Bin a list of 2-D points into a square histogram and save it as an image.

    Parameters
    ----------
    sequence_to_plot : sequence of (x, y) pairs
        Points to bin. Only item 0 (x) and item 1 (y) of every element are
        read. The sequence is iterated twice and passed to ``len``.
    kmeres : int
        Resolution exponent: the grid has ``2 ** kmeres`` bins per axis,
        spanning [-1, 1] on both axes. Points outside that range are not
        counted by ``np.histogram2d``.
    output_filename : str or path-like
        Destination passed to ``Figure.savefig``.

    Notes
    -----
    The figure is created at 1 x 1 inch and 100 dpi. It is saved with
    ``bbox_inches='tight'``, so matplotlib determines the final pixel size of
    the written file.
    """
    # Define the resolution
    num_bins = 2 ** kmeres

    # Bin edges of the histogram grid on [-1, 1] x [-1, 1]
    xedges = np.linspace(-1, 1, num_bins + 1)
    yedges = np.linspace(-1, 1, num_bins + 1)

    # Count how many points fall into each grid cell
    hist, _, _ = np.histogram2d(
        [p[0] for p in sequence_to_plot],
        [p[1] for p in sequence_to_plot],
        bins=[xedges, yedges]
    )

    # Custom colormap running from white (empty cell) to dark gray, with one
    # colour level per possible count (0 .. number of points).
    colors = [(1, 1, 1), (0.2, 0.2, 0.2)]
    cmap_name = "custom_gray"
    custom_cmap = LinearSegmentedColormap.from_list(cmap_name, colors, N=len(sequence_to_plot) + 1)

    # Create a square figure with the desired size in inches
    dpi = 100
    fig, ax = plt.subplots(figsize=(100 / dpi, 100 / dpi), dpi=dpi)

    # Work on the explicit figure handle and always release it: this function
    # is called once per image, so a figure left open after a failed
    # imshow/savefig would accumulate in pyplot's figure registry.
    try:
        ax.imshow(
            hist.T,  # histogram2d puts x on axis 0; imshow expects rows = y
            cmap=custom_cmap,
            origin='lower',
            extent=[-1, 1, -1, 1],  # data coordinates covered by the image
            interpolation='nearest'
        )

        # Turn off the axis
        ax.axis('off')

        # Save this figure (not whatever pyplot considers "current")
        fig.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    finally:
        plt.close(fig)

# Batch helper: writes one CGR image per sequence, named after its label.
def generate_and_save_cgr_images_label(sequences, class_labels, output_dir, kmeres):
    """Save one CGR image per sequence, naming each file after its label.

    Parameters
    ----------
    sequences : iterable
        Items handed one by one to ``generate_cgr_and_save`` as the points to plot.
    class_labels : iterable
        Labels paired positionally with ``sequences``; each image is written
        to ``<output_dir>/<label>.png``.
    output_dir : str or path-like
        Target directory; created (including parents) if it does not exist.
    kmeres : int
        Resolution argument forwarded unchanged to ``generate_cgr_and_save``.

    Notes
    -----
    * The file name is built from the label only, so two identical labels
      write to the same file and the later image replaces the earlier one.
    * Pairing uses ``zip``, which stops at the shorter of the two inputs.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    for sequence, label in zip(sequences, class_labels):
        # The label alone determines the file name
        image_filename = os.path.join(output_dir, f"{label}.png")

        # Generate and save the CGR image
        generate_cgr_and_save(sequence, kmeres, image_filename)


In [54]:
import time
# Render every sequence (original and '_inv') as a CGR image and time the run.
start_time = time.time()

# Resolve ambiguity codes first. replace_ambiguous_bases draws from `random`,
# so the resolved bases differ between runs unless the generator is seeded.
all_seq = [replace_ambiguous_bases(seq) for seq in antibiotic_list]

# Convert each resolved sequence with easy_cgr (defined elsewhere in the
# notebook; presumably returns the (x, y) CGR points - confirm there).
seq_num = [easy_cgr(sequence) for sequence in all_seq]

sequences = seq_num
class_labels = antibiotic_ids  # ids double as image file names
output_directory = "ARG_and_inv_conjugates_images_synthetic_healthy_res_7"
kmeres = 7

# Create the target directory in one step instead of check-then-create
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 930.5839138031006 seconds
