In [1]:
import pandas as pd
import os
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import load_model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc




In [2]:
import random

def replace_ambiguous_bases(sequence):
    """Return `sequence` with each IUPAC ambiguity code replaced by a concrete base.

    Every ambiguous character is resolved independently with ``random.choice``
    over the bases that code can stand for. Characters that are not one of the
    upper-case codes listed below (A/C/G/T, lower-case letters, gaps, ...) are
    copied through unchanged. The output depends on the state of the global
    ``random`` generator.
    """
    # Candidate bases per ambiguity code. The order inside each list is kept
    # exactly as before so that a seeded run draws the same bases.
    candidates = {
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'S': ['G', 'C'],
        'W': ['A', 'T'],
        'K': ['G', 'T'],
        'M': ['A', 'C'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'T', 'C', 'G'],
    }

    def resolve_ambiguity(code):
        options = candidates.get(code)
        # Not an ambiguity code: keep the character as it is.
        return code if options is None else random.choice(options)

    return ''.join(resolve_ambiguity(base) for base in sequence)

In [3]:
def reverse_complement(sequence):
    """Return the reverse complement of an upper-case A/C/G/T sequence.

    Raises KeyError for any character other than 'A', 'C', 'G' or 'T'
    (including lower-case letters and ambiguity codes such as 'N').
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Read the sequence back-to-front and complement each base on the way.
    return ''.join(map(pairing.__getitem__, sequence[::-1]))

def reverse_complement_list(sequence_list):
    """Return a new list holding the reverse complement of every sequence, in input order."""
    return [reverse_complement(seq) for seq in sequence_list]

In [4]:
#This function gets the cgr for the sequences
def easy_cgr(dna_sequence):
    """Compute the chaos-game-representation (CGR) points of a DNA sequence.

    Starting from (0, 0), each nucleotide moves the current point halfway
    towards that nucleotide's corner of the square [-1, 1] x [-1, 1]:
    A=(1, 1), T=(-1, -1), C=(1, -1), G=(-1, 1).

    Returns a list with one (x, y) tuple per nucleotide, in sequence order.
    Raises KeyError for any character other than upper-case A, T, C or G.
    """
    corners = {
        "A": (1, 1),
        "T": (-1, -1),
        "C": (1, -1),
        "G": (-1, 1),
    }
    trajectory = []
    x, y = 0, 0
    for nucleotide in dna_sequence:
        corner_x, corner_y = corners[nucleotide]
        # Midpoint between the current position and the nucleotide's corner.
        x, y = (x + corner_x) / 2, (y + corner_y) / 2
        trajectory.append((x, y))
    return trajectory

In [5]:
# plotting_cgr inputs:
#   sequence_to_plot - the list of CGR points returned by easy_cgr(sequence)
#   id_to_plot       - the name shown in the plot title
#   resolution       - the number of histogram bins per axis
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
    """Show a grayscale CGR density plot for one sequence.

    sequence_to_plot: sequence of (x, y) points, e.g. the output of easy_cgr.
    id_to_plot: string appended to the plot title.
    resolution: number of histogram bins along each axis.
    """
    # The same edges are used for both axes of the [-1, 1] x [-1, 1] square.
    bin_edges = np.linspace(-1, 1, resolution + 1)
    xs = [point[0] for point in sequence_to_plot]
    ys = [point[1] for point in sequence_to_plot]
    density, _, _ = np.histogram2d(xs, ys, bins=[bin_edges, bin_edges])

    # White-to-dark-gray colormap with one level per possible count.
    grayscale = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    plt.figure()
    # histogram2d puts x on the first axis, so transpose for imshow.
    plt.imshow(density.T, cmap=grayscale, extent=[-1, 1, -1, 1], origin='lower', interpolation='nearest', aspect='auto')
    plt.xticks([])
    plt.yticks([])

    # Nucleotide letters sit just outside the matching corner of the square.
    margin = 0.05
    offset = 1 + margin
    corner_signs = (("A", 1, 1), ("T", -1, -1), ("C", 1, -1), ("G", -1, 1))
    for letter, sign_x, sign_y in corner_signs:
        plt.text(sign_x * offset, sign_y * offset, letter, ha='center', va='center', fontsize=15, color='black')

    plt.title('CGR for '+id_to_plot)
    plt.grid(False)
    plt.show()

In [6]:
# This function reads a FASTA file of antibiotic resistance genes
import re

def read_antibiotic_resistance_genes(fasta_file_path):
    """Read a FASTA file and return each sequence with its bracketed header term.

    Parameters
    ----------
    fasta_file_path : str
        Path of the FASTA file to read.

    Returns
    -------
    (sequences, terms_inside_brackets) : tuple of two parallel lists
        ``sequences[i]`` is the concatenated sequence text of record ``i``;
        ``terms_inside_brackets[i]`` is the text of the first ``[...]`` group
        in that record's header, or ``None`` when the header has no such group.

    Raises
    ------
    ValueError
        If non-blank text appears before the first '>' header line.
    """
    # Matches text inside square brackets.
    bracket_pattern = re.compile(r'\[([^]]+)\]')

    terms_inside_brackets = []
    sequence_chunks = []  # one list of line fragments per record

    with open(fasta_file_path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no data; skipping them also prevents a
                # crash when the file starts with an empty line.
                continue

            if line.startswith(">"):
                match = bracket_pattern.search(line[1:])
                # A header without brackets yields None instead of failing
                # on `.group` of a missing match.
                terms_inside_brackets.append(match.group(1) if match else None)
                sequence_chunks.append([])
            elif sequence_chunks:
                sequence_chunks[-1].append(line)
            else:
                raise ValueError(
                    "Sequence data found before the first '>' header line."
                )

    sequences = ["".join(chunks) for chunks in sequence_chunks]
    return sequences, terms_inside_brackets

In [7]:
from Bio import SeqIO

def read_fasta_patients(file_path):
    """Parse a FASTA file with Biopython's SeqIO.

    Returns (sequences, sequence_ids): two parallel lists holding each
    record's sequence as a string and its ``id``. If the file does not exist,
    a message is printed and two empty lists are returned.
    """
    ids, seqs = [], []

    try:
        with open(file_path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                ids.append(record.id)
                seqs.append(str(record.seq))
    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")

    return seqs, ids


In [8]:
# plotting_cgr inputs:
#   sequence_to_plot - the list of CGR points returned by easy_cgr(sequence)
#   id_to_plot       - the name shown in the plot title
#   resolution       - the number of histogram bins per axis
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
    """Show a grayscale CGR density plot for one sequence.

    sequence_to_plot: sequence of (x, y) points, e.g. the output of easy_cgr.
    id_to_plot: string appended to the plot title.
    resolution: number of histogram bins along each axis.
    """
    # The same edges are used for both axes of the [-1, 1] x [-1, 1] square.
    bin_edges = np.linspace(-1, 1, resolution + 1)
    xs = [point[0] for point in sequence_to_plot]
    ys = [point[1] for point in sequence_to_plot]
    density, _, _ = np.histogram2d(xs, ys, bins=[bin_edges, bin_edges])

    # White-to-dark-gray colormap with one level per possible count.
    grayscale = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    plt.figure()
    # histogram2d puts x on the first axis, so transpose for imshow.
    plt.imshow(density.T, cmap=grayscale, extent=[-1, 1, -1, 1], origin='lower', interpolation='nearest', aspect='auto')
    plt.xticks([])
    plt.yticks([])

    # Nucleotide letters sit just outside the matching corner of the square.
    margin = 0.05
    offset = 1 + margin
    corner_signs = (("A", 1, 1), ("T", -1, -1), ("C", 1, -1), ("G", -1, 1))
    for letter, sign_x, sign_y in corner_signs:
        plt.text(sign_x * offset, sign_y * offset, letter, ha='center', va='center', fontsize=15, color='black')

    plt.title('CGR for '+id_to_plot)
    plt.grid(False)
    plt.show()


In [9]:
import random

def replace_ambiguous_bases(sequence, rng=None):
    """Return `sequence` with each IUPAC ambiguity code replaced by a concrete base.

    Parameters
    ----------
    sequence : iterable of single-character strings (normally a str)
        The nucleotide sequence to resolve.
    rng : object with a ``choice(seq)`` method, optional
        Source of randomness, e.g. ``random.Random(seed)``. Passing a seeded
        generator makes the resolution reproducible without touching the
        global ``random`` state. Defaults to the module-level ``random``
        generator, which is the previous behaviour.

    Returns
    -------
    str
        The sequence with every upper-case ambiguity code (R, Y, S, W, K, M,
        B, D, H, V, N) replaced by one of the bases it stands for. Any other
        character is copied through unchanged.
    """
    chooser = random if rng is None else rng

    # Candidate bases per ambiguity code; list order is unchanged so that the
    # default (global generator) path draws exactly as before.
    candidates = {
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'S': ['G', 'C'],
        'W': ['A', 'T'],
        'K': ['G', 'T'],
        'M': ['A', 'C'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'T', 'C', 'G'],
    }

    def resolve_ambiguity(code):
        options = candidates.get(code)
        # If the code is not an ambiguity code, return the same code
        return code if options is None else chooser.choice(options)

    return ''.join(resolve_ambiguity(base) for base in sequence)

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename):
    """Render CGR points as a grayscale density image and save it to a file.

    sequence_to_plot: sequence of (x, y) points, e.g. the output of easy_cgr.
    kmeres: the histogram uses 2 ** kmeres bins along each axis.
    output_filename: destination path handed to matplotlib's savefig.

    The figure is created as 0.84 x 0.84 inches at 100 dpi and saved with a
    tight bounding box and no padding.
    """
    bins_per_axis = 2 ** kmeres
    # The same edges are used for both axes of the [-1, 1] x [-1, 1] square.
    edges = np.linspace(-1, 1, bins_per_axis + 1)

    xs = [point[0] for point in sequence_to_plot]
    ys = [point[1] for point in sequence_to_plot]
    counts, _, _ = np.histogram2d(xs, ys, bins=[edges, edges])

    # White-to-dark-gray colormap with one level per possible count.
    shades = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    dpi = 100
    side_inches = 84 / dpi
    fig, ax = plt.subplots(figsize=(side_inches, side_inches), dpi=dpi)

    # `extent` fixes the data coordinates of the image, not its pixel size.
    ax.imshow(
        counts.T,
        cmap=shades,
        origin='lower',
        extent=[-1, 1, -1, 1],
        interpolation='nearest',
    )
    ax.axis('off')

    plt.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    plt.close()

# Batch helper: writes one CGR image per sequence, named by index and id
def generate_and_save_cgr_images(sequences, ids, output_dir, kmeres):
    """Write one CGR image per sequence into `output_dir`.

    Parameters
    ----------
    sequences : iterable
        CGR point lists, each passed on to generate_cgr_and_save.
    ids : iterable
        Labels paired with `sequences` by position; pairing stops at the
        shorter of the two.
    output_dir : str
        Destination directory. It is created, including missing parents,
        when it does not exist yet.
    kmeres : int
        Forwarded to generate_cgr_and_save.

    Each file is named with the sequence's position and its label, so
    repeated labels do not overwrite each other.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    for i, (sequence, label) in enumerate(zip(sequences, ids)):
        # Generate the image filename based on the class label and sequence index
        image_filename = os.path.join(output_dir, f"{i}_{label}.png")

        # Generate and save the CGR image
        generate_cgr_and_save(sequence, kmeres, image_filename)

In [11]:

def read_fasta_ARG(file_name):
    """Read a FASTA file into a list of records.

    Each record is a list: the header (without the leading '>') split on '|',
    followed by the record's full sequence as one string. Lines are stripped
    before use. Text before the first header, and any record whose header is
    empty, is dropped.
    """
    records = []
    header, chunks = '', []
    with open(file_name, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith('>'):
                chunks.append(text)
                continue
            # A new header closes the record collected so far (if any).
            if header:
                records.append([*header.split('|'), ''.join(chunks)])
            header, chunks = text[1:], []
        # Flush the final record of the file.
        if header:
            records.append([*header.split('|'), ''.join(chunks)])
    return records

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename):
    """Render CGR points as a grayscale density image and save it to a file.

    sequence_to_plot: sequence of (x, y) points, e.g. the output of easy_cgr.
    kmeres: the histogram uses 2 ** kmeres bins along each axis.
    output_filename: destination path handed to matplotlib's savefig.

    The figure is created as 1 x 1 inch at 100 dpi and saved with a tight
    bounding box and no padding.
    """
    bins_per_axis = 2 ** kmeres
    # The same edges are used for both axes of the [-1, 1] x [-1, 1] square.
    edges = np.linspace(-1, 1, bins_per_axis + 1)

    xs = [point[0] for point in sequence_to_plot]
    ys = [point[1] for point in sequence_to_plot]
    counts, _, _ = np.histogram2d(xs, ys, bins=[edges, edges])

    # White-to-dark-gray colormap with one level per possible count.
    shades = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    dpi = 100
    side_inches = 100 / dpi
    fig, ax = plt.subplots(figsize=(side_inches, side_inches), dpi=dpi)

    # `extent` fixes the data coordinates of the image, not its pixel size.
    ax.imshow(
        counts.T,
        cmap=shades,
        origin='lower',
        extent=[-1, 1, -1, 1],
        interpolation='nearest',
    )
    ax.axis('off')

    plt.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    plt.close()

# Batch helper: writes one CGR image per sequence, named by its label only
def generate_and_save_cgr_images_label(sequences, class_labels, output_dir, kmeres):
    """Write one CGR image per sequence into `output_dir`, named after its label.

    `sequences` and `class_labels` are paired by position (pairing stops at
    the shorter one). The directory is created if it is missing. Because the
    file name is the label alone, two sequences with the same label write to
    the same file.
    """
    os.makedirs(output_dir, exist_ok=True)

    for points, label in zip(sequences, class_labels):
        target_path = os.path.join(output_dir, f"{label}.png")
        generate_cgr_and_save(points, kmeres, target_path)

###  Patient 11

In [13]:
import pandas as pd

# Contigs FASTA for the Patient 11 sample (path relative to the working directory).
file_name = "Patient11.contigs.fa"
# Each parsed record is a list: the header split on '|', then the full sequence string.
fasta_sequences = read_fasta_ARG(file_name)

In [14]:
# Convert to DataFrame
# NOTE: the two-column layout only fits while no FASTA header contains '|'
# (read_fasta_ARG splits headers on '|', which would add extra fields per row).
patient_11 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [15]:
patient_11

Unnamed: 0,ids,seq
0,k127_34166 flag=1 multi=5.4219 len=2675,ATTGTAATTGTTTCCGCCGCTGTATTTGCAGTTTTCGCCCAGCAGG...
1,k127_119569 flag=1 multi=6.0000 len=3361,CCGCAATTGATGCCTTGGGGACTCTCTTCCCAAGTAGCGCCGACGA...
2,k127_13 flag=1 multi=6.0000 len=2747,TGCAAGCCCCTGCAACTTCAGCCGCTGTTCCGGTGCGGTCGCCACT...
3,k127_15 flag=1 multi=4.0000 len=2937,GCTTAAAGGCAGTCGGAGATGACGGAGAAGATTTAGGTGAAGTAAA...
4,k127_85416 flag=1 multi=7.0000 len=7362,AAAAGAAAGCGTTTTTAGCAGAATTGTCGAAAAGCTGAAAAAACGA...
...,...,...
10017,k127_51170 flag=0 multi=5.9709 len=3562,TCAACGCCTGCACTTTATCTGGCATCAAGTCAAAAGGCAGCCAAAG...
10018,k127_51190 flag=1 multi=6.0000 len=4703,ACGTATAATCAATCAAGATACAGTTAATTTAATAAAGAAAATTGTA...
10019,k127_51199 flag=1 multi=5.0000 len=4856,TGCCGTCTGCAGACTGCTCTGGGTGACAAGTATCTGGTAACCGAAG...
10020,k127_51221 flag=1 multi=6.0000 len=2732,CACAGTATAACGGCGCGCTTGGCGCAGCCCTGATGGCATATCAGGT...


In [16]:
# Define a function to extract the desired pattern
def extract_id_11(string):
    """Return the first 'k127_' + digits token found in `string`, with '_11' appended.

    Returns None when `string` contains no such token.
    """
    found = re.search(r'(k127_\d+)', string)
    return None if found is None else found.group(1) + '_11'

# Apply the function to the 'ids' column
# NOTE: this overwrites the raw header text in place; a header without a
# 'k127_' + digits token becomes None.
patient_11['ids'] = patient_11['ids'].apply(extract_id_11)



In [17]:
patient_11

Unnamed: 0,ids,seq
0,k127_34166_11,ATTGTAATTGTTTCCGCCGCTGTATTTGCAGTTTTCGCCCAGCAGG...
1,k127_119569_11,CCGCAATTGATGCCTTGGGGACTCTCTTCCCAAGTAGCGCCGACGA...
2,k127_13_11,TGCAAGCCCCTGCAACTTCAGCCGCTGTTCCGGTGCGGTCGCCACT...
3,k127_15_11,GCTTAAAGGCAGTCGGAGATGACGGAGAAGATTTAGGTGAAGTAAA...
4,k127_85416_11,AAAAGAAAGCGTTTTTAGCAGAATTGTCGAAAAGCTGAAAAAACGA...
...,...,...
10017,k127_51170_11,TCAACGCCTGCACTTTATCTGGCATCAAGTCAAAAGGCAGCCAAAG...
10018,k127_51190_11,ACGTATAATCAATCAAGATACAGTTAATTTAATAAAGAAAATTGTA...
10019,k127_51199_11,TGCCGTCTGCAGACTGCTCTGGGTGACAAGTATCTGGTAACCGAAG...
10020,k127_51221_11,CACAGTATAACGGCGCGCTTGGCGCAGCCCTGATGGCATATCAGGT...


In [18]:
# Rows of patient_11 whose 'seq' value repeats an earlier row
# (the first occurrence of each sequence is not flagged).
duplicates = patient_11.loc[patient_11.duplicated('seq')]

# An empty result means every sequence in the sample is unique.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [19]:
# Parallel lists: patient_ids[i] is the label of the sequence patient_list[i].
patient_ids = patient_11['ids'].tolist()
patient_list = patient_11['seq'].tolist()


In [20]:
len(patient_ids)


10022

In [21]:
len(patient_list)

10022

In [22]:
import time
start_time = time.time()  # timing covers ambiguity resolution, CGR computation and image writing
seq_num=[]


all_seq = patient_list
# Resolve ambiguity codes first: easy_cgr only accepts upper-case A/C/G/T.
# NOTE: the replacement is random and no seed is set in the visible cells,
# so the images can differ between runs.
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# One list of CGR (x, y) points per contig; all lists are held in memory at once.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_11_res_7"
kmeres=7  # the image histogram uses 2 ** 7 = 128 bins per axis

# Redundant with the os.makedirs(..., exist_ok=True) call inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one PNG per contig, named by its id; a repeated or None id reuses the same file name).
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 538.7846436500549 seconds


###  Patient 12

In [23]:
import pandas as pd

# Contigs FASTA for the Patient 12 sample (path relative to the working directory).
file_name = "Patient12.contigs.fa"
# Each parsed record is a list: the header split on '|', then the full sequence string.
fasta_sequences = read_fasta_ARG(file_name)

In [24]:
# Convert to DataFrame
# NOTE: the two-column layout only fits while no FASTA header contains '|'
# (read_fasta_ARG splits headers on '|', which would add extra fields per row).
patient_12 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [25]:
patient_12

Unnamed: 0,ids,seq
0,k127_83430 flag=1 multi=5.0000 len=2582,TCCTTTTTCGGTTTGCTTTGGGATATAATAATTTCCTTTTTCAGAA...
1,k127_33374 flag=0 multi=11.4901 len=4273,CCGTTCTTAACGCTCACGAGCCTTGTAAGGCTCGTGGCTAAACGGA...
2,k127_13 flag=0 multi=5.9726 len=2538,TGTAATGACACCAATGAAATATCATGAGCAATATACAAAAGCAGCA...
3,k127_19 flag=1 multi=4.9827 len=3827,GTCGTATATTGACGTCAACGTCCATCCGGCGAAGATGGAGATCCGT...
4,k127_66749 flag=1 multi=6.0000 len=4754,GCGTGACTCGGTCGAATCCCCGGAGGAAGTCGAATCCGCGCAGCTG...
...,...,...
8440,k127_83384 flag=1 multi=7.0000 len=2561,CATCGTTTAACTCTATTGTCAAAAAGCCATTTGCGTCTAATTCTTT...
8441,k127_83385 flag=1 multi=5.0000 len=2518,AATGCACACAGCACCGAGTGCCAAAAGCACCAGCAGAATCAGAACT...
8442,k127_83391 flag=0 multi=8.0000 len=3380,ATCTTACGACTGTCAAAGAATCCCAAACCTGCTGCATTGCTTACAT...
8443,k127_83411 flag=1 multi=4.0000 len=5426,GTGTTCTTCGTTTCTTTCCTGATGCCATACACGTTCCCTCTTTTCT...


In [26]:
# Define a function to extract the desired pattern
def extract_id_12(string):
    """Return the first 'k127_' + digits token found in `string`, with '_12' appended.

    Returns None when `string` contains no such token.
    """
    found = re.search(r'(k127_\d+)', string)
    return None if found is None else found.group(1) + '_12'

# Apply the function to the 'ids' column
# NOTE: this overwrites the raw header text in place; a header without a
# 'k127_' + digits token becomes None.
patient_12['ids'] = patient_12['ids'].apply(extract_id_12)



In [27]:
patient_12

Unnamed: 0,ids,seq
0,k127_83430_12,TCCTTTTTCGGTTTGCTTTGGGATATAATAATTTCCTTTTTCAGAA...
1,k127_33374_12,CCGTTCTTAACGCTCACGAGCCTTGTAAGGCTCGTGGCTAAACGGA...
2,k127_13_12,TGTAATGACACCAATGAAATATCATGAGCAATATACAAAAGCAGCA...
3,k127_19_12,GTCGTATATTGACGTCAACGTCCATCCGGCGAAGATGGAGATCCGT...
4,k127_66749_12,GCGTGACTCGGTCGAATCCCCGGAGGAAGTCGAATCCGCGCAGCTG...
...,...,...
8440,k127_83384_12,CATCGTTTAACTCTATTGTCAAAAAGCCATTTGCGTCTAATTCTTT...
8441,k127_83385_12,AATGCACACAGCACCGAGTGCCAAAAGCACCAGCAGAATCAGAACT...
8442,k127_83391_12,ATCTTACGACTGTCAAAGAATCCCAAACCTGCTGCATTGCTTACAT...
8443,k127_83411_12,GTGTTCTTCGTTTCTTTCCTGATGCCATACACGTTCCCTCTTTTCT...


In [28]:
# Rows of patient_12 whose 'seq' value repeats an earlier row
# (the first occurrence of each sequence is not flagged).
duplicates = patient_12.loc[patient_12.duplicated('seq')]

# An empty result means every sequence in the sample is unique.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [29]:
# Parallel lists: patient_ids[i] is the label of the sequence patient_list[i].
# These names are rebound here and now refer to the Patient 12 sample.
patient_ids = patient_12['ids'].tolist()
patient_list = patient_12['seq'].tolist()


In [30]:
len(patient_ids)


8445

In [31]:
len(patient_list)

8445

In [32]:
import time
start_time = time.time()  # timing covers ambiguity resolution, CGR computation and image writing
seq_num=[]


all_seq = patient_list
# Resolve ambiguity codes first: easy_cgr only accepts upper-case A/C/G/T.
# NOTE: the replacement is random and no seed is set in the visible cells,
# so the images can differ between runs.
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# One list of CGR (x, y) points per contig; all lists are held in memory at once.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_12_res_7"
kmeres=7  # the image histogram uses 2 ** 7 = 128 bins per axis

# Redundant with the os.makedirs(..., exist_ok=True) call inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one PNG per contig, named by its id; a repeated or None id reuses the same file name).
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 778.8075883388519 seconds


###  Patient 13

In [33]:
import pandas as pd

# Contigs FASTA for the Patient 13 sample (path relative to the working directory).
file_name = "Patient13.contigs.fa"
# Each parsed record is a list: the header split on '|', then the full sequence string.
fasta_sequences = read_fasta_ARG(file_name)

In [34]:
# Convert to DataFrame
# NOTE: the two-column layout only fits while no FASTA header contains '|'
# (read_fasta_ARG splits headers on '|', which would add extra fields per row).
patient_13 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [35]:
patient_13

Unnamed: 0,ids,seq
0,k127_94786 flag=1 multi=11.9917 len=13354,ACCGTAGGGAACGCAGGCCGGTTGCCCCGCGGGACGCAGGAAATTC...
1,k127_75842 flag=1 multi=7.0000 len=15693,CGAAGGGCAAACATTATACGAATATCATCCCGATACAAAAGAATGT...
2,k127_18968 flag=1 multi=10.0000 len=12597,ACATGACCCCTTGGAAATGGGATTTGCAGACCGGCCTCCTCTCGTG...
3,k127_75844 flag=1 multi=5.0000 len=3131,GCGTAGTTCTCGAGCTGGTACTGCACATCGGAAAGCTGGCTCTCGA...
4,k127_75846 flag=1 multi=9.0000 len=4601,CAATTTCCTCTTTAAAAGCCCTCTTATTTTAAATAGTGAGAATCAC...
...,...,...
12048,k127_18913 flag=1 multi=8.0000 len=5401,CGGCATTCGCCGCGTGGCCTGCACTTTGCTGCGCAAAGATGCATGG...
12049,k127_18918 flag=0 multi=4.9853 len=2844,GCTTCGTTCTCTAAAGCTTGTTACTTATAATTTTGCAGGATTTACA...
12050,k127_18921 flag=1 multi=8.0000 len=2610,GACCAGCACCGGGTGGGTGAGGCCCGTCATATCCAGCTCGAACAGG...
12051,k127_18925 flag=1 multi=10.0000 len=4422,AAAGATAATATAAAAAGCTCTCCAGGGAATTTCCCTGGGGAGCTTA...


In [36]:
# Define a function to extract the desired pattern
def extract_id_13(string):
    """Return the first 'k127_' + digits token found in `string`, with '_13' appended.

    Returns None when `string` contains no such token.
    """
    found = re.search(r'(k127_\d+)', string)
    return None if found is None else found.group(1) + '_13'

# Apply the function to the 'ids' column
# NOTE: this overwrites the raw header text in place; a header without a
# 'k127_' + digits token becomes None.
patient_13['ids'] = patient_13['ids'].apply(extract_id_13)



In [37]:
patient_13

Unnamed: 0,ids,seq
0,k127_94786_13,ACCGTAGGGAACGCAGGCCGGTTGCCCCGCGGGACGCAGGAAATTC...
1,k127_75842_13,CGAAGGGCAAACATTATACGAATATCATCCCGATACAAAAGAATGT...
2,k127_18968_13,ACATGACCCCTTGGAAATGGGATTTGCAGACCGGCCTCCTCTCGTG...
3,k127_75844_13,GCGTAGTTCTCGAGCTGGTACTGCACATCGGAAAGCTGGCTCTCGA...
4,k127_75846_13,CAATTTCCTCTTTAAAAGCCCTCTTATTTTAAATAGTGAGAATCAC...
...,...,...
12048,k127_18913_13,CGGCATTCGCCGCGTGGCCTGCACTTTGCTGCGCAAAGATGCATGG...
12049,k127_18918_13,GCTTCGTTCTCTAAAGCTTGTTACTTATAATTTTGCAGGATTTACA...
12050,k127_18921_13,GACCAGCACCGGGTGGGTGAGGCCCGTCATATCCAGCTCGAACAGG...
12051,k127_18925_13,AAAGATAATATAAAAAGCTCTCCAGGGAATTTCCCTGGGGAGCTTA...


In [38]:
# Rows of patient_13 whose 'seq' value repeats an earlier row
# (the first occurrence of each sequence is not flagged).
duplicates = patient_13.loc[patient_13.duplicated('seq')]

# An empty result means every sequence in the sample is unique.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [39]:
# Parallel lists: patient_ids[i] is the label of the sequence patient_list[i].
# These names are rebound here and now refer to the Patient 13 sample.
patient_ids = patient_13['ids'].tolist()
patient_list = patient_13['seq'].tolist()


In [40]:
len(patient_ids)


12053

In [41]:
len(patient_list)

12053

In [42]:
import time
start_time = time.time()  # timing covers ambiguity resolution, CGR computation and image writing
seq_num=[]


all_seq = patient_list
# Resolve ambiguity codes first: easy_cgr only accepts upper-case A/C/G/T.
# NOTE: the replacement is random and no seed is set in the visible cells,
# so the images can differ between runs.
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# One list of CGR (x, y) points per contig; all lists are held in memory at once.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_13_res_7"
kmeres=7  # the image histogram uses 2 ** 7 = 128 bins per axis

# Redundant with the os.makedirs(..., exist_ok=True) call inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one PNG per contig, named by its id; a repeated or None id reuses the same file name).
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 1753.0926496982574 seconds


###  Healthy 11

In [43]:
import pandas as pd

# Contigs FASTA for the Healthy 11 sample (path relative to the working directory).
file_name = "Healthy11.contigs.fa"
# Each parsed record is a list: the header split on '|', then the full sequence string.
fasta_sequences = read_fasta_ARG(file_name)

In [44]:
# Convert to DataFrame
# NOTE: the two-column layout only fits while no FASTA header contains '|'
# (read_fasta_ARG splits headers on '|', which would add extra fields per row).
healthy_11 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [45]:
healthy_11

Unnamed: 0,ids,seq
0,k127_108586 flag=1 multi=7.8087 len=3541,GATATCAGCAGTAAGCACTTGTATGGAAAGTTTACTTGGAAAAAAG...
1,k127_21722 flag=0 multi=24.8801 len=2754,TCCTATAATTTTATTATTTTCTTTAGCTATAAACCCAAATGAACTG...
2,k127_130308 flag=1 multi=4.0000 len=4024,GCACTACGGCAACGAAAGTATCGGCAGTAAAATCGACAGCTTCTGG...
3,k127_152028 flag=1 multi=5.0000 len=4291,TATTTTGCGTACACTTGATATTCATATGCAACACCCGATCATGGAC...
4,k127_21727 flag=0 multi=47.0000 len=3185,ATTTTAGCTGTCCCTCCGCACGAGGAAAGTAGGATGGCGGAGAATG...
...,...,...
14507,k127_21680 flag=1 multi=8.0000 len=6233,TGTTATTAAGTTCCGCAATATCGTCATCACAAACAGCAATTTTCAT...
14508,k127_21681 flag=1 multi=5.0000 len=4555,CTCGCAACCTTCATGATGATGTCGCATGATGTACTCCTGTTTTATT...
14509,k127_21700 flag=1 multi=7.0000 len=17799,ACCTATACCCGGCAAGGTCTTGAAATAAACGTAGAAAACTGCTATG...
14510,k127_21706 flag=1 multi=8.0000 len=8721,TCTTGGTCTCTGTTATAATACTTCTAAATATTTTGGTTATCAACAA...


In [46]:
# Define a function to extract the desired pattern
def extract_id_11(string):
    """Return the first 'k127_' + digits token found in `string`, with '_11' appended.

    Returns None when `string` contains no such token. The suffix carries only
    the sample number, not whether the sample is a patient or a healthy one.
    """
    found = re.search(r'(k127_\d+)', string)
    return None if found is None else found.group(1) + '_11'

# Apply the function to the 'ids' column
# NOTE: this overwrites the raw header text in place; a header without a
# 'k127_' + digits token becomes None.
healthy_11['ids'] = healthy_11['ids'].apply(extract_id_11)



In [47]:
healthy_11

Unnamed: 0,ids,seq
0,k127_108586_11,GATATCAGCAGTAAGCACTTGTATGGAAAGTTTACTTGGAAAAAAG...
1,k127_21722_11,TCCTATAATTTTATTATTTTCTTTAGCTATAAACCCAAATGAACTG...
2,k127_130308_11,GCACTACGGCAACGAAAGTATCGGCAGTAAAATCGACAGCTTCTGG...
3,k127_152028_11,TATTTTGCGTACACTTGATATTCATATGCAACACCCGATCATGGAC...
4,k127_21727_11,ATTTTAGCTGTCCCTCCGCACGAGGAAAGTAGGATGGCGGAGAATG...
...,...,...
14507,k127_21680_11,TGTTATTAAGTTCCGCAATATCGTCATCACAAACAGCAATTTTCAT...
14508,k127_21681_11,CTCGCAACCTTCATGATGATGTCGCATGATGTACTCCTGTTTTATT...
14509,k127_21700_11,ACCTATACCCGGCAAGGTCTTGAAATAAACGTAGAAAACTGCTATG...
14510,k127_21706_11,TCTTGGTCTCTGTTATAATACTTCTAAATATTTTGGTTATCAACAA...


In [48]:
# Rows of healthy_11 whose 'seq' value repeats an earlier row
# (the first occurrence of each sequence is not flagged).
duplicates = healthy_11.loc[healthy_11.duplicated('seq')]

# An empty result means every sequence in the sample is unique.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [49]:
# Parallel lists: healthy_ids[i] is the label of the sequence healthy_list[i].
healthy_ids = healthy_11['ids'].tolist()
healthy_list = healthy_11['seq'].tolist()


In [50]:
len(healthy_ids)


14512

In [51]:
len(healthy_list)

14512

In [52]:
import time
start_time = time.time()  # timing covers ambiguity resolution, CGR computation and image writing
seq_num=[]


all_seq = healthy_list
# Resolve ambiguity codes first: easy_cgr only accepts upper-case A/C/G/T.
# NOTE: the replacement is random and no seed is set in the visible cells,
# so the images can differ between runs.
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# One list of CGR (x, y) points per contig; all lists are held in memory at once.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = healthy_ids 
output_directory = "healthy_11_res_7"
kmeres=7  # the image histogram uses 2 ** 7 = 128 bins per axis

# Redundant with the os.makedirs(..., exist_ok=True) call inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one PNG per contig, named by its id; a repeated or None id reuses the same file name).
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 2226.3298904895782 seconds


###  Healthy 12

In [53]:
import pandas as pd

# Contigs FASTA for the Healthy 12 sample (path relative to the working directory).
file_name = "Healthy12.contigs.fa"
# Each parsed record is a list: the header split on '|', then the full sequence string.
fasta_sequences = read_fasta_ARG(file_name)

In [54]:
# Convert to DataFrame
# NOTE: the two-column layout only fits while no FASTA header contains '|'
# (read_fasta_ARG splits headers on '|', which would add extra fields per row).
healthy_12 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [55]:
healthy_12

Unnamed: 0,ids,seq
0,k127_47088 flag=1 multi=6.0000 len=2549,CCATATTAGTAGTAACAAAGATCCTGCCCATTGATGTCAGGACATC...
1,k127_0 flag=0 multi=29.7077 len=3103,GTTATCCAGCAATCAAAGCTGCCTTTTTCTTGTAGTTTGCATATAA...
2,k127_13454 flag=1 multi=5.0000 len=4358,GTCTTTCAACGTACAAATCTGGAACACATTAGGATTATCCCAACCT...
3,k127_26911 flag=0 multi=20.9616 len=5337,TCCTACTTGAAGGGCGAGTTCCTGATATGAAAATTCGCTTGTTAAG...
4,k127_20189 flag=1 multi=7.0000 len=2639,TCCCTGTCCCGCCCCGATCTGGAATCCAGAACAGGCGCAGAGAGCA...
...,...,...
7157,k127_6677 flag=0 multi=25.9978 len=111918,CTTTGTGTGGCCTGCACCTTTGCCGTTCTTCCCGTTTGCGCCACAC...
7158,k127_6683 flag=0 multi=25.9831 len=29633,ATAAAAAGCCGTTCCTCCCGGCATATAGACAGTGTTTTGGGGAGTG...
7159,k127_6697 flag=1 multi=8.0000 len=16805,GGCTCCACCTGGAGGGACGGCCGGGTGCGGTCGGCCTCCAGAGTGG...
7160,k127_6711 flag=0 multi=9.9879 len=15262,GCTTGGATTTGCTGTAACCAGCTTGGTCGTCGAAATTTGACACCAC...


In [56]:
# Define a function to extract the desired pattern
def extract_id_12(string):
    """Return the first 'k127_' + digits token found in `string`, with '_12' appended.

    Returns None when `string` contains no such token. The suffix carries only
    the sample number, not whether the sample is a patient or a healthy one.
    """
    found = re.search(r'(k127_\d+)', string)
    return None if found is None else found.group(1) + '_12'

# Apply the function to the 'ids' column
# NOTE: this overwrites the raw header text in place; a header without a
# 'k127_' + digits token becomes None.
healthy_12['ids'] = healthy_12['ids'].apply(extract_id_12)



In [57]:
healthy_12

Unnamed: 0,ids,seq
0,k127_47088_12,CCATATTAGTAGTAACAAAGATCCTGCCCATTGATGTCAGGACATC...
1,k127_0_12,GTTATCCAGCAATCAAAGCTGCCTTTTTCTTGTAGTTTGCATATAA...
2,k127_13454_12,GTCTTTCAACGTACAAATCTGGAACACATTAGGATTATCCCAACCT...
3,k127_26911_12,TCCTACTTGAAGGGCGAGTTCCTGATATGAAAATTCGCTTGTTAAG...
4,k127_20189_12,TCCCTGTCCCGCCCCGATCTGGAATCCAGAACAGGCGCAGAGAGCA...
...,...,...
7157,k127_6677_12,CTTTGTGTGGCCTGCACCTTTGCCGTTCTTCCCGTTTGCGCCACAC...
7158,k127_6683_12,ATAAAAAGCCGTTCCTCCCGGCATATAGACAGTGTTTTGGGGAGTG...
7159,k127_6697_12,GGCTCCACCTGGAGGGACGGCCGGGTGCGGTCGGCCTCCAGAGTGG...
7160,k127_6711_12,GCTTGGATTTGCTGTAACCAGCTTGGTCGTCGAAATTTGACACCAC...


In [58]:
# Rows of healthy_12 whose 'seq' value repeats an earlier row
# (the first occurrence of each sequence is not flagged).
duplicates = healthy_12.loc[healthy_12.duplicated('seq')]

# An empty result means every sequence in the sample is unique.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [59]:
# Parallel lists: healthy_ids[i] is the label of the sequence healthy_list[i].
# These names are rebound here and now refer to the Healthy 12 sample.
healthy_ids = healthy_12['ids'].tolist()
healthy_list = healthy_12['seq'].tolist()


In [60]:
len(healthy_ids)


7162

In [61]:
len(healthy_list)

7162

In [62]:
import time
start_time = time.time()  # timing covers ambiguity resolution, CGR computation and image writing
seq_num=[]


all_seq = healthy_list
# Resolve ambiguity codes first: easy_cgr only accepts upper-case A/C/G/T.
# NOTE: the replacement is random and no seed is set in the visible cells,
# so the images can differ between runs.
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# One list of CGR (x, y) points per contig; all lists are held in memory at once.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = healthy_ids 
output_directory = "healthy_12_res_7"
kmeres=7  # the image histogram uses 2 ** 7 = 128 bins per axis

# Redundant with the os.makedirs(..., exist_ok=True) call inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one PNG per contig, named by its id; a repeated or None id reuses the same file name).
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 854.9261989593506 seconds


###  Healthy 13

In [63]:
import pandas as pd

# Contigs FASTA for the Healthy 13 sample (path relative to the working directory).
file_name = "Healthy13.contigs.fa"
# Each parsed record is a list: the header split on '|', then the full sequence string.
fasta_sequences = read_fasta_ARG(file_name)

In [64]:
# Convert to DataFrame
# NOTE: the two-column layout only fits while no FASTA header contains '|'
# (read_fasta_ARG splits headers on '|', which would add extra fields per row).
healthy_13 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [65]:
healthy_13

Unnamed: 0,ids,seq
0,k127_30735 flag=1 multi=5.0000 len=3759,CCTTAGAGAGAAATACAAATTTTGGACTGTATTTTTTGCGTGATAA...
1,k127_3 flag=0 multi=6.8107 len=3899,AAAGTAATCCGTATCATCTGCATTCCTTTTTAAGCAAGATCTGCTT...
2,k127_92205 flag=1 multi=6.0000 len=5165,AGGCGTGCCTCTTCAACCACTCGAGCACCTCTCCTTATGAGTTTTC...
3,k127_107570 flag=1 multi=6.0000 len=10687,ATGCGATATGCTCCAAAGTCAGGCCCTGCGGCGCAATCGGCCCGCT...
4,k127_30738 flag=0 multi=11.9817 len=3727,GCCGCGCGCAAGTTTTACCCGCCTTGGGCGGGCAAAACTCACACTT...
...,...,...
12266,k127_15339 flag=1 multi=5.0000 len=6628,GGTACGCATCACGTCCGGTTATTTTACCGGGATGGAGGCCGAGGTG...
12267,k127_15350 flag=0 multi=24.9607 len=12331,CGGTCAACGCAGCCGTGGAGCTGGCCTGCGCCAAGAAGATCATCAC...
12268,k127_15351 flag=0 multi=7.9801 len=3640,CCTTGGAAAGGTGATTGTGCTATAAGCACCTGCCCACCACCCGCAT...
12269,k127_15355 flag=1 multi=5.0000 len=4734,GTCACGAACATGCCGCGCGTGAGTGCGTCGTCCGGGGCGAAGAGCC...


In [66]:
# Define a function to extract the desired pattern
def extract_id_13(string):
    """Return the first 'k127_' + digits token found in `string`, with '_13' appended.

    Returns None when `string` contains no such token. The suffix carries only
    the sample number, not whether the sample is a patient or a healthy one.
    """
    found = re.search(r'(k127_\d+)', string)
    return None if found is None else found.group(1) + '_13'

# Apply the function to the 'ids' column
# NOTE: this overwrites the raw header text in place; a header without a
# 'k127_' + digits token becomes None.
healthy_13['ids'] = healthy_13['ids'].apply(extract_id_13)



In [67]:
healthy_13

Unnamed: 0,ids,seq
0,k127_30735_13,CCTTAGAGAGAAATACAAATTTTGGACTGTATTTTTTGCGTGATAA...
1,k127_3_13,AAAGTAATCCGTATCATCTGCATTCCTTTTTAAGCAAGATCTGCTT...
2,k127_92205_13,AGGCGTGCCTCTTCAACCACTCGAGCACCTCTCCTTATGAGTTTTC...
3,k127_107570_13,ATGCGATATGCTCCAAAGTCAGGCCCTGCGGCGCAATCGGCCCGCT...
4,k127_30738_13,GCCGCGCGCAAGTTTTACCCGCCTTGGGCGGGCAAAACTCACACTT...
...,...,...
12266,k127_15339_13,GGTACGCATCACGTCCGGTTATTTTACCGGGATGGAGGCCGAGGTG...
12267,k127_15350_13,CGGTCAACGCAGCCGTGGAGCTGGCCTGCGCCAAGAAGATCATCAC...
12268,k127_15351_13,CCTTGGAAAGGTGATTGTGCTATAAGCACCTGCCCACCACCCGCAT...
12269,k127_15355_13,GTCACGAACATGCCGCGCGTGAGTGCGTCGTCCGGGGCGAAGAGCC...


In [68]:
# Rows of healthy_13 whose 'seq' value repeats an earlier row
# (the first occurrence of each sequence is not flagged).
duplicates = healthy_13.loc[healthy_13.duplicated('seq')]

# An empty result means every sequence in the sample is unique.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [69]:
# Parallel lists: healthy_ids[i] is the label of the sequence healthy_list[i].
# These names are rebound here and now refer to the Healthy 13 sample.
healthy_ids = healthy_13['ids'].tolist()
healthy_list = healthy_13['seq'].tolist()


In [70]:
len(healthy_ids)


12271

In [71]:
len(healthy_list)

12271

In [72]:
import time
start_time = time.time()  # timing covers ambiguity resolution, CGR computation and image writing
seq_num=[]


all_seq = healthy_list
# Resolve ambiguity codes first: easy_cgr only accepts upper-case A/C/G/T.
# NOTE: the replacement is random and no seed is set in the visible cells,
# so the images can differ between runs.
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# One list of CGR (x, y) points per contig; all lists are held in memory at once.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = healthy_ids 
output_directory = "healthy_13_res_7"
kmeres=7  # the image histogram uses 2 ** 7 = 128 bins per axis

# Redundant with the os.makedirs(..., exist_ok=True) call inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one PNG per contig, named by its id; a repeated or None id reuses the same file name).
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 1398.8144624233246 seconds
