In [1]:
import pandas as pd
import os
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import load_model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc




In [2]:
import random

def replace_ambiguous_bases(sequence):
    """Return ``sequence`` with each IUPAC ambiguity code swapped for a concrete base.

    Every R, Y, S, W, K, M, B, D, H, V or N is replaced by one base picked with
    ``random.choice`` from the bases that code can stand for. Any other
    character is copied through untouched. Matching is case-sensitive, so
    lowercase symbols are left as they are. The pick uses the global ``random``
    state, so the output varies between runs unless that state is seeded.
    """
    # Candidate bases per ambiguity code. The order inside each list is
    # preserved so a seeded run draws exactly the same bases.
    candidates = {
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'S': ['G', 'C'],
        'W': ['A', 'T'],
        'K': ['G', 'T'],
        'M': ['A', 'C'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'T', 'C', 'G'],
    }

    return ''.join(
        random.choice(candidates[symbol]) if symbol in candidates else symbol
        for symbol in sequence
    )

In [3]:
def reverse_complement(sequence):
    """Return the reverse complement of a DNA string.

    Only uppercase A, C, G and T are present in the pairing table; any other
    character raises ``KeyError``.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Walk the sequence back to front, swapping each base for its partner.
    return ''.join(pairing[base] for base in reversed(sequence))

def reverse_complement_list(sequence_list):
    """Return a new list holding the reverse complement of every sequence, in order."""
    return [reverse_complement(entry) for entry in sequence_list]

In [4]:
#This function gets the cgr for the sequences
def easy_cgr(dna_sequence):
    """Compute the chaos-game-representation trajectory of a DNA sequence.

    Starting at (0, 0), each base moves the current point halfway towards that
    base's corner of the [-1, 1] x [-1, 1] square (A: top-right, T: bottom-left,
    C: bottom-right, G: top-left). The point reached after every base is
    recorded.

    Returns a list of ``(x, y)`` tuples, one per base. A character other than
    uppercase A, T, C or G raises ``KeyError``.
    """
    corners = {"A": (1, 1), "T": (-1, -1), "C": (1, -1), "G": (-1, 1)}
    trajectory = []
    x, y = 0, 0
    for base in dna_sequence:
        corner_x, corner_y = corners[base]
        # Midpoint between the current position and the base's corner.
        x, y = (x + corner_x) / 2, (y + corner_y) / 2
        trajectory.append((x, y))
    return trajectory

In [5]:
#This function has input:
#sequence to plot that is the easy_cgr(sequence)
# plot id that is the name
# resolution that is the resolution
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
    """Show a CGR density plot for one sequence.

    sequence_to_plot: list of (x, y) points, e.g. the output of easy_cgr.
    id_to_plot: name appended to the figure title.
    resolution: number of histogram bins along each axis.
    """
    # Bin the points on a resolution x resolution grid covering [-1, 1]^2.
    x_coords = [point[0] for point in sequence_to_plot]
    y_coords = [point[1] for point in sequence_to_plot]
    bin_edges = np.linspace(-1, 1, resolution + 1)
    counts = np.histogram2d(x_coords, y_coords, bins=[bin_edges, bin_edges])[0]

    # Colormap running from white (empty bins) to dark gray, with one level
    # per possible count.
    white_to_gray = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    plt.figure()
    plt.imshow(counts.T, cmap=white_to_gray, extent=[-1, 1, -1, 1], origin='lower', interpolation='nearest', aspect='auto')
    plt.xticks([])
    plt.yticks([])

    # Nucleotide letters sit just outside the corner each base pulls towards.
    offset = 0.05
    corner_labels = (
        ("A", 1 + offset, 1 + offset),
        ("T", -1 - offset, -1 - offset),
        ("C", 1 + offset, -1 - offset),
        ("G", -1 - offset, 1 + offset),
    )
    for letter, x, y in corner_labels:
        plt.text(x, y, letter, ha='center', va='center', fontsize=15, color='black')

    plt.title('CGR for '+id_to_plot)
    plt.grid(False)
    plt.show()

In [6]:
# This function reads the FASTA file when using antibiotic resistance genes
import re

def read_antibiotic_resistance_genes(fasta_file_path):
    """Read a FASTA file and pull the first ``[...]`` term out of every header.

    Args:
        fasta_file_path: Path of the FASTA file to read.

    Returns:
        A ``(sequences, terms_inside_brackets)`` pair of equally long lists.
        ``sequences[i]`` is record i's sequence with its lines joined, and
        ``terms_inside_brackets[i]`` is the text inside the first pair of
        square brackets of that record's header, or ``None`` when the header
        has no bracketed term.

    Raises:
        ValueError: If non-blank text appears before the first ``>`` header.
    """
    # Matches the text inside the first pair of square brackets.
    bracket_pattern = re.compile(r'\[([^]]+)\]')

    chunks_per_record = []  # one list of sequence lines per record
    terms_inside_brackets = []

    with open(fasta_file_path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no data, wherever they appear.
                continue

            if line.startswith(">"):
                # Extract the term right away so both lists stay aligned and
                # a header without brackets cannot crash the whole read.
                match = bracket_pattern.search(line[1:])
                terms_inside_brackets.append(match.group(1) if match else None)
                chunks_per_record.append([])
            elif chunks_per_record:
                chunks_per_record[-1].append(line)
            else:
                raise ValueError(
                    f"Sequence data found before the first '>' header in {fasta_file_path!r}"
                )

    sequences = ["".join(chunks) for chunks in chunks_per_record]
    return sequences, terms_inside_brackets

In [7]:
from Bio import SeqIO

def read_fasta_patients(file_path):
    """Parse a FASTA file with Biopython and return ``(sequences, sequence_ids)``.

    Both lists are in file order; ``sequences`` holds each record's sequence as
    a plain string and ``sequence_ids`` holds each record's ``id``. When the
    file does not exist a message is printed and two empty lists are returned.
    """
    ids_found = []
    seqs_found = []

    try:
        with open(file_path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                ids_found.append(record.id)
                seqs_found.append(str(record.seq))
    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")

    return seqs_found, ids_found


In [8]:
#This function has input:
#sequence to plot that is the easy_cgr(sequence)
# plot id that is the name
# resolution that is the resolution
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
    """Show a CGR density plot for one sequence.

    sequence_to_plot: list of (x, y) points, e.g. the output of easy_cgr.
    id_to_plot: name appended to the figure title.
    resolution: number of histogram bins along each axis.

    NOTE: the In [5] cell already defines plotting_cgr with the same
    behaviour; this later definition is the one that stays bound.
    """
    # Bin the points on a resolution x resolution grid covering [-1, 1]^2.
    x_coords = [point[0] for point in sequence_to_plot]
    y_coords = [point[1] for point in sequence_to_plot]
    bin_edges = np.linspace(-1, 1, resolution + 1)
    counts = np.histogram2d(x_coords, y_coords, bins=[bin_edges, bin_edges])[0]

    # Colormap running from white (empty bins) to dark gray, with one level
    # per possible count.
    white_to_gray = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    plt.figure()
    plt.imshow(counts.T, cmap=white_to_gray, extent=[-1, 1, -1, 1], origin='lower', interpolation='nearest', aspect='auto')
    plt.xticks([])
    plt.yticks([])

    # Nucleotide letters sit just outside the corner each base pulls towards.
    offset = 0.05
    corner_labels = (
        ("A", 1 + offset, 1 + offset),
        ("T", -1 - offset, -1 - offset),
        ("C", 1 + offset, -1 - offset),
        ("G", -1 - offset, 1 + offset),
    )
    for letter, x, y in corner_labels:
        plt.text(x, y, letter, ha='center', va='center', fontsize=15, color='black')

    plt.title('CGR for '+id_to_plot)
    plt.grid(False)
    plt.show()


In [9]:
import random

def replace_ambiguous_bases(sequence):
    """Return ``sequence`` with each IUPAC ambiguity code swapped for a concrete base.

    Every R, Y, S, W, K, M, B, D, H, V or N is replaced by one base picked with
    ``random.choice`` from the bases that code can stand for. Any other
    character is copied through untouched. Matching is case-sensitive. The
    pick uses the global ``random`` state, so the output varies between runs
    unless that state is seeded.

    NOTE: the In [2] cell already defines this function with the same
    behaviour; this later definition is the one that stays bound.
    """
    # Candidate bases per ambiguity code. The order inside each list is
    # preserved so a seeded run draws exactly the same bases.
    candidates = {
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'S': ['G', 'C'],
        'W': ['A', 'T'],
        'K': ['G', 'T'],
        'M': ['A', 'C'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'T', 'C', 'G'],
    }

    return ''.join(
        random.choice(candidates[symbol]) if symbol in candidates else symbol
        for symbol in sequence
    )

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename):
    """Render the CGR density of one sequence and write it to an image file.

    sequence_to_plot: list of (x, y) points, e.g. the output of easy_cgr.
    kmeres: the grid has 2 ** kmeres bins along each axis.
    output_filename: path handed to matplotlib's savefig.

    NOTE: a later cell (In [12]) defines generate_cgr_and_save again with a
    100 / dpi figure; once that cell runs it replaces this 84 / dpi version.
    """
    # Bin the points on a square grid covering [-1, 1] x [-1, 1].
    edges = np.linspace(-1, 1, 2 ** kmeres + 1)
    counts = np.histogram2d(
        [pt[0] for pt in sequence_to_plot],
        [pt[1] for pt in sequence_to_plot],
        bins=[edges, edges],
    )[0]

    # White for empty bins through to dark gray, one level per possible count.
    shades = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    # Square canvas of 84 / dpi inches per side.
    # NOTE(review): bbox_inches='tight' may crop the saved area, so the final
    # pixel size is not necessarily 84 x 84 - confirm on a saved file.
    dpi = 100
    side_inches = 84 / dpi
    fig, ax = plt.subplots(figsize=(side_inches, side_inches), dpi=dpi)

    # extent only sets the data coordinates of the image edges.
    ax.imshow(
        counts.T,
        cmap=shades,
        origin='lower',
        extent=[-1, 1, -1, 1],
        interpolation='nearest',
    )
    ax.axis('off')

    plt.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    plt.close()

# Batch helper: renders and saves one CGR image per sequence
def generate_and_save_cgr_images(sequences, ids, output_dir, kmeres):
    """Save one CGR image per sequence as ``<output_dir>/<index>_<id>.png``.

    Args:
        sequences: CGR point lists, one per sequence (e.g. easy_cgr output).
        ids: Labels paired positionally with ``sequences``; iteration stops at
            the shorter of the two.
        output_dir: Directory to write into; created if missing. An empty
            string means the current working directory.
        kmeres: Passed to generate_cgr_and_save (2 ** kmeres bins per axis).
    """
    # Create the output directory if it doesn't exist. The guard keeps the
    # empty-string case working, since os.makedirs('') raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i, (sequence, label) in enumerate(zip(sequences, ids)):
        # The index prefix keeps filenames unique even when labels repeat.
        image_filename = os.path.join(output_dir, f"{i}_{label}.png")

        # Generate and save the CGR image
        generate_cgr_and_save(sequence, kmeres, image_filename)

In [11]:

def read_fasta_ARG(file_name):
    """Read a FASTA file into a list of ``[header fields..., sequence]`` rows.

    Each header (without the leading ``>``) is split on ``|`` and the joined
    sequence is appended as the last element. Text that comes before the first
    header, or after a header that is empty, is discarded.
    """
    records = []
    header = ''
    chunks = []

    def flush():
        # Only records with a non-empty header are emitted.
        if header:
            records.append(header.split('|') + [''.join(chunks)])

    with open(file_name, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                flush()
                header = text[1:]
                chunks = []
            else:
                chunks.append(text)
        flush()

    return records

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename):
    """Render the CGR density of one sequence and write it to an image file.

    sequence_to_plot: list of (x, y) points, e.g. the output of easy_cgr.
    kmeres: the grid has 2 ** kmeres bins along each axis.
    output_filename: path handed to matplotlib's savefig.

    NOTE: this re-defines the generate_cgr_and_save from the In [10] cell; the
    only difference is the figure size (100 / dpi here, 84 / dpi there).
    """
    # Bin the points on a square grid covering [-1, 1] x [-1, 1].
    edges = np.linspace(-1, 1, 2 ** kmeres + 1)
    counts = np.histogram2d(
        [pt[0] for pt in sequence_to_plot],
        [pt[1] for pt in sequence_to_plot],
        bins=[edges, edges],
    )[0]

    # White for empty bins through to dark gray, one level per possible count.
    shades = LinearSegmentedColormap.from_list(
        "custom_gray",
        [(1, 1, 1), (0.2, 0.2, 0.2)],
        N=len(sequence_to_plot) + 1,
    )

    # Square canvas of 100 / dpi inches per side.
    # NOTE(review): bbox_inches='tight' may crop the saved area, so the final
    # pixel size is not necessarily 100 x 100 - confirm on a saved file.
    dpi = 100
    side_inches = 100 / dpi
    fig, ax = plt.subplots(figsize=(side_inches, side_inches), dpi=dpi)

    # extent only sets the data coordinates of the image edges.
    ax.imshow(
        counts.T,
        cmap=shades,
        origin='lower',
        extent=[-1, 1, -1, 1],
        interpolation='nearest',
    )
    ax.axis('off')

    plt.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    plt.close()

# Batch helper: renders and saves one CGR image per sequence, named after its label
def generate_and_save_cgr_images_label(sequences, class_labels, output_dir, kmeres):
    """Save one CGR image per sequence as ``<output_dir>/<label>.png``.

    ``sequences`` (CGR point lists) and ``class_labels`` are paired
    positionally. The filename is built from the label alone, so two equal
    labels write to the same file. ``output_dir`` is created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for cgr_points, label in zip(sequences, class_labels):
        target_path = os.path.join(output_dir, f"{label}.png")
        generate_cgr_and_save(cgr_points, kmeres, target_path)

###  Patient 1

In [13]:
import pandas as pd

# Contig FASTA for patient 1. read_fasta_ARG returns one list per record:
# the header split on '|' followed by the joined sequence.
file_name = "Patient1.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [14]:
# Convert to DataFrame.
# The two column names only fit rows of the form [header, sequence], i.e.
# headers that contained no '|' for read_fasta_ARG to split on.
patient_1 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [15]:
patient_1

Unnamed: 0,ids,seq
0,k127_137511 flag=1 multi=6.0000 len=2500,AGATCGCAGTCATCACCGATCGGGACAACTGCGTGGCCGTCACCGG...
1,k127_34380 flag=1 multi=10.0000 len=3007,ACTTTGCGGCGAAGGTGCCCGCACGGCGCATCAAAGTGTCTAGGCC...
2,k127_133694 flag=1 multi=9.0000 len=2536,CGGCACGCTGACAGAAATCCAGAAGCATGACATACTGGAGCTGATT...
3,k127_145156 flag=1 multi=50.9495 len=3571,TTGAGTTTTTAAATTTATAAATTTAAAGTTTTTCTATTATGAACAA...
4,k127_95506 flag=1 multi=6.0000 len=3937,TCAGAAAGTTTCTCCTCCCCCTCCCCAATGCTTCAATTATTTACCG...
...,...,...
10976,k127_53427 flag=0 multi=91.9979 len=442537,ATAAGTATGACGGCATTGAGCGTATGCATCAGGATTATGAAAGAGA...
10977,k127_26723 flag=1 multi=10.1277 len=8637,GGTTTACTAAGATATCAAATGTGTCTGCCCAGGCAATAAGTTCCTC...
10978,k127_53473 flag=1 multi=6.0000 len=3090,TGCCAATATCATGCAGCACCGCCGCTACGTAGATCAGTTCCTTATC...
10979,k127_26733 flag=1 multi=16.9993 len=49484,GAGGTGTTTTGGCATGAATATGCCAAAATCCCGAGACATACAAGCC...


In [16]:
# Define a function to extract the desired pattern
def extract_id_1(string):
    """Return the first ``k127_<digits>`` id in ``string`` with ``_1`` appended, else None."""
    found = re.search(r'(k127_\d+)', string)
    return found.group(1) + '_1' if found else None

# Apply the function to the 'ids' column, overwriting the full headers in
# place; a header without a k127_<digits> id becomes None.
patient_1['ids'] = patient_1['ids'].apply(extract_id_1)



In [17]:
patient_1

Unnamed: 0,ids,seq
0,k127_137511_1,AGATCGCAGTCATCACCGATCGGGACAACTGCGTGGCCGTCACCGG...
1,k127_34380_1,ACTTTGCGGCGAAGGTGCCCGCACGGCGCATCAAAGTGTCTAGGCC...
2,k127_133694_1,CGGCACGCTGACAGAAATCCAGAAGCATGACATACTGGAGCTGATT...
3,k127_145156_1,TTGAGTTTTTAAATTTATAAATTTAAAGTTTTTCTATTATGAACAA...
4,k127_95506_1,TCAGAAAGTTTCTCCTCCCCCTCCCCAATGCTTCAATTATTTACCG...
...,...,...
10976,k127_53427_1,ATAAGTATGACGGCATTGAGCGTATGCATCAGGATTATGAAAGAGA...
10977,k127_26723_1,GGTTTACTAAGATATCAAATGTGTCTGCCCAGGCAATAAGTTCCTC...
10978,k127_53473_1,TGCCAATATCATGCAGCACCGCCGCTACGTAGATCAGTTCCTTATC...
10979,k127_26733_1,GAGGTGTTTTGGCATGAATATGCCAAAATCCCGAGACATACAAGCC...


In [18]:
# Look for contigs in patient_1 whose 'seq' value repeats an earlier row.
# `duplicates` holds those repeated rows; it is empty when every sequence
# is unique.
duplicates = patient_1[patient_1.duplicated('seq')]

if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [19]:
# Parallel lists: patient_ids[i] is the id of the sequence in patient_list[i].
patient_ids = patient_1['ids'].tolist()
patient_list = patient_1['seq'].tolist()


In [20]:
len(patient_ids)


10981

In [21]:
len(patient_list)

10981

In [22]:
import time
start_time = time.time()
seq_num=[]


# Resolve IUPAC ambiguity codes first: easy_cgr only knows A, C, G and T.
# NOTE(review): replace_ambiguous_bases draws from `random`, and no seed is set
# in the visible part of this notebook, so contigs containing ambiguity codes
# may render differently between runs.
all_seq = patient_list
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# Every CGR trajectory is built and kept in memory before any image is written.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_1_res_7"
kmeres=7  # generate_cgr_and_save bins into 2 ** kmeres cells per axis

# Redundant with the os.makedirs(..., exist_ok=True) inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one file per contig, written as <output_directory>/<id>.png)
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 789.2033243179321 seconds


###  Patient 2

In [23]:
import pandas as pd

# Contig FASTA for patient 2. read_fasta_ARG returns one list per record:
# the header split on '|' followed by the joined sequence.
file_name = "Patient2.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [24]:
# Convert to DataFrame.
# The two column names only fit rows of the form [header, sequence], i.e.
# headers that contained no '|' for read_fasta_ARG to split on.
patient_2 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [25]:
patient_2

Unnamed: 0,ids,seq
0,k127_276195 flag=1 multi=6.0000 len=2555,GTAGGGAACCAGATCAAATCCACAACTTCTCGATCAGGAGAATCAT...
1,k127_165721 flag=0 multi=17.6825 len=2817,GCCTCGTCGGAGTGGGCCATGTAGTTGATGCCCAGACAGATGACGT...
2,k127_276197 flag=1 multi=5.0000 len=3067,CCACATGACATGGCGCATAATGTAATGTGGTTGCACACACCTCCAC...
3,k127_138108 flag=1 multi=10.0000 len=2946,TTTCTATGGTACGGTATATGGTACCGGGAATGTATTGCTCTCTGGA...
4,k127_193343 flag=1 multi=10.8173 len=4671,GCAGTAGTTCATGTAGAACCCGATCAGCTCCCGGGAAGCCCAGCCG...
...,...,...
21706,k127_27537 flag=1 multi=12.0000 len=7006,CCGCTTCTCTATAATATAAAAATGAATGCCTATAACCCGCAGAGAG...
21707,k127_27551 flag=0 multi=5.9799 len=3117,TCTTTGAAGGTTACGCAACGCGCAGAAGGATAATGCAGTAGATAAC...
21708,k127_27563 flag=0 multi=14.3318 len=5034,CCTTGCGGCGCACGCCCAGACGGCTCACCAGCTCACCGCCGCCCTT...
21709,k127_27575 flag=1 multi=8.0000 len=2562,AATAATAGCAGGGAAATAGCAACTATTAACAAAGTTATCAGTATAG...


In [26]:
# Define a function to extract the desired pattern
def extract_id_2(string):
    """Return the first ``k127_<digits>`` id in ``string`` with ``_2`` appended, else None."""
    found = re.search(r'(k127_\d+)', string)
    return found.group(1) + '_2' if found else None

# Apply the function to the 'ids' column, overwriting the full headers in
# place; a header without a k127_<digits> id becomes None.
patient_2['ids'] = patient_2['ids'].apply(extract_id_2)



In [27]:
patient_2

Unnamed: 0,ids,seq
0,k127_276195_2,GTAGGGAACCAGATCAAATCCACAACTTCTCGATCAGGAGAATCAT...
1,k127_165721_2,GCCTCGTCGGAGTGGGCCATGTAGTTGATGCCCAGACAGATGACGT...
2,k127_276197_2,CCACATGACATGGCGCATAATGTAATGTGGTTGCACACACCTCCAC...
3,k127_138108_2,TTTCTATGGTACGGTATATGGTACCGGGAATGTATTGCTCTCTGGA...
4,k127_193343_2,GCAGTAGTTCATGTAGAACCCGATCAGCTCCCGGGAAGCCCAGCCG...
...,...,...
21706,k127_27537_2,CCGCTTCTCTATAATATAAAAATGAATGCCTATAACCCGCAGAGAG...
21707,k127_27551_2,TCTTTGAAGGTTACGCAACGCGCAGAAGGATAATGCAGTAGATAAC...
21708,k127_27563_2,CCTTGCGGCGCACGCCCAGACGGCTCACCAGCTCACCGCCGCCCTT...
21709,k127_27575_2,AATAATAGCAGGGAAATAGCAACTATTAACAAAGTTATCAGTATAG...


In [28]:
# Look for contigs in patient_2 whose 'seq' value repeats an earlier row.
# `duplicates` holds those repeated rows; it is empty when every sequence
# is unique.
duplicates = patient_2[patient_2.duplicated('seq')]

if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [29]:
# Parallel lists: patient_ids[i] is the id of the sequence in patient_list[i].
# Only the first 15000 rows of patient_2 are kept.
patient_ids = patient_2['ids'].tolist()[:15000]
patient_list = patient_2['seq'].tolist()[:15000]



In [30]:
len(patient_ids)


15000

In [31]:
len(patient_list)

15000

In [32]:
import time
start_time = time.time()
seq_num=[]


# Resolve IUPAC ambiguity codes first: easy_cgr only knows A, C, G and T.
# NOTE(review): replace_ambiguous_bases draws from `random`, and no seed is set
# in the visible part of this notebook, so contigs containing ambiguity codes
# may render differently between runs.
all_seq = patient_list
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# Every CGR trajectory is built and kept in memory before any image is written.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_2_res_7"
kmeres=7  # generate_cgr_and_save bins into 2 ** kmeres cells per axis

# Redundant with the os.makedirs(..., exist_ok=True) inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one file per contig, written as <output_directory>/<id>.png)
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 1561.3771016597748 seconds


###  Patient 4

In [33]:
import pandas as pd

# Contig FASTA for patient 4. read_fasta_ARG returns one list per record:
# the header split on '|' followed by the joined sequence.
file_name = "Patient4.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [34]:
# Convert to DataFrame.
# The two column names only fit rows of the form [header, sequence], i.e.
# headers that contained no '|' for read_fasta_ARG to split on.
patient_4 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [35]:
patient_4

Unnamed: 0,ids,seq
0,k127_92563 flag=1 multi=6.0000 len=3073,CACCTGGTCCAGATACACCCACATTAATTACACAATCGGCTTCACC...
1,k127_102849 flag=0 multi=11.7480 len=3671,CCATCGTTCCTTCCGTAGCAGAAAAGAAAGCTCCTGGCTCCAACGT...
2,k127_51425 flag=1 multi=6.9888 len=2634,GGGCATCATTGCTGGTAAGAGCAACACCAAGTTCGTTCCGAACGAA...
3,k127_102856 flag=1 multi=4.0000 len=2875,GTGCAAAGGTACAAATCATTTTTAATTCTGCAAACTTCCTTTCTCT...
4,k127_113139 flag=0 multi=6.9879 len=5104,TCCAAACAGAAGGCGTTTGCGTTATTTCTTCTGAATTTTCAACCGC...
...,...,...
9167,k127_102771 flag=1 multi=6.0000 len=2617,CTATCATACCCTGCATGATATACAAACAAAATACTTTTCACAAACA...
9168,k127_61701 flag=1 multi=78.0000 len=74253,GTGTATAATACATTGTCCCAATATTATTTTTTTTTAGATAAAGAAA...
9169,k127_102786 flag=0 multi=45.9470 len=40013,CGAAATGGTATGGAAAAGAGATAATAAAAATTGATAAATTTTATCC...
9170,k127_102798 flag=1 multi=9.0000 len=13154,ACTAATTAAATAATACAATATTTTTGAAAAAATAGCAAGTTTTTAG...


In [36]:
# Define a function to extract the desired pattern
def extract_id_4(string):
    """Return the first ``k127_<digits>`` id in ``string`` with ``_4`` appended, else None."""
    found = re.search(r'(k127_\d+)', string)
    return found.group(1) + '_4' if found else None

# Apply the function to the 'ids' column, overwriting the full headers in
# place; a header without a k127_<digits> id becomes None.
patient_4['ids'] = patient_4['ids'].apply(extract_id_4)



In [37]:
patient_4

Unnamed: 0,ids,seq
0,k127_92563_4,CACCTGGTCCAGATACACCCACATTAATTACACAATCGGCTTCACC...
1,k127_102849_4,CCATCGTTCCTTCCGTAGCAGAAAAGAAAGCTCCTGGCTCCAACGT...
2,k127_51425_4,GGGCATCATTGCTGGTAAGAGCAACACCAAGTTCGTTCCGAACGAA...
3,k127_102856_4,GTGCAAAGGTACAAATCATTTTTAATTCTGCAAACTTCCTTTCTCT...
4,k127_113139_4,TCCAAACAGAAGGCGTTTGCGTTATTTCTTCTGAATTTTCAACCGC...
...,...,...
9167,k127_102771_4,CTATCATACCCTGCATGATATACAAACAAAATACTTTTCACAAACA...
9168,k127_61701_4,GTGTATAATACATTGTCCCAATATTATTTTTTTTTAGATAAAGAAA...
9169,k127_102786_4,CGAAATGGTATGGAAAAGAGATAATAAAAATTGATAAATTTTATCC...
9170,k127_102798_4,ACTAATTAAATAATACAATATTTTTGAAAAAATAGCAAGTTTTTAG...


In [38]:
# Look for contigs in patient_4 whose 'seq' value repeats an earlier row.
# `duplicates` holds those repeated rows; it is empty when every sequence
# is unique.
duplicates = patient_4[patient_4.duplicated('seq')]

if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [39]:
# Parallel lists: patient_ids[i] is the id of the sequence in patient_list[i].
patient_ids = patient_4['ids'].tolist()
patient_list = patient_4['seq'].tolist()


In [40]:
len(patient_ids)


9172

In [41]:
len(patient_list)

9172

In [42]:
import time
start_time = time.time()
seq_num=[]


# Resolve IUPAC ambiguity codes first: easy_cgr only knows A, C, G and T.
# NOTE(review): replace_ambiguous_bases draws from `random`, and no seed is set
# in the visible part of this notebook, so contigs containing ambiguity codes
# may render differently between runs.
all_seq = patient_list
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# Every CGR trajectory is built and kept in memory before any image is written.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_4_res_7"
kmeres=7  # generate_cgr_and_save bins into 2 ** kmeres cells per axis

# Redundant with the os.makedirs(..., exist_ok=True) inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one file per contig, written as <output_directory>/<id>.png)
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 956.8253862857819 seconds


###  Patient 5

In [43]:
import pandas as pd

# Contig FASTA for patient 5. read_fasta_ARG returns one list per record:
# the header split on '|' followed by the joined sequence.
file_name = "Patient5.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [44]:
# Convert to DataFrame.
# The two column names only fit rows of the form [header, sequence], i.e.
# headers that contained no '|' for read_fasta_ARG to split on.
patient_5 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [45]:
patient_5

Unnamed: 0,ids,seq
0,k127_137157 flag=0 multi=33.0000 len=4132,GTTTATAATGGCTAAGCGGGGAACGCTGTAAGCGTTATCCCGGTTT...
1,k127_201702 flag=1 multi=4.0000 len=2824,GCATTGCAAAGACCAAAGAAGAAGCAAGAGAACGCTGCATGAGTAT...
2,k127_145225 flag=0 multi=17.5053 len=4829,CACTCCGTTTACAGTAACACGCTTCGCGATGCACAGAAATCATGCT...
3,k127_185568 flag=1 multi=8.0000 len=2762,GTTCTGCGCGGTCTTGGTCGCGATGACCCTCACACGCTTTTTAGCG...
4,k127_225909 flag=1 multi=7.0000 len=2983,CGGCCAGCTTCCGGTTCAATTCCTGCTTGATCTCGCTGGCGATCTG...
...,...,...
25249,k127_177427 flag=1 multi=9.0000 len=2665,GACTAGATATAAAGTTTACGATATAATCACATTTATTAGAGGAGGA...
25250,k127_177445 flag=1 multi=10.0000 len=4746,CTTATAAATAAAGGGTGGCAGAAGAGATTTTCAAATGATTCTGCAC...
25251,k127_177453 flag=0 multi=50.9933 len=74422,CTCCTACTCCTACTCCTCTACCCTCCCTTCCGGACTTCTCCGACAC...
25252,k127_177464 flag=1 multi=11.0000 len=12795,TTAAGGTGATACTGATTATAATTAGAAAGATTGATTTTTAGATACT...


In [46]:
# Define a function to extract the desired pattern
def extract_id_5(string):
    """Return the first ``k127_<digits>`` id in ``string`` with ``_5`` appended, else None."""
    found = re.search(r'(k127_\d+)', string)
    return found.group(1) + '_5' if found else None

# Apply the function to the 'ids' column, overwriting the full headers in
# place; a header without a k127_<digits> id becomes None.
patient_5['ids'] = patient_5['ids'].apply(extract_id_5)



In [47]:
patient_5

Unnamed: 0,ids,seq
0,k127_137157_5,GTTTATAATGGCTAAGCGGGGAACGCTGTAAGCGTTATCCCGGTTT...
1,k127_201702_5,GCATTGCAAAGACCAAAGAAGAAGCAAGAGAACGCTGCATGAGTAT...
2,k127_145225_5,CACTCCGTTTACAGTAACACGCTTCGCGATGCACAGAAATCATGCT...
3,k127_185568_5,GTTCTGCGCGGTCTTGGTCGCGATGACCCTCACACGCTTTTTAGCG...
4,k127_225909_5,CGGCCAGCTTCCGGTTCAATTCCTGCTTGATCTCGCTGGCGATCTG...
...,...,...
25249,k127_177427_5,GACTAGATATAAAGTTTACGATATAATCACATTTATTAGAGGAGGA...
25250,k127_177445_5,CTTATAAATAAAGGGTGGCAGAAGAGATTTTCAAATGATTCTGCAC...
25251,k127_177453_5,CTCCTACTCCTACTCCTCTACCCTCCCTTCCGGACTTCTCCGACAC...
25252,k127_177464_5,TTAAGGTGATACTGATTATAATTAGAAAGATTGATTTTTAGATACT...


In [48]:
# Look for contigs in patient_5 whose 'seq' value repeats an earlier row.
# `duplicates` holds those repeated rows; it is empty when every sequence
# is unique.
duplicates = patient_5[patient_5.duplicated('seq')]

if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [49]:
# Parallel lists: patient_ids[i] is the id of the sequence in patient_list[i].
# Only the first 15000 rows of patient_5 are kept.
patient_ids = patient_5['ids'].tolist()[:15000]
patient_list = patient_5['seq'].tolist()[:15000]



In [50]:
len(patient_ids)


15000

In [51]:
len(patient_list)

15000

In [52]:
import time
start_time = time.time()
seq_num=[]


# Resolve IUPAC ambiguity codes first: easy_cgr only knows A, C, G and T.
# NOTE(review): replace_ambiguous_bases draws from `random`, and no seed is set
# in the visible part of this notebook, so contigs containing ambiguity codes
# may render differently between runs.
all_seq = patient_list
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# Every CGR trajectory is built and kept in memory before any image is written.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_5_res_7"
kmeres=7  # generate_cgr_and_save bins into 2 ** kmeres cells per axis

# Redundant with the os.makedirs(..., exist_ok=True) inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one file per contig, written as <output_directory>/<id>.png)
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 2124.975835084915 seconds


###  Patient 6

In [53]:
import pandas as pd

# Contig FASTA for patient 6. read_fasta_ARG returns one list per record:
# the header split on '|' followed by the joined sequence.
file_name = "Patient6.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [54]:
# Convert to DataFrame.
# The two column names only fit rows of the form [header, sequence], i.e.
# headers that contained no '|' for read_fasta_ARG to split on.
patient_6 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [55]:
patient_6

Unnamed: 0,ids,seq
0,k127_13742 flag=1 multi=9.0000 len=2833,TTACCATACTAGTCTAGTTCTAATGATTATTACACTAGTTTATTGA...
1,k127_6873 flag=1 multi=5.0000 len=2721,GATATACAGTTCCTTCTGCTCCGGACAGTAGAGCGTCGCGAACACC...
2,k127_73262 flag=1 multi=5.0000 len=2941,GCTGTAATCGCTGCTTTGAAGGCAGTGCCGAATTCTTTTTATTATA...
3,k127_9161 flag=0 multi=28.0000 len=3883,GATGTTACTTTTGTTACTGCTACTGATGGTAACCACGGTAGAGGTG...
4,k127_20612 flag=1 multi=11.0000 len=3881,GTATAAGGTTCTCCCTCATGTATATATAACTCATCTATACCCCCTT...
...,...,...
10262,k127_57202 flag=0 multi=11.4622 len=3288,GAGTTACCTGGCTTATGGGATGTCTTTATTGGAAAGATGAGTTTTG...
10263,k127_57212 flag=1 multi=5.0000 len=2774,GTATATTGACTCATTTCAATACATGGGTTATAATATACAAAAATCT...
10264,k127_57213 flag=0 multi=7.0344 len=2886,GAAGGAACTGTTATTAAGAGACAGAACGGTGGAGCAAAAGAAACTT...
10265,k127_57219 flag=0 multi=18.3776 len=4986,TGTATGGACAATCTACAGAAACGGCTATCCGGGCAGGAGTCATTCG...


In [56]:
# Define a function to extract the desired pattern
def extract_id_6(string):
    """Return the first ``k127_<digits>`` id in ``string`` with ``_6`` appended, else None."""
    found = re.search(r'(k127_\d+)', string)
    return found.group(1) + '_6' if found else None

# Apply the function to the 'ids' column, overwriting the full headers in
# place; a header without a k127_<digits> id becomes None.
patient_6['ids'] = patient_6['ids'].apply(extract_id_6)



In [57]:
patient_6

Unnamed: 0,ids,seq
0,k127_13742_6,TTACCATACTAGTCTAGTTCTAATGATTATTACACTAGTTTATTGA...
1,k127_6873_6,GATATACAGTTCCTTCTGCTCCGGACAGTAGAGCGTCGCGAACACC...
2,k127_73262_6,GCTGTAATCGCTGCTTTGAAGGCAGTGCCGAATTCTTTTTATTATA...
3,k127_9161_6,GATGTTACTTTTGTTACTGCTACTGATGGTAACCACGGTAGAGGTG...
4,k127_20612_6,GTATAAGGTTCTCCCTCATGTATATATAACTCATCTATACCCCCTT...
...,...,...
10262,k127_57202_6,GAGTTACCTGGCTTATGGGATGTCTTTATTGGAAAGATGAGTTTTG...
10263,k127_57212_6,GTATATTGACTCATTTCAATACATGGGTTATAATATACAAAAATCT...
10264,k127_57213_6,GAAGGAACTGTTATTAAGAGACAGAACGGTGGAGCAAAAGAAACTT...
10265,k127_57219_6,TGTATGGACAATCTACAGAAACGGCTATCCGGGCAGGAGTCATTCG...


In [58]:
# Look for contigs in patient_6 whose 'seq' value repeats an earlier row.
# `duplicates` holds those repeated rows; it is empty when every sequence
# is unique.
duplicates = patient_6[patient_6.duplicated('seq')]

if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [59]:
# Parallel lists: patient_ids[i] is the id of the sequence in patient_list[i].
patient_ids = patient_6['ids'].tolist()
patient_list = patient_6['seq'].tolist()


In [60]:
len(patient_ids)


10267

In [61]:
len(patient_list)

10267

In [62]:
import time
start_time = time.time()
seq_num=[]


# Resolve IUPAC ambiguity codes first: easy_cgr only knows A, C, G and T.
# NOTE(review): replace_ambiguous_bases draws from `random`, and no seed is set
# in the visible part of this notebook, so contigs containing ambiguity codes
# may render differently between runs.
all_seq = patient_list
all_seq = [replace_ambiguous_bases(seq) for seq in all_seq]
# Every CGR trajectory is built and kept in memory before any image is written.
for sequence in all_seq:
        seq_cgr = easy_cgr(sequence)
        seq_num.append(seq_cgr)
sequences = seq_num
class_labels = patient_ids 
output_directory = "patient_6_res_7"
kmeres=7  # generate_cgr_and_save bins into 2 ** kmeres cells per axis

# Redundant with the os.makedirs(..., exist_ok=True) inside
# generate_and_save_cgr_images_label, but harmless.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Call the function to generate and save CGR images
# (one file per contig, written as <output_directory>/<id>.png)
generate_and_save_cgr_images_label(sequences, class_labels, output_directory,kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 1397.5441434383392 seconds


###  Healthy 14

In [63]:
import pandas as pd  # redundant: pandas is already imported in the notebook's first cell

# Contigs of the Healthy 14 sample (relative path).
# read_fasta_ARG is not defined in this cell; presumably a FASTA reader defined
# earlier in the notebook — TODO confirm.
file_name = "Healthy14.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [64]:
# Build a two-column frame from the parsed records: 'ids' and 'seq'.
# assumes each record is a (header, sequence) pair — TODO confirm against read_fasta_ARG
healthy_14 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [65]:
healthy_14

Unnamed: 0,ids,seq
0,k127_26544 flag=0 multi=8.6856 len=4414,GGATATGGTCATATATGGACAGGACATCACAAGCAAAATTCTGAAC...
1,k127_66360 flag=1 multi=4.0000 len=2985,AAATCCCAACTTATCATGTCTATGACCGCAAGACCATGTCAGGTGG...
2,k127_92900 flag=1 multi=6.0000 len=4968,GCCTTTTTTATTTGATTAAATTTGGTTATTTACCACGAAATCACAT...
3,k127_53088 flag=1 multi=5.0000 len=4849,GTTGTATGCGGAGATGTCAGACAGAGAGTCGCAGCTTTCCGAGGAA...
4,k127_39823 flag=0 multi=4.9892 len=3847,GATCGCTTTTCTTTTCCTCACCCATGATGACACGCATCATATCGAG...
...,...,...
10936,k127_13271 flag=1 multi=5.0000 len=2751,GGCTGAAAGACCGGAATGCCGTGCGCGAGGGCAAGCTCCTTGACCG...
10937,k127_26526 flag=1 multi=4.0000 len=3180,GCTTCATGAGGTCGAATGCGGAATTACCGGTTTCGGCAATTTCGGC...
10938,k127_26532 flag=1 multi=5.9875 len=4126,TGCAGCATCAGCATCTGAATCAGCAGCGTTACAATATTTAGCGCCT...
10939,k127_26534 flag=1 multi=91.9340 len=27697,ATCAAAGTAGTCTAAAGTGCCTTCAGGAAGAAGAAGGCGATAACCG...


In [66]:
def extract_id_14(string, suffix='_14'):
    """Return the short contig id found in a FASTA header, tagged with a sample suffix.

    Searches `string` for the first ``k127_<digits>`` token and appends `suffix`,
    e.g. ``'k127_26544 flag=0 multi=8.6856 len=4414'`` -> ``'k127_26544_14'``.

    Args:
        string: Header text to search.
        suffix: Text appended to the matched id. Defaults to ``'_14'``; pass a
            different value to reuse this function for another sample.

    Returns:
        The matched id followed by `suffix`, or None when `string` contains no
        ``k127_<digits>`` token.
    """
    match = re.search(r'(k127_\d+)', string)
    if match is None:
        return None
    return match.group(1) + suffix

# Overwrite 'ids' in place: each raw header is replaced by what extract_id_14
# returns for it (the short contig id plus suffix, or None when nothing matches).
healthy_14['ids'] = healthy_14['ids'].apply(extract_id_14)



In [67]:
healthy_14

Unnamed: 0,ids,seq
0,k127_26544_14,GGATATGGTCATATATGGACAGGACATCACAAGCAAAATTCTGAAC...
1,k127_66360_14,AAATCCCAACTTATCATGTCTATGACCGCAAGACCATGTCAGGTGG...
2,k127_92900_14,GCCTTTTTTATTTGATTAAATTTGGTTATTTACCACGAAATCACAT...
3,k127_53088_14,GTTGTATGCGGAGATGTCAGACAGAGAGTCGCAGCTTTCCGAGGAA...
4,k127_39823_14,GATCGCTTTTCTTTTCCTCACCCATGATGACACGCATCATATCGAG...
...,...,...
10936,k127_13271_14,GGCTGAAAGACCGGAATGCCGTGCGCGAGGGCAAGCTCCTTGACCG...
10937,k127_26526_14,GCTTCATGAGGTCGAATGCGGAATTACCGGTTTCGGCAATTTCGGC...
10938,k127_26532_14,TGCAGCATCAGCATCTGAATCAGCAGCGTTACAATATTTAGCGCCT...
10939,k127_26534_14,ATCAAAGTAGTCTAAAGTGCCTTCAGGAAGAAGAAGGCGATAACCG...


In [68]:
# Sanity check: does any contig sequence occur more than once in healthy_14?
# duplicated() flags each repeat after the first occurrence, so `duplicates`
# holds only the extra copies, not the row they repeat.
duplicates = healthy_14.loc[healthy_14.duplicated(subset='seq')]

# Report the outcome; the offending rows are printed when any exist.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [69]:
# Split the frame into two parallel lists in row order:
# healthy_ids[i] is the id that belongs to the sequence healthy_list[i].
# These two names are reassigned for every healthy sample in this notebook,
# so they always hold the most recently extracted one.
healthy_ids = healthy_14['ids'].tolist()
healthy_list = healthy_14['seq'].tolist()


In [70]:
len(healthy_ids)


10941

In [71]:
len(healthy_list)

10941

In [72]:
# Build the CGR input for every Healthy 14 contig and hand it, together with the
# contig ids as labels, to generate_and_save_cgr_images_label.
# NOTE: replace_ambiguous_bases resolves each ambiguity code with random.choice,
# so the result is only reproducible if the `random` module is seeded beforehand.
start_time = time.time()

# Resolve ambiguity codes first, then convert each cleaned sequence with easy_cgr.
all_seq = [replace_ambiguous_bases(seq) for seq in healthy_list]
seq_num = [easy_cgr(sequence) for sequence in all_seq]

sequences = seq_num
class_labels = healthy_ids
output_directory = "healthy_14_res_7"
kmeres = 7

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, which avoids the check-then-create race of a separate exists() test.
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 1291.4193758964539 seconds


###  Healthy 15

In [73]:
import pandas as pd  # redundant: pandas is already imported in the notebook's first cell

# Contigs of the Healthy 15 sample (relative path).
# read_fasta_ARG is not defined in this cell; presumably a FASTA reader defined
# earlier in the notebook — TODO confirm.
file_name = "Healthy15.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [74]:
# Build a two-column frame from the parsed records: 'ids' and 'seq'.
# assumes each record is a (header, sequence) pair — TODO confirm against read_fasta_ARG
healthy_15 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [75]:
healthy_15

Unnamed: 0,ids,seq
0,k127_60849 flag=1 multi=5.0000 len=3488,TTTCTAAATAAGCTTCGTCTCCTCCAACTTCTCTGTAAACTCATCA...
1,k127_85185 flag=0 multi=32.6564 len=7298,AAGAAACAATATATAACGAAAAAATCTCTAAATAAGAGAATTTACA...
2,k127_73020 flag=1 multi=8.0000 len=3888,CAATATGAACAGGTAAAAACGGCCTACGAAGCAACCCAAGCACGCT...
3,k127_12172 flag=0 multi=54.8638 len=8057,CACAGTAACGTAGCCAAAATTCATTCCAGATTGCCTGCGGCAATGG...
4,k127_36522 flag=0 multi=7.9459 len=2842,TCTGGCAGGAATATCTCGATTTGGTCGAACATTTGAGAAAAACCGA...
...,...,...
10736,k127_36447 flag=1 multi=8.0000 len=26058,CCCCCAATCCAGCTAATACAATACTTTGACATCTCATCCGTGACCT...
10737,k127_36474 flag=1 multi=9.0000 len=14046,TTTTATCGTAACAAACCCGCCATACGGAGAGCGTTTAGAGGATAAG...
10738,k127_36481 flag=0 multi=11.0923 len=3930,CTTTGATTTTGAGTTTGGCTTGCGACACCATTCTCATCATATCACA...
10739,k127_36491 flag=1 multi=15.9921 len=32530,TCCTAATCATCATTATAATTCTCAGACCAGCACTGGTGTGGTGGTC...


In [76]:
def extract_id_15(string, suffix='_15'):
    """Return the short contig id found in a FASTA header, tagged with a sample suffix.

    Searches `string` for the first ``k127_<digits>`` token and appends `suffix`,
    e.g. ``'k127_60849 flag=1 multi=5.0000 len=3488'`` -> ``'k127_60849_15'``.

    Args:
        string: Header text to search.
        suffix: Text appended to the matched id. Defaults to ``'_15'``; pass a
            different value to reuse this function for another sample.

    Returns:
        The matched id followed by `suffix`, or None when `string` contains no
        ``k127_<digits>`` token.
    """
    match = re.search(r'(k127_\d+)', string)
    if match is None:
        return None
    return match.group(1) + suffix

# Overwrite 'ids' in place: each raw header is replaced by what extract_id_15
# returns for it (the short contig id plus suffix, or None when nothing matches).
healthy_15['ids'] = healthy_15['ids'].apply(extract_id_15)



In [77]:
healthy_15

Unnamed: 0,ids,seq
0,k127_60849_15,TTTCTAAATAAGCTTCGTCTCCTCCAACTTCTCTGTAAACTCATCA...
1,k127_85185_15,AAGAAACAATATATAACGAAAAAATCTCTAAATAAGAGAATTTACA...
2,k127_73020_15,CAATATGAACAGGTAAAAACGGCCTACGAAGCAACCCAAGCACGCT...
3,k127_12172_15,CACAGTAACGTAGCCAAAATTCATTCCAGATTGCCTGCGGCAATGG...
4,k127_36522_15,TCTGGCAGGAATATCTCGATTTGGTCGAACATTTGAGAAAAACCGA...
...,...,...
10736,k127_36447_15,CCCCCAATCCAGCTAATACAATACTTTGACATCTCATCCGTGACCT...
10737,k127_36474_15,TTTTATCGTAACAAACCCGCCATACGGAGAGCGTTTAGAGGATAAG...
10738,k127_36481_15,CTTTGATTTTGAGTTTGGCTTGCGACACCATTCTCATCATATCACA...
10739,k127_36491_15,TCCTAATCATCATTATAATTCTCAGACCAGCACTGGTGTGGTGGTC...


In [78]:
# Sanity check: does any contig sequence occur more than once in healthy_15?
# duplicated() flags each repeat after the first occurrence, so `duplicates`
# holds only the extra copies, not the row they repeat.
duplicates = healthy_15.loc[healthy_15.duplicated(subset='seq')]

# Report the outcome; the offending rows are printed when any exist.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [79]:
# Split the frame into two parallel lists in row order:
# healthy_ids[i] is the id that belongs to the sequence healthy_list[i].
# These two names are reassigned for every healthy sample in this notebook,
# so they always hold the most recently extracted one.
healthy_ids = healthy_15['ids'].tolist()
healthy_list = healthy_15['seq'].tolist()


In [80]:
len(healthy_ids)


10741

In [81]:
len(healthy_list)

10741

In [82]:
# Build the CGR input for every Healthy 15 contig and hand it, together with the
# contig ids as labels, to generate_and_save_cgr_images_label.
# NOTE: replace_ambiguous_bases resolves each ambiguity code with random.choice,
# so the result is only reproducible if the `random` module is seeded beforehand.
start_time = time.time()

# Resolve ambiguity codes first, then convert each cleaned sequence with easy_cgr.
all_seq = [replace_ambiguous_bases(seq) for seq in healthy_list]
seq_num = [easy_cgr(sequence) for sequence in all_seq]

sequences = seq_num
class_labels = healthy_ids
output_directory = "healthy_15_res_7"
kmeres = 7

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, which avoids the check-then-create race of a separate exists() test.
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 1624.939656496048 seconds


###  Healthy 16

In [83]:
import pandas as pd  # redundant: pandas is already imported in the notebook's first cell

# Contigs of the Healthy 16 sample (relative path).
# read_fasta_ARG is not defined in this cell; presumably a FASTA reader defined
# earlier in the notebook — TODO confirm.
file_name = "Healthy16.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [84]:
# Build a two-column frame from the parsed records: 'ids' and 'seq'.
# assumes each record is a (header, sequence) pair — TODO confirm against read_fasta_ARG
healthy_16 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [85]:
healthy_16

Unnamed: 0,ids,seq
0,k127_6161 flag=0 multi=6.7088 len=4014,TTTTGTTTTACACGGCAGTTTCCCTTTAGCAGTAACCGCGGATAAG...
1,k127_12327 flag=1 multi=6.0000 len=3523,ATGGAAGACTCTTATATTGATTATGCAATGAGCGTTATTGCCTCCC...
2,k127_30808 flag=1 multi=4.0000 len=2985,TCTCCAGATAGCGGTAGGGATTGTAGGCCGAGGGGTTGTTGGTGAT...
3,k127_43132 flag=1 multi=5.0000 len=2761,ACGGCGAGCAATAATTTTTAGCGCAGCAATATTATGCGTTTTACGC...
4,k127_36974 flag=1 multi=8.0000 len=4585,GGGTAAAGCAGTCGGTTCGTCATTCAATTCGAAAGCAGCTTCGGCG...
...,...,...
4960,k127_30800 flag=1 multi=6.0000 len=8008,AGACCAGATCGTTTTCCGTTAAATCAACGAATTCCGTATGACCATC...
4961,k127_30803 flag=1 multi=7.0000 len=2929,GGAGTTTCAATAAAATATTTTGACATTTTTTTATACTATTTTATCC...
4962,k127_6111 flag=0 multi=11.9410 len=23984,TATGCAAGTGACATTTTCTGACTCGCCTTCGGCAAAGCCTTTACCG...
4963,k127_6140 flag=0 multi=18.8130 len=7920,AAGCTCGGTGTCAGGGATGCGATCTCATCCATGATCTGAGCAGAAG...


In [86]:
def extract_id_16(string, suffix='_16'):
    """Return the short contig id found in a FASTA header, tagged with a sample suffix.

    Searches `string` for the first ``k127_<digits>`` token and appends `suffix`,
    e.g. ``'k127_6161 flag=0 multi=6.7088 len=4014'`` -> ``'k127_6161_16'``.

    Args:
        string: Header text to search.
        suffix: Text appended to the matched id. Defaults to ``'_16'``; pass a
            different value to reuse this function for another sample.

    Returns:
        The matched id followed by `suffix`, or None when `string` contains no
        ``k127_<digits>`` token.
    """
    match = re.search(r'(k127_\d+)', string)
    if match is None:
        return None
    return match.group(1) + suffix

# Overwrite 'ids' in place: each raw header is replaced by what extract_id_16
# returns for it (the short contig id plus suffix, or None when nothing matches).
healthy_16['ids'] = healthy_16['ids'].apply(extract_id_16)



In [87]:
healthy_16

Unnamed: 0,ids,seq
0,k127_6161_16,TTTTGTTTTACACGGCAGTTTCCCTTTAGCAGTAACCGCGGATAAG...
1,k127_12327_16,ATGGAAGACTCTTATATTGATTATGCAATGAGCGTTATTGCCTCCC...
2,k127_30808_16,TCTCCAGATAGCGGTAGGGATTGTAGGCCGAGGGGTTGTTGGTGAT...
3,k127_43132_16,ACGGCGAGCAATAATTTTTAGCGCAGCAATATTATGCGTTTTACGC...
4,k127_36974_16,GGGTAAAGCAGTCGGTTCGTCATTCAATTCGAAAGCAGCTTCGGCG...
...,...,...
4960,k127_30800_16,AGACCAGATCGTTTTCCGTTAAATCAACGAATTCCGTATGACCATC...
4961,k127_30803_16,GGAGTTTCAATAAAATATTTTGACATTTTTTTATACTATTTTATCC...
4962,k127_6111_16,TATGCAAGTGACATTTTCTGACTCGCCTTCGGCAAAGCCTTTACCG...
4963,k127_6140_16,AAGCTCGGTGTCAGGGATGCGATCTCATCCATGATCTGAGCAGAAG...


In [88]:
# Sanity check: does any contig sequence occur more than once in healthy_16?
# duplicated() flags each repeat after the first occurrence, so `duplicates`
# holds only the extra copies, not the row they repeat.
duplicates = healthy_16.loc[healthy_16.duplicated(subset='seq')]

# Report the outcome; the offending rows are printed when any exist.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [89]:
# Split the frame into two parallel lists in row order:
# healthy_ids[i] is the id that belongs to the sequence healthy_list[i].
# These two names are reassigned for every healthy sample in this notebook,
# so they always hold the most recently extracted one.
healthy_ids = healthy_16['ids'].tolist()
healthy_list = healthy_16['seq'].tolist()


In [90]:
len(healthy_ids)


4965

In [91]:
len(healthy_list)

4965

In [92]:
# Build the CGR input for every Healthy 16 contig and hand it, together with the
# contig ids as labels, to generate_and_save_cgr_images_label.
# NOTE: replace_ambiguous_bases resolves each ambiguity code with random.choice,
# so the result is only reproducible if the `random` module is seeded beforehand.
start_time = time.time()

# Resolve ambiguity codes first, then convert each cleaned sequence with easy_cgr.
all_seq = [replace_ambiguous_bases(seq) for seq in healthy_list]
seq_num = [easy_cgr(sequence) for sequence in all_seq]

sequences = seq_num
class_labels = healthy_ids
output_directory = "healthy_16_res_7"
kmeres = 7

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, which avoids the check-then-create race of a separate exists() test.
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 697.9979186058044 seconds


###  Healthy 17

In [93]:
import pandas as pd  # redundant: pandas is already imported in the notebook's first cell

# Contigs of the Healthy 17 sample (relative path).
# read_fasta_ARG is not defined in this cell; presumably a FASTA reader defined
# earlier in the notebook — TODO confirm.
file_name = "Healthy17.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [94]:
# Build a two-column frame from the parsed records: 'ids' and 'seq'.
# assumes each record is a (header, sequence) pair — TODO confirm against read_fasta_ARG
healthy_17 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [95]:
healthy_17

Unnamed: 0,ids,seq
0,k127_31361 flag=0 multi=6.9561 len=2858,AAAGATTTCCTGCTGGCATTTACAGCGGATGGAAGGAATAAGAACT...
1,k127_13442 flag=1 multi=7.0000 len=3341,TAATAGGAAGTGAAACGCTTTTTCAAAGTTAAAATAATATAGAAAG...
2,k127_44801 flag=1 multi=6.0000 len=3749,TTGCTGTACAGATGTTTATTTAAGAACATGATCAATGTCATGGTCG...
3,k127_40322 flag=1 multi=6.0000 len=4452,GCCCTCATCGCTCTGGCAGCATTCATCCTGATACGCAGCCAGGTGA...
4,k127_31369 flag=1 multi=6.0000 len=2690,TACACTACATTACGTATAAATCCACATATATGCCTAAATAGATAGC...
...,...,...
6666,k127_40294 flag=1 multi=27.0000 len=17187,AAGTATAGAGATAACTTTTATTTCTATGAAAACTGTGAAAAAGATG...
6667,k127_44776 flag=1 multi=6.0000 len=4195,ATTCCAGACGGAGAGGAGAGGACTTTCTATGTAATCGGAAAGAATT...
6668,k127_40298 flag=1 multi=6.9188 len=14678,GGGAAATCGGTAAAAGTGTATTATTTTGAGAGAGTTAAATCAATTA...
6669,k127_40308 flag=1 multi=4.0000 len=2563,AATAGGATTTTATTCGGTTTTGCGATAAAATATATTTATCTTTTTA...


In [96]:
def extract_id_17(string, suffix='_17'):
    """Return the short contig id found in a FASTA header, tagged with a sample suffix.

    Searches `string` for the first ``k127_<digits>`` token and appends `suffix`,
    e.g. ``'k127_31361 flag=0 multi=6.9561 len=2858'`` -> ``'k127_31361_17'``.

    Args:
        string: Header text to search.
        suffix: Text appended to the matched id. Defaults to ``'_17'``; pass a
            different value to reuse this function for another sample.

    Returns:
        The matched id followed by `suffix`, or None when `string` contains no
        ``k127_<digits>`` token.
    """
    match = re.search(r'(k127_\d+)', string)
    if match is None:
        return None
    return match.group(1) + suffix

# Overwrite 'ids' in place: each raw header is replaced by what extract_id_17
# returns for it (the short contig id plus suffix, or None when nothing matches).
healthy_17['ids'] = healthy_17['ids'].apply(extract_id_17)



In [97]:
healthy_17

Unnamed: 0,ids,seq
0,k127_31361_17,AAAGATTTCCTGCTGGCATTTACAGCGGATGGAAGGAATAAGAACT...
1,k127_13442_17,TAATAGGAAGTGAAACGCTTTTTCAAAGTTAAAATAATATAGAAAG...
2,k127_44801_17,TTGCTGTACAGATGTTTATTTAAGAACATGATCAATGTCATGGTCG...
3,k127_40322_17,GCCCTCATCGCTCTGGCAGCATTCATCCTGATACGCAGCCAGGTGA...
4,k127_31369_17,TACACTACATTACGTATAAATCCACATATATGCCTAAATAGATAGC...
...,...,...
6666,k127_40294_17,AAGTATAGAGATAACTTTTATTTCTATGAAAACTGTGAAAAAGATG...
6667,k127_44776_17,ATTCCAGACGGAGAGGAGAGGACTTTCTATGTAATCGGAAAGAATT...
6668,k127_40298_17,GGGAAATCGGTAAAAGTGTATTATTTTGAGAGAGTTAAATCAATTA...
6669,k127_40308_17,AATAGGATTTTATTCGGTTTTGCGATAAAATATATTTATCTTTTTA...


In [98]:
# Sanity check: does any contig sequence occur more than once in healthy_17?
# duplicated() flags each repeat after the first occurrence, so `duplicates`
# holds only the extra copies, not the row they repeat.
duplicates = healthy_17.loc[healthy_17.duplicated(subset='seq')]

# Report the outcome; the offending rows are printed when any exist.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [99]:
# Split the frame into two parallel lists in row order:
# healthy_ids[i] is the id that belongs to the sequence healthy_list[i].
# These two names are reassigned for every healthy sample in this notebook,
# so they always hold the most recently extracted one.
healthy_ids = healthy_17['ids'].tolist()
healthy_list = healthy_17['seq'].tolist()


In [100]:
len(healthy_ids)


6671

In [101]:
len(healthy_list)

6671

In [102]:
# Build the CGR input for every Healthy 17 contig and hand it, together with the
# contig ids as labels, to generate_and_save_cgr_images_label.
# NOTE: replace_ambiguous_bases resolves each ambiguity code with random.choice,
# so the result is only reproducible if the `random` module is seeded beforehand.
start_time = time.time()

# Resolve ambiguity codes first, then convert each cleaned sequence with easy_cgr.
all_seq = [replace_ambiguous_bases(seq) for seq in healthy_list]
seq_num = [easy_cgr(sequence) for sequence in all_seq]

sequences = seq_num
class_labels = healthy_ids
output_directory = "healthy_17_res_7"
kmeres = 7

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, which avoids the check-then-create race of a separate exists() test.
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 586.1758382320404 seconds


###  Healthy 18

In [103]:
import pandas as pd  # redundant: pandas is already imported in the notebook's first cell

# Contigs of the Healthy 18 sample (relative path).
# read_fasta_ARG is not defined in this cell; presumably a FASTA reader defined
# earlier in the notebook — TODO confirm.
file_name = "Healthy18.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [104]:
# Build a two-column frame from the parsed records: 'ids' and 'seq'.
# assumes each record is a (header, sequence) pair — TODO confirm against read_fasta_ARG
healthy_18 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [105]:
healthy_18

Unnamed: 0,ids,seq
0,k127_42281 flag=1 multi=8.9075 len=2634,CTGGAAAACAGGGCATCTAAAACACCAGAATTCTTTGGTTTTTTAA...
1,k127_37751 flag=1 multi=5.0000 len=2564,GCAGGGGCCACCCGGACCCAGGGGGACGAAGACAAGACGGCTTCTG...
2,k127_28691 flag=1 multi=4.0000 len=3188,GCTTCGCTCAGTGTCCAGGATGGGAACCACTCCCACTTGGGACGTT...
3,k127_4532 flag=1 multi=4.0000 len=3086,CTACCAAGGAGCTTATGAAGCTCACGCTGGAGCAGCGAATTGCTCA...
4,k127_27182 flag=1 multi=5.0000 len=2850,GTGATGCCCTCGGACAGGATGTTCAGGCACAGCACGGTGATCATGA...
...,...,...
9211,k127_30161 flag=1 multi=8.0000 len=11191,TGGGCCAATCGCATCATTAGTTAAATATTGATTACCTAACCCTACC...
9212,k127_30166 flag=1 multi=8.0000 len=5622,ATACCATGGATATTTCCTGCTCTTTTGCAGGTCAGGGTCTCTTTGA...
9213,k127_30170 flag=0 multi=6.9934 len=13906,GTTACTGTTCACGCATACGTGCCCAGTAACGGAATTTGCCGTGGCT...
9214,k127_30179 flag=1 multi=5.0000 len=2673,CGTACCGGCCAACTATGTTACAGCGTGGGTTGCTAAGGCGAGGAAG...


In [106]:
def extract_id_18(string, suffix='_18'):
    """Return the short contig id found in a FASTA header, tagged with a sample suffix.

    Searches `string` for the first ``k127_<digits>`` token and appends `suffix`,
    e.g. ``'k127_42281 flag=1 multi=8.9075 len=2634'`` -> ``'k127_42281_18'``.

    Args:
        string: Header text to search.
        suffix: Text appended to the matched id. Defaults to ``'_18'``; pass a
            different value to reuse this function for another sample.

    Returns:
        The matched id followed by `suffix`, or None when `string` contains no
        ``k127_<digits>`` token.
    """
    match = re.search(r'(k127_\d+)', string)
    if match is None:
        return None
    return match.group(1) + suffix

# Overwrite 'ids' in place: each raw header is replaced by what extract_id_18
# returns for it (the short contig id plus suffix, or None when nothing matches).
healthy_18['ids'] = healthy_18['ids'].apply(extract_id_18)



In [107]:
healthy_18

Unnamed: 0,ids,seq
0,k127_42281_18,CTGGAAAACAGGGCATCTAAAACACCAGAATTCTTTGGTTTTTTAA...
1,k127_37751_18,GCAGGGGCCACCCGGACCCAGGGGGACGAAGACAAGACGGCTTCTG...
2,k127_28691_18,GCTTCGCTCAGTGTCCAGGATGGGAACCACTCCCACTTGGGACGTT...
3,k127_4532_18,CTACCAAGGAGCTTATGAAGCTCACGCTGGAGCAGCGAATTGCTCA...
4,k127_27182_18,GTGATGCCCTCGGACAGGATGTTCAGGCACAGCACGGTGATCATGA...
...,...,...
9211,k127_30161_18,TGGGCCAATCGCATCATTAGTTAAATATTGATTACCTAACCCTACC...
9212,k127_30166_18,ATACCATGGATATTTCCTGCTCTTTTGCAGGTCAGGGTCTCTTTGA...
9213,k127_30170_18,GTTACTGTTCACGCATACGTGCCCAGTAACGGAATTTGCCGTGGCT...
9214,k127_30179_18,CGTACCGGCCAACTATGTTACAGCGTGGGTTGCTAAGGCGAGGAAG...


In [108]:
# Sanity check: does any contig sequence occur more than once in healthy_18?
# duplicated() flags each repeat after the first occurrence, so `duplicates`
# holds only the extra copies, not the row they repeat.
duplicates = healthy_18.loc[healthy_18.duplicated(subset='seq')]

# Report the outcome; the offending rows are printed when any exist.
if duplicates.empty:
    print("There are no duplicate seq in the DataFrame.")
else:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)


There are no duplicate seq in the DataFrame.


In [109]:
# Split the frame into two parallel lists in row order:
# healthy_ids[i] is the id that belongs to the sequence healthy_list[i].
# These two names are reassigned for every healthy sample in this notebook,
# so they always hold the most recently extracted one.
healthy_ids = healthy_18['ids'].tolist()
healthy_list = healthy_18['seq'].tolist()


In [110]:
len(healthy_ids)


9216

In [111]:
len(healthy_list)

9216

In [112]:
# Build the CGR input for every Healthy 18 contig and hand it, together with the
# contig ids as labels, to generate_and_save_cgr_images_label.
# NOTE: replace_ambiguous_bases resolves each ambiguity code with random.choice,
# so the result is only reproducible if the `random` module is seeded beforehand.
start_time = time.time()

# Resolve ambiguity codes first, then convert each cleaned sequence with easy_cgr.
all_seq = [replace_ambiguous_bases(seq) for seq in healthy_list]
seq_num = [easy_cgr(sequence) for sequence in all_seq]

sequences = seq_num
class_labels = healthy_ids
output_directory = "healthy_18_res_7"
kmeres = 7

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, which avoids the check-then-create race of a separate exists() test.
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.time()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 1062.195752620697 seconds
