In [None]:
#DONE

In [1]:
import pandas as pd
import os
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import load_model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc




In [2]:
import random

def replace_ambiguous_bases(sequence):
    """Return ``sequence`` with each IUPAC ambiguity code swapped for a concrete base.

    Every ambiguous character is replaced by one base picked with
    ``random.choice`` from the bases that code stands for. All other
    characters (A/C/G/T, lower-case letters, anything unrecognised) are copied
    through unchanged. The output is not reproducible unless the caller seeds
    the ``random`` module.
    """
    # Ambiguity code -> candidate bases, in the order they are sampled from.
    candidates_by_code = {
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'S': ['G', 'C'],
        'W': ['A', 'T'],
        'K': ['G', 'T'],
        'M': ['A', 'C'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'T', 'C', 'G'],
    }

    def pick_base(symbol):
        options = candidates_by_code.get(symbol)
        if options is None:
            # Not an ambiguity code: keep the character as it is.
            return symbol
        return random.choice(options)

    return ''.join(map(pick_base, sequence))

In [3]:
def reverse_complement(sequence):
    """Return the reverse complement of a DNA string.

    Only the upper-case bases A, T, C and G have a partner defined here; any
    other character raises ``KeyError``.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Walk the sequence back to front, emitting each base's partner.
    return ''.join([pairing[base] for base in reversed(sequence)])

def reverse_complement_list(sequence_list):
    """Return a new list with the reverse complement of each sequence, in input order."""
    return [reverse_complement(seq) for seq in sequence_list]

In [4]:
#This function gets the cgr for the sequences
def easy_cgr(dna_sequence):
  """Compute the chaos-game-representation (CGR) trajectory of a DNA sequence.

  Starting at the origin, each nucleotide moves the current point halfway
  towards that nucleotide's corner of the square [-1, 1] x [-1, 1].

  Returns a list with one (x, y) tuple per nucleotide, in sequence order.
  Characters other than upper-case A, T, C, G raise ``KeyError``.
  """
  corners = {
    "A": (1, 1),
    "T": (-1, -1),
    "C": (1, -1),
    "G": (-1, 1),
  }
  trajectory = []
  x, y = 0, 0
  for nucleotide in dna_sequence:
    corner_x, corner_y = corners[nucleotide]
    # Midpoint between the current position and the nucleotide's corner.
    x, y = (x + corner_x) / 2, (y + corner_y) / 2
    trajectory.append((x, y))
  return trajectory

In [5]:
#This function has input:
#sequence to plot that is the easy_cgr(sequence)
# plot id that is the name
# resolution that is the resolution
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
  """Display a CGR density plot for one sequence.

  sequence_to_plot: sized collection of (x, y) points, e.g. the output of easy_cgr
  id_to_plot: string appended to the plot title
  resolution: number of histogram bins along each axis
  """
  # Bin the points on a resolution x resolution grid spanning [-1, 1] on both axes
  bin_edges = np.linspace(-1, 1, resolution + 1)
  x_values = [point[0] for point in sequence_to_plot]
  y_values = [point[1] for point in sequence_to_plot]
  counts, _, _ = np.histogram2d(x_values, y_values, bins=[bin_edges, bin_edges])

  # Two-colour ramp from white (lowest count) to dark gray, one level per point plus one
  shade_map = LinearSegmentedColormap.from_list(
      "custom_gray", [(1, 1, 1), (0.2, 0.2, 0.2)], N=len(sequence_to_plot) + 1
  )

  plt.figure()
  # histogram2d indexes x along the first axis; transpose so x runs horizontally
  plt.imshow(counts.T, cmap=shade_map, extent=[-1, 1, -1, 1], origin='lower',
             interpolation='nearest', aspect='auto')
  plt.xticks([])
  plt.yticks([])

  # Corner letters sit just outside the square, using the same corner
  # assignment as easy_cgr's nucleotide map
  margin = 0.05
  corner_labels = [
      ("A", 1 + margin, 1 + margin),
      ("T", -1 - margin, -1 - margin),
      ("C", 1 + margin, -1 - margin),
      ("G", -1 - margin, 1 + margin),
  ]
  for letter, x, y in corner_labels:
      plt.text(x, y, letter, ha='center', va='center', fontsize=15, color='black')

  plt.title('CGR for '+id_to_plot)
  plt.grid(False)  # no grid lines over the image
  plt.show()

In [6]:
#This function reads the file when using antibiotic resistance genes
import re

def read_antibiotic_resistance_genes(fasta_file_path):
    """Read a FASTA file whose header lines carry a term in square brackets.

    Returns ``(sequences, terms_inside_brackets)``: two parallel lists holding,
    per record, the concatenated sequence lines and the text of the first
    ``[...]`` group found in the header.

    Raises ``AttributeError`` when a header has no bracketed term, and
    ``IndexError`` when a non-header line appears before the first header.
    """
    bracket_term = re.compile(r'\[([^]]+)\]')
    sequences = []
    terms_inside_brackets = []

    with open(fasta_file_path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not text.startswith(">"):
                # Continuation of the current record's sequence.
                sequences[-1] += text
                continue

            # Header line: drop the '>' and pull out the bracketed term.
            header = text[1:]
            terms_inside_brackets.append(bracket_term.search(header).group(1))
            sequences.append("")

    return sequences, terms_inside_brackets

In [7]:
from Bio import SeqIO

def read_fasta_patients(file_path):
    """Parse a FASTA file with Biopython's SeqIO and return ``(sequences, sequence_ids)``.

    ``sequences`` holds each record's sequence as a plain string and
    ``sequence_ids`` the matching record ``id``, in file order. If the file
    does not exist, a message is printed and two empty lists are returned.
    """
    sequence_ids, sequences = [], []

    try:
        with open(file_path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                sequence_ids.append(record.id)
                sequences.append(str(record.seq))
    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")

    return sequences, sequence_ids


In [8]:
#This function has input:
#sequence to plot that is the easy_cgr(sequence)
# plot id that is the name
# resolution that is the resolution
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def plotting_cgr(sequence_to_plot,id_to_plot,resolution):
  """Show the CGR point density of one sequence as a grayscale image.

  This cell repeats a definition given earlier in the notebook; being the
  later one, it is the definition in effect once all cells have run.

  sequence_to_plot: sized collection of (x, y) points (the easy_cgr output)
  id_to_plot: name used in the plot title
  resolution: number of bins per axis
  """
  num_bins = resolution
  grid = np.linspace(-1, 1, num_bins + 1)

  # Count how many trajectory points fall in each grid cell
  xs = [p[0] for p in sequence_to_plot]
  ys = [p[1] for p in sequence_to_plot]
  density, _, _ = np.histogram2d(xs, ys, bins=[grid, grid])

  # White for the lowest count up to dark gray for the highest
  ramp = [(1, 1, 1), (0.2, 0.2, 0.2)]
  gray_cmap = LinearSegmentedColormap.from_list("custom_gray", ramp, N=len(sequence_to_plot) + 1)

  plt.figure()
  plt.imshow(
      density.T,
      cmap=gray_cmap,
      extent=[-1, 1, -1, 1],
      origin='lower',
      interpolation='nearest',
      aspect='auto',
  )

  # Hide the ticks; the corner letters below identify the axes instead
  plt.xticks([])
  plt.yticks([])
  x_margin, y_margin = 0.05, 0.05
  right, left = 1 + x_margin, -1 - x_margin
  top, bottom = 1 + y_margin, -1 - y_margin
  for letter, x, y in (("A", right, top), ("T", left, bottom), ("C", right, bottom), ("G", left, top)):
      plt.text(x, y, letter, ha='center', va='center', fontsize=15, color='black')

  plt.title('CGR for '+id_to_plot)
  plt.grid(False)
  plt.show()


In [9]:
import random

def replace_ambiguous_bases(sequence):
    """Resolve IUPAC ambiguity codes in ``sequence`` to randomly chosen concrete bases.

    This cell repeats a definition given earlier in the notebook; being the
    later one, it is the definition in effect once all cells have run.

    Characters that are not one of the ambiguity codes below are returned
    unchanged. Choices come from ``random.choice``, so results vary between
    runs unless ``random`` is seeded.
    """
    # Each code maps to the bases it may stand for, in sampling order.
    expansions = {
        'R': 'AG', 'Y': 'CT', 'S': 'GC', 'W': 'AT', 'K': 'GT', 'M': 'AC',
        'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG',
        'N': 'ATCG',
    }

    resolved = []
    for base in sequence:
        if base in expansions:
            resolved.append(random.choice(expansions[base]))
        else:
            resolved.append(base)
    return ''.join(resolved)

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename):
    """Render the CGR point density of one sequence and save it as an image.

    Parameters:
        sequence_to_plot: sized collection of (x, y) CGR points, e.g. the
            output of easy_cgr.
        kmeres: exponent of the grid size; the histogram uses 2 ** kmeres
            bins per axis.
        output_filename: destination path passed to matplotlib's savefig.

    The figure is always closed, even when saving fails, so calling this in a
    long loop does not accumulate open figures.
    """
    num_bins = 2 ** kmeres

    # Same bin edges on both axes, covering the CGR square [-1, 1] x [-1, 1]
    edges = np.linspace(-1, 1, num_bins + 1)
    hist, _, _ = np.histogram2d(
        [p[0] for p in sequence_to_plot],
        [p[1] for p in sequence_to_plot],
        bins=[edges, edges]
    )

    # Colormap running from white to dark gray
    colors = [(1, 1, 1), (0.2, 0.2, 0.2)]
    custom_cmap = LinearSegmentedColormap.from_list("custom_gray", colors, N=len(sequence_to_plot) + 1)

    # 84 / dpi inches per side at dpi=100 gives an 84-pixel-wide canvas.
    # NOTE(review): bbox_inches='tight' may crop the canvas, so the saved
    # image is not guaranteed to be exactly 84x84 - confirm the size expected downstream.
    dpi = 100
    fig, ax = plt.subplots(figsize=(84 / dpi, 84 / dpi), dpi=dpi)
    try:
        ax.imshow(
            hist.T,  # transpose so x runs horizontally
            cmap=custom_cmap,
            origin='lower',
            extent=[-1, 1, -1, 1],  # data coordinates of the image, not its pixel size
            interpolation='nearest'
        )
        ax.axis('off')

        # Save this specific figure rather than whatever pyplot considers current
        fig.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    finally:
        plt.close(fig)

def generate_and_save_cgr_images(sequences, ids, output_dir, kmeres):
    """Save one CGR image per sequence into ``output_dir``.

    Parameters:
        sequences: iterable of CGR point collections (one per sequence).
        ids: iterable of labels paired positionally with ``sequences``.
        output_dir: directory for the images; created if it does not exist.
        kmeres: forwarded to generate_cgr_and_save (grid is 2 ** kmeres bins).

    Files are named ``"{index}_{label}.png"``, so repeated labels do not
    overwrite each other.
    """
    # Create the output directory if it doesn't exist. An empty string means
    # "current directory" for os.path.join, and os.makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i, (sequence, label) in enumerate(zip(sequences, ids)):
        # Generate the image filename based on the sequence index and label
        image_filename = os.path.join(output_dir, f"{i}_{label}.png")

        # Generate and save the CGR image
        generate_cgr_and_save(sequence, kmeres, image_filename)

In [11]:

def read_fasta_ARG(file_name):
    """Read a FASTA file into a list of records.

    Each record is a list: the header text (without the leading '>') split on
    '|', followed by the concatenated sequence as the last element. A record
    whose header text is empty is dropped, as are any lines that precede the
    first header.
    """
    records = []
    current_id = ''
    parts = []

    with open(file_name, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not text.startswith('>'):
                parts.append(text)
                continue

            # New header: emit the record collected so far, then start afresh.
            if current_id != '':
                records.append(current_id.split('|') + [''.join(parts)])
            current_id = text[1:]
            parts = []

    # The final record has no following header to trigger its emission.
    if current_id != '':
        records.append(current_id.split('|') + [''.join(parts)])

    return records

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

def generate_cgr_and_save(sequence_to_plot, kmeres, output_filename):
    """Render the CGR point density of one sequence and save it as an image.

    This cell re-defines a function of the same name given earlier in the
    notebook (that one uses an 84-pixel canvas); being the later one, this is
    the definition in effect once all cells have run.

    Parameters:
        sequence_to_plot: sized collection of (x, y) CGR points, e.g. the
            output of easy_cgr.
        kmeres: exponent of the grid size; the histogram uses 2 ** kmeres
            bins per axis.
        output_filename: destination path passed to matplotlib's savefig.

    The figure is always closed, even when saving fails, so calling this in a
    long loop does not accumulate open figures.
    """
    num_bins = 2 ** kmeres

    # Same bin edges on both axes, covering the CGR square [-1, 1] x [-1, 1]
    edges = np.linspace(-1, 1, num_bins + 1)
    hist, _, _ = np.histogram2d(
        [p[0] for p in sequence_to_plot],
        [p[1] for p in sequence_to_plot],
        bins=[edges, edges]
    )

    # Colormap running from white to dark gray
    colors = [(1, 1, 1), (0.2, 0.2, 0.2)]
    custom_cmap = LinearSegmentedColormap.from_list("custom_gray", colors, N=len(sequence_to_plot) + 1)

    # 100 / dpi inches per side at dpi=100 gives a 100-pixel-wide canvas.
    # NOTE(review): bbox_inches='tight' may crop the canvas, so the saved
    # image is not guaranteed to be exactly 100x100 - confirm the size expected downstream.
    dpi = 100
    fig, ax = plt.subplots(figsize=(100 / dpi, 100 / dpi), dpi=dpi)
    try:
        ax.imshow(
            hist.T,  # transpose so x runs horizontally
            cmap=custom_cmap,
            origin='lower',
            extent=[-1, 1, -1, 1],  # data coordinates of the image, not its pixel size
            interpolation='nearest'
        )
        ax.axis('off')

        # Save this specific figure rather than whatever pyplot considers current
        fig.savefig(output_filename, bbox_inches='tight', pad_inches=0, dpi=dpi)
    finally:
        plt.close(fig)

def generate_and_save_cgr_images_label(sequences, class_labels, output_dir, kmeres):
    """Save one CGR image per sequence, naming each file after its label.

    ``sequences`` and ``class_labels`` are paired positionally; each image is
    written to ``output_dir/{label}.png``, so a label that occurs more than
    once overwrites the earlier image. ``output_dir`` is created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for points, label in zip(sequences, class_labels):
        target_path = os.path.join(output_dir, f"{label}.png")
        generate_cgr_and_save(points, kmeres, target_path)

###  Patient 14

In [13]:
import pandas as pd

# Load all FASTA records from the Patient 14 contigs file via read_fasta_ARG
file_name = "Patient14.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [14]:
# Convert to DataFrame.
# read_fasta_ARG splits each header on '|', so the two columns below only fit
# when no header contains a '|' (each record is then [header, sequence]).
patient_14 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [15]:
patient_14

Unnamed: 0,ids,seq
0,k127_20391 flag=0 multi=4.0000 len=2562,GCAGAAGCAGTGAAGACCGCTGGGTATGACAGCCTGGATGCTGCAA...
1,k127_81555 flag=1 multi=4.0000 len=3470,ATCTAGGGCCGATCCAGGGGCATAATATCCCGACGATGGTACGTGC...
2,k127_61173 flag=1 multi=4.0000 len=2617,ATCTTTCAGTCCCAGACGATAAAAAAATTCTTCTGTATCTGCTCCA...
3,k127_20396 flag=0 multi=9.9554 len=4163,AAATTTTATTTATGTGGGTATCGATTTACACAAAGAAACCCACACA...
4,k127_5 flag=0 multi=12.9834 len=7343,TTTTCATGAAAAATAACCTCTTTACAAATATAAGCTTGACATAAGT...
...,...,...
16322,k127_162953 flag=3 multi=9.0027 len=3419,AGCCTTCATGCTAAAATCACCGTACACCAAACAAACTATCTAATGA...
16323,k127_162955 flag=3 multi=25.0050 len=5148,CAGGTTTACAGCTTAGAGATATTTGTAGAGGTGTTATTAGCAAAGA...
16324,k127_162959 flag=3 multi=22.0074 len=3103,GGCTTGCAGGCTAAAATGAACAGTACCGCTGCGCGCGAACGATACA...
16325,k127_162960 flag=3 multi=322.0543 len=6060,CGCTTTTCGTAAGGGCTGAAATCGTATTTTGCGCATGTCCATATGT...


In [16]:
# Define a function to extract the desired pattern
def extract_id_14(string):
    """Return the first ``k127_<digits>`` id found in ``string`` with ``_14`` appended, or None if absent."""
    found = re.search(r'(k127_\d+)', string)
    return found.group(1) + '_14' if found else None

# Apply the function to the 'ids' column: each full FASTA header is replaced by
# its short contig id; headers without a k127_<digits> match become None
patient_14['ids'] = patient_14['ids'].apply(extract_id_14)



In [17]:
patient_14

Unnamed: 0,ids,seq
0,k127_20391_14,GCAGAAGCAGTGAAGACCGCTGGGTATGACAGCCTGGATGCTGCAA...
1,k127_81555_14,ATCTAGGGCCGATCCAGGGGCATAATATCCCGACGATGGTACGTGC...
2,k127_61173_14,ATCTTTCAGTCCCAGACGATAAAAAAATTCTTCTGTATCTGCTCCA...
3,k127_20396_14,AAATTTTATTTATGTGGGTATCGATTTACACAAAGAAACCCACACA...
4,k127_5_14,TTTTCATGAAAAATAACCTCTTTACAAATATAAGCTTGACATAAGT...
...,...,...
16322,k127_162953_14,AGCCTTCATGCTAAAATCACCGTACACCAAACAAACTATCTAATGA...
16323,k127_162955_14,CAGGTTTACAGCTTAGAGATATTTGTAGAGGTGTTATTAGCAAAGA...
16324,k127_162959_14,GGCTTGCAGGCTAAAATGAACAGTACCGCTGCGCGCGAACGATACA...
16325,k127_162960_14,CGCTTTTCGTAAGGGCTGAAATCGTATTTTGCGCATGTCCATATGT...


In [18]:
# Sanity check on the patient_14 table: look for contigs whose sequence text
# repeats an earlier row.

# duplicated('seq') flags every occurrence after the first, so `duplicates`
# holds only the repeated rows, not the first copy
duplicates = patient_14[patient_14.duplicated('seq')]

# If there are repeated sequences, the duplicates DataFrame contains those rows
# If duplicates is empty, every sequence in the table is unique
if not duplicates.empty:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)
else:
    print("There are no duplicate seq in the DataFrame.")


There are no duplicate seq in the DataFrame.


In [19]:
# Parallel lists taken from the same table: patient_ids[i] labels patient_list[i]
patient_ids = patient_14['ids'].tolist()
patient_list = patient_14['seq'].tolist()


In [20]:
len(patient_ids)


16327

In [21]:
len(patient_list)

16327

In [22]:
import time
# perf_counter is a monotonic clock meant for measuring elapsed intervals
start_time = time.perf_counter()

# Resolve ambiguity codes first: easy_cgr only has map entries for A/T/C/G
all_seq = [replace_ambiguous_bases(seq) for seq in patient_list]

# One CGR point trajectory per contig, in the same order as patient_ids.
# NOTE(review): every trajectory is held in memory at once before plotting;
# consider streaming them if memory becomes a problem.
seq_num = [easy_cgr(sequence) for sequence in all_seq]
sequences = seq_num
class_labels = patient_ids
output_directory = "patient_14_res_7"
kmeres = 7

# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 2094.450616836548 seconds


###  Patient 15

In [23]:
import pandas as pd

# Load all FASTA records from the Patient 15 contigs file via read_fasta_ARG
file_name = "Patient15.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [24]:
# Convert to DataFrame.
# read_fasta_ARG splits each header on '|', so the two columns below only fit
# when no header contains a '|' (each record is then [header, sequence]).
patient_15 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [25]:
patient_15

Unnamed: 0,ids,seq
0,k127_18421 flag=1 multi=5.0000 len=2775,ATTTTCATTCTTGAAAGGGGGGTTCGATTCCCCCACGGGCTACAAT...
1,k127_128939 flag=1 multi=4.0000 len=2558,TGGCGGACCTGCGCCGCTGCCTGGAGGACGCCAGGGCCTCCGGCTG...
2,k127_36846 flag=1 multi=5.0000 len=2670,GTAAACTTCTTCCTTATCTGGGTTTTTATGGCGATGAGGGGGTAAA...
3,k127_55260 flag=1 multi=7.0000 len=5387,AGTCGTATTGACCTGAGCGCAGGTGGGCGGCTCGCTGTTGTCGGCA...
4,k127_55263 flag=1 multi=16.0000 len=3916,GCCCACCCCCACCCCGCGTCCCTCGCTCAGCTTTACCTACGGCGAT...
...,...,...
13482,k127_55227 flag=1 multi=6.0000 len=4027,ATATATTGACACACCCCTGGGTTTCCTCGTATAGTTTAGGGGCGAA...
13483,k127_55237 flag=1 multi=10.0000 len=20113,AAACTGTATCCTGACGCCGCATGTCGCGGGCGTGAGCTTCGGCCAC...
13484,k127_55240 flag=1 multi=7.0000 len=4970,GATGAGCCGGATACCGTTCCCATCAGGCCGGAGCTGACTACCGCGA...
13485,k127_55255 flag=1 multi=9.0000 len=47451,ATCCAAAGACTCTGGGAAATCTACCGTTTTCAAGCATAGTCATATT...


In [26]:
# Define a function to extract the desired pattern
def extract_id_15(string):
    """Return the first ``k127_<digits>`` id found in ``string`` with ``_15`` appended, or None if absent."""
    found = re.search(r'(k127_\d+)', string)
    if not found:
        return None
    return found.group(1) + '_15'

# Apply the function to the 'ids' column: each full FASTA header is replaced by
# its short contig id; headers without a k127_<digits> match become None
patient_15['ids'] = patient_15['ids'].apply(extract_id_15)



In [27]:
patient_15

Unnamed: 0,ids,seq
0,k127_18421_15,ATTTTCATTCTTGAAAGGGGGGTTCGATTCCCCCACGGGCTACAAT...
1,k127_128939_15,TGGCGGACCTGCGCCGCTGCCTGGAGGACGCCAGGGCCTCCGGCTG...
2,k127_36846_15,GTAAACTTCTTCCTTATCTGGGTTTTTATGGCGATGAGGGGGTAAA...
3,k127_55260_15,AGTCGTATTGACCTGAGCGCAGGTGGGCGGCTCGCTGTTGTCGGCA...
4,k127_55263_15,GCCCACCCCCACCCCGCGTCCCTCGCTCAGCTTTACCTACGGCGAT...
...,...,...
13482,k127_55227_15,ATATATTGACACACCCCTGGGTTTCCTCGTATAGTTTAGGGGCGAA...
13483,k127_55237_15,AAACTGTATCCTGACGCCGCATGTCGCGGGCGTGAGCTTCGGCCAC...
13484,k127_55240_15,GATGAGCCGGATACCGTTCCCATCAGGCCGGAGCTGACTACCGCGA...
13485,k127_55255_15,ATCCAAAGACTCTGGGAAATCTACCGTTTTCAAGCATAGTCATATT...


In [28]:
# Sanity check on the patient_15 table: look for contigs whose sequence text
# repeats an earlier row.

# duplicated('seq') flags every occurrence after the first, so `duplicates`
# holds only the repeated rows, not the first copy
duplicates = patient_15[patient_15.duplicated('seq')]

# If there are repeated sequences, the duplicates DataFrame contains those rows
# If duplicates is empty, every sequence in the table is unique
if not duplicates.empty:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)
else:
    print("There are no duplicate seq in the DataFrame.")


There are no duplicate seq in the DataFrame.


In [29]:
# Parallel lists taken from the same table: patient_ids[i] labels patient_list[i]
patient_ids = patient_15['ids'].tolist()
patient_list = patient_15['seq'].tolist()


In [30]:
len(patient_ids)


13487

In [31]:
len(patient_list)

13487

In [32]:
import time
# perf_counter is a monotonic clock meant for measuring elapsed intervals
start_time = time.perf_counter()

# Resolve ambiguity codes first: easy_cgr only has map entries for A/T/C/G
all_seq = [replace_ambiguous_bases(seq) for seq in patient_list]

# One CGR point trajectory per contig, in the same order as patient_ids.
# NOTE(review): every trajectory is held in memory at once before plotting;
# consider streaming them if memory becomes a problem.
seq_num = [easy_cgr(sequence) for sequence in all_seq]
sequences = seq_num
class_labels = patient_ids
output_directory = "patient_15_res_7"
kmeres = 7

# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 2072.1546075344086 seconds


###  Healthy 19

In [33]:
import pandas as pd

# Load all FASTA records from the Healthy 19 contigs file via read_fasta_ARG
file_name = "Healthy19.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [34]:
# Convert to DataFrame.
# read_fasta_ARG splits each header on '|', so the two columns below only fit
# when no header contains a '|' (each record is then [header, sequence]).
healthy_19 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [35]:
healthy_19

Unnamed: 0,ids,seq
0,k127_82980 flag=1 multi=5.0000 len=2774,TGGAAGTGCAGACGGGCTTCTTGCTTTCTGTAAAGGAATTCAGGCT...
1,k127_128619 flag=0 multi=18.1557 len=3351,GGGGAAGCAGCGCCGTAGGCGCTGCACATTGATCCCTGCACCGTGC...
2,k127_53939 flag=1 multi=5.0000 len=3742,ACCGCCCCTGTTTCCTCCGGTTCTTCCTCCAGTCGTTTCCGCGTCC...
3,k127_120322 flag=1 multi=7.0000 len=4239,GGCATGAGGACAGAAAAACGGCGGTATTTTCAAGTTACGGTGAATA...
4,k127_103725 flag=1 multi=10.0000 len=5480,TTAATATATGGAATATATGGAAGTTAAAAAATTGGTAAGTTTTTTG...
...,...,...
14455,k127_165823 flag=3 multi=91.0169 len=5525,ATTCGAGGATTGGACATGGTAGAGCAATATGAAATATATTGGGTGG...
14456,k127_165825 flag=3 multi=17.9814 len=8846,CTTCGAGCTGCACCGTCCATGCAGAAAGCCCCTGATCCACTCTGGT...
14457,k127_165827 flag=3 multi=21.0033 len=6436,GACGAGTCGCAGACCTATAAAATGTGTTGGAAACATGTTAAAAAGT...
14458,k127_165837 flag=3 multi=29.0046 len=6496,CTGAAGATCCCGGCTCCATTCGGCCTTCATCTTCTCAAGCCGTTCA...


In [36]:
# Define a function to extract the desired pattern
def extract_id_19(string):
    """Return the first ``k127_<digits>`` id found in ``string`` with ``_19`` appended, or None if absent."""
    found = re.search(r'(k127_\d+)', string)
    return found.group(1) + '_19' if found else None

# Apply the function to the 'ids' column: each full FASTA header is replaced by
# its short contig id; headers without a k127_<digits> match become None
healthy_19['ids'] = healthy_19['ids'].apply(extract_id_19)



In [37]:
healthy_19

Unnamed: 0,ids,seq
0,k127_82980_19,TGGAAGTGCAGACGGGCTTCTTGCTTTCTGTAAAGGAATTCAGGCT...
1,k127_128619_19,GGGGAAGCAGCGCCGTAGGCGCTGCACATTGATCCCTGCACCGTGC...
2,k127_53939_19,ACCGCCCCTGTTTCCTCCGGTTCTTCCTCCAGTCGTTTCCGCGTCC...
3,k127_120322_19,GGCATGAGGACAGAAAAACGGCGGTATTTTCAAGTTACGGTGAATA...
4,k127_103725_19,TTAATATATGGAATATATGGAAGTTAAAAAATTGGTAAGTTTTTTG...
...,...,...
14455,k127_165823_19,ATTCGAGGATTGGACATGGTAGAGCAATATGAAATATATTGGGTGG...
14456,k127_165825_19,CTTCGAGCTGCACCGTCCATGCAGAAAGCCCCTGATCCACTCTGGT...
14457,k127_165827_19,GACGAGTCGCAGACCTATAAAATGTGTTGGAAACATGTTAAAAAGT...
14458,k127_165837_19,CTGAAGATCCCGGCTCCATTCGGCCTTCATCTTCTCAAGCCGTTCA...


In [38]:
# Sanity check on the healthy_19 table: look for contigs whose sequence text
# repeats an earlier row.

# duplicated('seq') flags every occurrence after the first, so `duplicates`
# holds only the repeated rows, not the first copy
duplicates = healthy_19[healthy_19.duplicated('seq')]

# If there are repeated sequences, the duplicates DataFrame contains those rows
# If duplicates is empty, every sequence in the table is unique
if not duplicates.empty:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)
else:
    print("There are no duplicate seq in the DataFrame.")


There are no duplicate seq in the DataFrame.


In [39]:
# Parallel lists taken from the same table: healthy_ids[i] labels healthy_list[i]
healthy_ids = healthy_19['ids'].tolist()
healthy_list = healthy_19['seq'].tolist()


In [40]:
len(healthy_ids)


14460

In [41]:
len(healthy_list)

14460

In [42]:
import time
# perf_counter is a monotonic clock meant for measuring elapsed intervals
start_time = time.perf_counter()

# Resolve ambiguity codes first: easy_cgr only has map entries for A/T/C/G
all_seq = [replace_ambiguous_bases(seq) for seq in healthy_list]

# One CGR point trajectory per contig, in the same order as healthy_ids.
# NOTE(review): every trajectory is held in memory at once before plotting;
# consider streaming them if memory becomes a problem.
seq_num = [easy_cgr(sequence) for sequence in all_seq]
sequences = seq_num
class_labels = healthy_ids
output_directory = "healthy_19_res_7"
kmeres = 7

# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 2140.036254644394 seconds


###  Healthy 20

In [43]:
import pandas as pd

# Load all FASTA records from the Healthy 20 contigs file via read_fasta_ARG
file_name = "Healthy20.contigs.fa"
fasta_sequences = read_fasta_ARG(file_name)

In [44]:
# Convert to DataFrame.
# read_fasta_ARG splits each header on '|', so the two columns below only fit
# when no header contains a '|' (each record is then [header, sequence]).
healthy_20 = pd.DataFrame(fasta_sequences, columns=[ 'ids', 'seq'])


In [45]:
healthy_20

Unnamed: 0,ids,seq
0,k127_30519 flag=1 multi=3.3628 len=2875,ATATATATGTATGCCCTATGCACTACACAGTATCATTAGAAAGACA...
1,k127_54251 flag=0 multi=8.0000 len=3052,AGATGCTTACAAGGAATTAAATCCGGCAGATGTAGCTTACTATGAT...
2,k127_91540 flag=1 multi=6.0000 len=2527,CCTTATTGTTTAACGATGAAAAGAATATTGACAATCGTATAGGAGA...
3,k127_98319 flag=1 multi=8.0000 len=2713,GTCCCACACCGGGGTCAGGCTAGTCTGCCGCTGTTCCCCGCTGCGG...
4,k127_10173 flag=0 multi=5.0757 len=2689,GGATATGATGGAGCTGGCCCAGGAGTACCGCACCAAGCTGCTGGAC...
...,...,...
14888,k127_30435 flag=1 multi=7.7513 len=3814,TCATGCAGTTTGGGCCTTCCTTACTGCATCTTCGGTCAATCATATC...
14889,k127_30436 flag=1 multi=6.0000 len=4065,CTTATCATTCGGCTGTACTCCGACTTGTTCTATATAGCTGTGCGGA...
14890,k127_30446 flag=1 multi=6.0000 len=6206,CCGATAAAACCGGACAATGTGCGTTACAGAGCAAAACGACTGTTAC...
14891,k127_30447 flag=1 multi=8.0000 len=4350,ATATATAGACATTTACTCTCTGGGAATTTGGAGCTTAGTTCTAATT...


In [46]:
# Define a function to extract the desired pattern
def extract_id_20(string):
    """Return the first ``k127_<digits>`` id found in ``string`` with ``_20`` appended, or None if absent."""
    found = re.search(r'(k127_\d+)', string)
    if not found:
        return None
    return found.group(1) + '_20'

# Apply the function to the 'ids' column: each full FASTA header is replaced by
# its short contig id; headers without a k127_<digits> match become None
healthy_20['ids'] = healthy_20['ids'].apply(extract_id_20)



In [47]:
healthy_20

Unnamed: 0,ids,seq
0,k127_30519_20,ATATATATGTATGCCCTATGCACTACACAGTATCATTAGAAAGACA...
1,k127_54251_20,AGATGCTTACAAGGAATTAAATCCGGCAGATGTAGCTTACTATGAT...
2,k127_91540_20,CCTTATTGTTTAACGATGAAAAGAATATTGACAATCGTATAGGAGA...
3,k127_98319_20,GTCCCACACCGGGGTCAGGCTAGTCTGCCGCTGTTCCCCGCTGCGG...
4,k127_10173_20,GGATATGATGGAGCTGGCCCAGGAGTACCGCACCAAGCTGCTGGAC...
...,...,...
14888,k127_30435_20,TCATGCAGTTTGGGCCTTCCTTACTGCATCTTCGGTCAATCATATC...
14889,k127_30436_20,CTTATCATTCGGCTGTACTCCGACTTGTTCTATATAGCTGTGCGGA...
14890,k127_30446_20,CCGATAAAACCGGACAATGTGCGTTACAGAGCAAAACGACTGTTAC...
14891,k127_30447_20,ATATATAGACATTTACTCTCTGGGAATTTGGAGCTTAGTTCTAATT...


In [48]:
# Sanity check on the healthy_20 table: look for contigs whose sequence text
# repeats an earlier row.

# duplicated('seq') flags every occurrence after the first, so `duplicates`
# holds only the repeated rows, not the first copy
duplicates = healthy_20[healthy_20.duplicated('seq')]

# If there are repeated sequences, the duplicates DataFrame contains those rows
# If duplicates is empty, every sequence in the table is unique
if not duplicates.empty:
    print("There are duplicate seq in the DataFrame:")
    print(duplicates)
else:
    print("There are no duplicate seq in the DataFrame.")


There are no duplicate seq in the DataFrame.


In [49]:
# Parallel lists taken from the same table: healthy_ids[i] labels healthy_list[i]
healthy_ids = healthy_20['ids'].tolist()
healthy_list = healthy_20['seq'].tolist()


In [50]:
len(healthy_ids)


14893

In [51]:
len(healthy_list)

14893

In [52]:
## import time
# perf_counter is a monotonic clock meant for measuring elapsed intervals
start_time = time.perf_counter()

# Resolve ambiguity codes first: easy_cgr only has map entries for A/T/C/G
all_seq = [replace_ambiguous_bases(seq) for seq in healthy_list]

# One CGR point trajectory per contig, in the same order as healthy_ids.
# NOTE(review): every trajectory is held in memory at once before plotting;
# consider streaming them if memory becomes a problem.
seq_num = [easy_cgr(sequence) for sequence in all_seq]
sequences = seq_num
class_labels = healthy_ids
output_directory = "healthy_20_res_7"
kmeres = 7

# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs(output_directory, exist_ok=True)

# Call the function to generate and save CGR images
generate_and_save_cgr_images_label(sequences, class_labels, output_directory, kmeres)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Print the time taken
print(f"Time to generate images: {elapsed_time} seconds")

Time to generate images: 3299.8517866134644 seconds
