In [None]:
'''
Final project proposal: https://docs.google.com/document/d/1pBP7qgE7IPM5wRwO8H84MsP3jLsQbQSX3jQA_kuoAP8/edit?tab=t.mhfvqsa8ghdn
Shared Data folder: https://drive.google.com/drive/u/0/folders/1a9fFnL2W4J3VtiLXA7opf2jXJLLCL0LY

1)Read in:
- protein_fasta_protein_homolog_model.fasta
- aro_index.tsv

2) Look at the ARO accession of aro_index.tsv and map it to the sequence entry in protein_fasta_protein_homolog_model.fasta

3) Generate k-mers for different k-values (k=3, 5, and 7)

4) Create frequency vector for each sequence based on the k-mer

'''

'\nFinal project propsal: https://docs.google.com/document/d/1pBP7qgE7IPM5wRwO8H84MsP3jLsQbQSX3jQA_kuoAP8/edit?tab=t.mhfvqsa8ghdn \nShared Data folder: https://drive.google.com/drive/u/0/folders/1a9fFnL2W4J3VtiLXA7opf2jXJLLCL0LY\n\n1)Read in: \n- protein_fasta_protein_homolog_model.fasta\n- aro_index.tsv\n\n2) Look at the ARO accession of aro_index.tsv and map it to the sequence entry in protein_fasta_protein_homolog_model.fasta \n\n3) Generate k-mers for different k-values (k=3, 5, and 7)\n\n4) Create frequency vector for each sequence based on the k-mer\n\n'

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.3 MB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/3.3 MB[0m [31m23.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import pandas as pd
from Bio import SeqIO
from google.colab import drive

drive.mount('/content/drive/')

# File paths: both inputs live in the same shared Drive folder, so the folder
# is spelled out once and the two file paths are derived from it.
DATA_DIR = "/content/drive/My Drive/CX4803_project/card-data"
fasta_file = f"{DATA_DIR}/protein_fasta_protein_homolog_model.fasta"
tsv_file = f"{DATA_DIR}/aro_index.tsv"

# Step 1: Read TSV and create mapping from ARO Accession to Drug Class
aro_df = pd.read_csv(tsv_file, sep='\t')
aro_to_drug_class = dict(zip(aro_df["ARO Accession"], aro_df["Drug Class"]))

# Step 2: Parse FASTA and build the dataset
# The ARO accession is the third '|'-delimited field of each FASTA header.
ARO_FIELD_INDEX = 2
records = []
for record in SeqIO.parse(fasta_file, "fasta"):
    header_fields = record.description.split('|')
    if len(header_fields) <= ARO_FIELD_INDEX:
        # Report the offending header instead of failing with a bare IndexError.
        raise ValueError(
            f"FASTA header has no ARO accession field: {record.description!r}"
        )
    # Strip stray whitespace so the accession matches the TSV key exactly.
    aro_id = header_fields[ARO_FIELD_INDEX].strip()

    sequence = str(record.seq)
    # Accessions missing from the index are labelled "Unknown" (not NaN).
    drug_class = aro_to_drug_class.get(aro_id, "Unknown")

    records.append({
        "aro_accession": aro_id,
        "sequence": sequence,
        "drug_class": drug_class
    })

# Step 3: Create DataFrame
df = pd.DataFrame(records)
# Keep only the model input (sequence) and the label (drug_class).
df_new = df.drop('aro_accession', axis=1)
df_new


Unnamed: 0,sequence,drug_class
0,MKAYFIAILTLFTCIATVVRAQQMSELENRIDSLLNGKKATVGIAV...,cephalosporin
1,MRYIRLCIISLLAALPLAVHASPQPLEQIKQSESQLSGRVGMIEMD...,cephalosporin;penicillin beta-lactam
2,MIGLIVARSKNNVIGKNGNIPWKIKGEQKQFRELTTGNVVIMGRKS...,diaminopyrimidine antibiotic
3,MVTKRVQRMMFAAAACIPLLLGSAPLYAQTSAVQQKLAALEKSSGG...,cephalosporin
4,MELPNIMHPVAKLSTALAAALMLSGCMPGEIRPTIGQQMETGDQRF...,carbapenem;cephalosporin;penicillin beta-lactam
...,...,...
6043,MLKERFRQTVFIAAAVMPFIFSSTSLHAQATSDVQQVQKKLAALEK...,cephalosporin;penicillin beta-lactam
6044,MKTVFILISMLFPVAVMAQKSVKISDDISITQLSDKVYTYVSLAEI...,carbapenem
6045,MKTVFILISMLFPVAVMAQKSVKISDDISITQLSDKVYTYVSLAEI...,carbapenem
6046,MKTVFILISMLFPVAVMAQKSVKISDDISITQLSDKVYTYVSLAEI...,carbapenem


In [None]:
# Rows with no usable drug-class label. When the dataset is built, a sequence
# whose ARO accession is absent from the index gets the string "Unknown"
# instead of NaN, so an isna() check alone can never find those rows; flag
# both the NaN labels and the "Unknown" placeholder.
na_rows = df_new['drug_class'].isna() | df_new['drug_class'].eq("Unknown")
df_new[na_rows]


Unnamed: 0,sequence,drug_class


In [None]:
# Step 4: Angela's Encoding Attempt -- K-mer Frequency Representation Encoding (Protein sequences --> numerical vectors)

'''
 Breaking down each sequence into overlapping substrings of length k, known as k-mers. For instance, a sequence like "AGAT" can be decomposed into 2-mers: "AG," "GA," and "AT."
'''
from collections import Counter

def get_kmers(sequence, k):
  '''
  Generates all overlapping k-mers (substrings of length k) of a sequence.

  Args:
    sequence: the string to decompose
    k: k-mer length; must be at least 1

  Returns:
    List of the k-mers in order of their start position. Empty when the
    sequence is shorter than k.

  Raises:
    ValueError: if k is less than 1 (k = 0 would otherwise yield empty
      strings and a negative k would yield arbitrary slices)
  '''
  if k < 1:
    raise ValueError(f"k must be a positive integer, got {k}")
  return [sequence[i:i+k] for i in range(len(sequence) - k + 1)]


def get_kmer_freq_vector(sequence, k):
  '''
  Counts how often each k-mer of length k occurs in the sequence.

  Returns a Counter (dict-like) mapping each k-mer to its number of occurrences.
  '''
  return Counter(get_kmers(sequence, k))

def get_all_kmers(df, k):
  '''
  Collects every distinct k-mer of length k found in the 'sequence' column of df.

  Returns the distinct k-mers as a sorted list.
  '''
  unique_kmers = {
      kmer
      for seq in df['sequence']
      for kmer in get_kmers(seq, k)
  }
  return sorted(unique_kmers)

k = 3  # k-mer length (substring size) used for this encoding run

all_kmers = get_all_kmers(df_new, k) # sorted list of the unique k-mers across all sequences, for k of 3


def kmer_to_vector(sequence, all_kmers, k):
  '''
  Encodes a sequence as a fixed-length list of k-mer counts.

  Position i of the result holds how many times all_kmers[i] occurs in the
  sequence (0 when it does not occur), so the list is as long as all_kmers.
  '''
  counts = get_kmer_freq_vector(sequence, k)
  vector = []
  for kmer in all_kmers:
    vector.append(counts.get(kmer, 0))
  return vector

# Apply k-mer frequency vectorization to the sequences.
# For every sequence, walk the list of all unique k-mers and record how many
# times each one occurs in that sequence (0 when it is absent), giving one
# count vector per row.
kmer_vectors = df_new['sequence'].apply(
    lambda protein_seq: kmer_to_vector(protein_seq, all_kmers, k)
)
df_new['kmer_vector'] = kmer_vectors

df_new.head()

Unnamed: 0,sequence,drug_class,kmer_vector
0,MKAYFIAILTLFTCIATVVRAQQMSELENRIDSLLNGKKATVGIAV...,cephalosporin,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,MRYIRLCIISLLAALPLAVHASPQPLEQIKQSESQLSGRVGMIEMD...,cephalosporin;penicillin beta-lactam,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, ..."
2,MIGLIVARSKNNVIGKNGNIPWKIKGEQKQFRELTTGNVVIMGRKS...,diaminopyrimidine antibiotic,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,MVTKRVQRMMFAAAACIPLLLGSAPLYAQTSAVQQKLAALEKSSGG...,cephalosporin,"[5, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, ..."
4,MELPNIMHPVAKLSTALAAALMLSGCMPGEIRPTIGQQMETGDQRF...,carbapenem;cephalosporin;penicillin beta-lactam,"[1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, ..."
