In [74]:
from Bio import SeqIO
import os
import random

def read_fasta(directory='.'):
    # Get a list of all .fas files in the specified directory
    fas_files = [f for f in os.listdir(directory) if f.endswith('.fas')]
    # Choose a random .fas file from the list
    file = random.choice(fas_files)
    print(f"File: {file}")

    # Read the MSA file
    records = SeqIO.parse(os.path.join(directory, file), "fasta")

    # Dictionary to store sequences by family
    sequences_by_family = {}

    # Process each sequence record
    for record in records:
        # Check if the record ID contains an underscore
        if '_' in record.id:
            # Extract family information from the record ID
            family = record.id.split("_")[1]

            # Add the sequence to the corresponding family in the dictionary
            if family in sequences_by_family:
                sequences_by_family[family].append(record.seq)
            else:
                sequences_by_family[family] = [record.seq]
        else:
            print(f"Warning: Skipping record with ID '{record.id}' because it does not contain an underscore")

    # Print summary statistics
    total_sequences = sum(len(sequences) for sequences in sequences_by_family.values())
    print(f"Total sequences: {total_sequences}")
    
        ## can be un commented to print the famlies    
#     print(f"Total unique families: {len(sequences_by_family)}")
#     for family, sequences in sequences_by_family.items():
#         print(f"Family {family}: {len(sequences)} sequences")

      #  # can be un commented to print the famlies
 #   # Print the sequences by family
#     for family, sequences in sequences_by_family.items():
#         print(f"\nFamily: {family}")
#         for sequence in sequences:
#             print(sequence)
    #print (sequences_by_family)
    return sequences_by_family



In [75]:
sequences = read_fasta()

File: ISCA_ISCR_20_id90.fas
Total sequences: 289


In [76]:
print (sequences)

{'F5JEQ1': [Seq('--MTMTGAAASRVKAIIENSGPAKGVRVGIKKGGCAGMEYTIDLVTEADAKDDL...---')], 'C4L7K0': [Seq('MAITLSDSAAERVKRFLTNRGKGVGLRLGVKTSGCSGMAYVLEFVDEVHEEDQV...---')], 'K9XJG9': [Seq('-MVRLSTSAAKEIQRLQAKQHNN-VFRLKVQAGGCSSLLYNMEFDTSLTPSDRV...---')], 'C3X6S5': [Seq('--INFTDNAVKKVAELIAEEGNNLKLRVFVQGGGCAGFQYGFTFDENINEDDTT...---')], 'E1D441': [Seq('MAITMTETAASRVKAFLDNRGKGIGLRLGVKTTGCSGMAYVLEFVDELNEEDEV...VR-')], 'F9ZGT7': [Seq('--ITLTENAAKYIQQQLAKRGKGLALRVGIKKSGCSGFSYTFDYADEINPDDEL...AY-')], 'Q3SJN3': [Seq('MAVTLSERAAQHVTNFLAKRGKGVGIRLGVRTSGCSGMAYKLEFADETPEGDEV...---')], 'F4QQC9': [Seq('--VTLTDAAADQVKAIMARADNYAGLRVGVKQGGCAGQEYVLSYADAIGPLDEV...---')], 'Q603C2': [Seq('--VTVTENAAAQIARQLRRRGHGLGLRLGVRQAGCSGYSYVVDYADEIAADDAV...---')], 'F7P0R8': [Seq('MAISMTPAAADRVRSFLANRGKGLGLRVGVKTTGCSGLAYVLEFVDELNDDDQV...---')], 'A4U0G8': [Seq('--ITITESAAQRVRAMLDKRGKPSGIRIGVRSKGCSGMQYTLEYADEKSPFDEV...---')], 'Q65RT0': [Seq('ASIGMTESAAKHVKKCLESRGKGIGLRLGIKTSGCSGLAYVLEFVDELNSDDNV...---')], 'D5CLN6': [Seq('MTISLTENAAK

In [77]:
# Example usage
read_fasta()  # Read from current directory
#read_fasta('/path/to/dir')  # Read from specified directory

File: ISCA_ISCR_20_id90.fas
Total sequences: 289


{'F5JEQ1': [Seq('--MTMTGAAASRVKAIIENSGPAKGVRVGIKKGGCAGMEYTIDLVTEADAKDDL...---')],
 'C4L7K0': [Seq('MAITLSDSAAERVKRFLTNRGKGVGLRLGVKTSGCSGMAYVLEFVDEVHEEDQV...---')],
 'K9XJG9': [Seq('-MVRLSTSAAKEIQRLQAKQHNN-VFRLKVQAGGCSSLLYNMEFDTSLTPSDRV...---')],
 'C3X6S5': [Seq('--INFTDNAVKKVAELIAEEGNNLKLRVFVQGGGCAGFQYGFTFDENINEDDTT...---')],
 'E1D441': [Seq('MAITMTETAASRVKAFLDNRGKGIGLRLGVKTTGCSGMAYVLEFVDELNEEDEV...VR-')],
 'F9ZGT7': [Seq('--ITLTENAAKYIQQQLAKRGKGLALRVGIKKSGCSGFSYTFDYADEINPDDEL...AY-')],
 'Q3SJN3': [Seq('MAVTLSERAAQHVTNFLAKRGKGVGIRLGVRTSGCSGMAYKLEFADETPEGDEV...---')],
 'F4QQC9': [Seq('--VTLTDAAADQVKAIMARADNYAGLRVGVKQGGCAGQEYVLSYADAIGPLDEV...---')],
 'Q603C2': [Seq('--VTVTENAAAQIARQLRRRGHGLGLRLGVRQAGCSGYSYVVDYADEIAADDAV...---')],
 'F7P0R8': [Seq('MAISMTPAAADRVRSFLANRGKGLGLRVGVKTTGCSGLAYVLEFVDELNDDDQV...---')],
 'A4U0G8': [Seq('--ITITESAAQRVRAMLDKRGKPSGIRIGVRSKGCSGMQYTLEYADEKSPFDEV...---')],
 'Q65RT0': [Seq('ASIGMTESAAKHVKKCLESRGKGIGLRLGIKTSGCSGLAYVLEFVDELNSDDNV...---')],
 'D5CLN6': [Seq(

In [78]:

def read_sequence(sequences):
    # Flatten the list of sequences
    all_sequences = [seq for seq_list in sequences.values() for seq in seq_list]
    # Choose a random sequence from the list
    return random.choice(all_sequences)

In [79]:
seq = read_sequence(sequences)
print(seq)

MALTLSERAAQHVGAFLAKRGKGLGVRLGVKTSGCSGMAYKLEFVDVESSDDVRFESYGVAIYTDAKSLAYIDGTELDYVKEGLNEGFRFNNPNVKNECGCGESFNVMRLTTKGRFAVTAMLDLAMREEGGPVTLAGISERQSISLSYLEQLFGKLRRAELVDSVRGPGGGYTLAKQICDISVADIIVAVDEPVDATQCGGRENCRGSQRCMTHDLWTNLNVTIFDYLSKVSLASLVEEQ-----------------------------


In [80]:
import numpy as np

def one_hot_encoding(seq):
    # Define the amino acid alphabet
    aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    # Create a dictionary to map each amino acid to its index in the alphabet
    aa_to_index = {aa: i for i, aa in enumerate(aa_alphabet)}
    # Initialize the one-hot encoded array
    one_hot = np.zeros((len(seq), len(aa_alphabet)))
    # Set the appropriate elements to 1
    for i, aa in enumerate(seq):
        if aa in aa_to_index:
            one_hot[i, aa_to_index[aa]] = 1
    return one_hot




In [81]:

one_hot = one_hot_encoding(seq)
print(one_hot)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [82]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import numpy as np

def one_hot_encoding(seq):
    # Define the amino acid alphabet
    aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
    # Create a dictionary to map each amino acid to its index in the alphabet
    aa_to_index = {aa: i for i, aa in enumerate(aa_alphabet)}
    # Initialize the one-hot encoded array
    one_hot = np.zeros((len(seq), len(aa_alphabet)))
    # Set the appropriate elements to 1
    for i, aa in enumerate(seq):
        if aa in aa_to_index:
            one_hot[i, aa_to_index[aa]] = 1
    return one_hot

# Define the size of the latent space
latent_dim = 32

# Define the autoencoder architecture
input_seq = Input(shape=(None, len(aa_alphabet)))
encoded = Dense(latent_dim, activation='relu')(input_seq)
decoded = Dense(len(aa_alphabet), activation='softmax')(encoded)
autoencoder = Model(input_seq, decoded)

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='categorical_crossentropy')




In [83]:
# Example sequence
seq = seq


In [84]:
# Encode the sequence using one-hot encoding
seq_encoded = one_hot_encoding(seq).reshape(1, -1, len(aa_alphabet))



In [85]:
# Train the autoencoder on the encoded sequence
autoencoder.fit(seq_encoded, seq_encoded, epochs=100)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7ff0d0084dc0>

In [86]:
# Use the autoencoder to encode and decode the sequence
seq_decoded = autoencoder.predict(seq_encoded)



In [87]:
# Compute reconstruction error metrics
cross_entropy = -np.mean(np.sum(seq_encoded * np.log(seq_decoded), axis=-1))
same_aa_accuracy = np.mean(np.argmax(seq_encoded, axis=-1) == np.argmax(seq_decoded, axis=-1))
most_common_aa_accuracy = np.mean(np.argmax(seq_encoded.sum(axis=0), axis=-1) == np.argmax(seq_decoded.sum(axis=0), axis=-1))



In [88]:
print(f"Cross-entropy: {cross_entropy}")
print(f"Same amino acid accuracy: {same_aa_accuracy}")
print(f"Most common amino acid accuracy: {most_common_aa_accuracy}")

Cross-entropy: 2.325452879015841
Same amino acid accuracy: 0.7992565055762082
Most common amino acid accuracy: 0.7992565055762082


In [89]:
#decoding the sequence back

In [90]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import numpy as np

def one_hot_encoding(seq):
    # Define the amino acid alphabet
    aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
    # Create a dictionary to map each amino acid to its index in the alphabet
    aa_to_index = {aa: i for i, aa in enumerate(aa_alphabet)}
    # Initialize the one-hot encoded array
    one_hot = np.zeros((len(seq), len(aa_alphabet)))
    # Set the appropriate elements to 1
    for i, aa in enumerate(seq):
        if aa in aa_to_index:
            one_hot[i, aa_to_index[aa]] = 1
    return one_hot

def decode_sequence(one_hot_seq):
    # Define the amino acid alphabet
    aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
    # Convert the one-hot encoded sequence back into an amino acid sequence
    return ''.join([aa_alphabet[i] for i in np.argmax(one_hot_seq, axis=-1)])

# Define the size of the latent space
latent_dim = 32

# Define the autoencoder architecture
input_seq = Input(shape=(None, len(aa_alphabet)))
encoded = Dense(latent_dim, activation='relu')(input_seq)
decoded = Dense(len(aa_alphabet), activation='softmax')(encoded)
autoencoder = Model(input_seq, decoded)

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='categorical_crossentropy')



In [91]:

# Encode the sequence using one-hot encoding
seq_encoded = one_hot_encoding(seq).reshape(1, -1, len(aa_alphabet))



In [92]:
# Train the autoencoder on the encoded sequence
autoencoder.fit(seq_encoded, seq_encoded, epochs=100)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7ff0d03c1340>

In [93]:
# Use the autoencoder to encode and decode the sequence
seq_decoded = autoencoder.predict(seq_encoded)






In [94]:
# Compute reconstruction error metrics
cross_entropy = -np.mean(np.sum(seq_encoded * np.log(seq_decoded), axis=-1))
same_aa_accuracy = np.mean(np.argmax(seq_encoded, axis=-1) == np.argmax(seq_decoded, axis=-1))
most_common_aa_accuracy = np.mean(np.argmax(seq_encoded.sum(axis=0), axis=-1) == np.argmax(seq_decoded.sum(axis=0), axis=-1))



In [95]:
print(f"Cross-Entropy: {cross_entropy}")
print(f"Accuracy (Same aa) : {same_aa_accuracy}")
print(f"Accuracy (Mode aa): {most_common_aa_accuracy}")



Cross-Entropy: 2.2264906894761833
Accuracy (Same aa) : 0.8178438661710037
Accuracy (Mode aa): 0.8178438661710037


In [96]:
# Decode and print the original and reconstructed sequences
print("Original sequence:")
print(decode_sequence(seq_encoded[0]))
print("Reconstructed sequence:")
print(decode_sequence(seq_decoded[0]))

Original sequence:
MALTLSERAAQHVGAFLAKRGKGLGVRLGVKTSGCSGMAYKLEFVDVESSDDVRFESYGVAIYTDAKSLAYIDGTELDYVKEGLNEGFRFNNPNVKNECGCGESFNVMRLTTKGRFAVTAMLDLAMREEGGPVTLAGISERQSISLSYLEQLFGKLRRAELVDSVRGPGGGYTLAKQICDISVADIIVAVDEPVDATQCGGRENCRGSQRCMTHDLWTNLNVTIFDYLSKVSLASLVEEQ-----------------------------
Reconstructed sequence:
EALTLSERAALVVGALLA-RG-GLGVRLGV-TSGCSGEAY-LELVDVESSDDVRLESYGVAVYTDA-SLAYVDGTELDYV-EGLNEGLRLNNGNV-NECGCGESLNVERLTT-GRLAVTAELDLAEREEGGGVTLAGVSERLSVSLSYLELLLG-LRRAELVDSVRGGGGGYTLA-LVCDVSVADVVVAVDEGVDATLCGGRENCRGSLRCETVDL-TNLNVTVLDYLS-VSLASLVEEL-----------------------------
