<a href="https://colab.research.google.com/github/MahdieRah/Protein_Feature_Extraction/blob/main/Numpy_Protein_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Random Mutation

In [11]:
import numpy as np
import random

In [12]:
# Example protein written as one-letter residue codes (M = Methionine, K = Lysine, ...).
protein_sequence = 'MKTLLILAVIMAS'
# Integer code for each of the 20 standard amino acids: the code is the position
# of the letter in the ordering string below (A -> 0, R -> 1, ..., V -> 19).
amino_acid_map = {
    letter: code for code, letter in enumerate('ARNDCEQGHILKMFPSTWYV')
}


In [13]:
# Inverse lookup (integer code -> one-letter amino acid), used to decode arrays.
reverse_amino_acid_map = dict(zip(amino_acid_map.values(), amino_acid_map.keys()))

In [14]:
# Encode the protein: look up the integer code of every residue, in order,
# and store the codes in a NumPy array.
protein_array = np.array(list(map(amino_acid_map.__getitem__, protein_sequence)))
# Show the original letters, their numeric encoding and the residue count.
print("Protein sequence:", protein_sequence)
print("Numerical representation:", protein_array)
print("Length of protein:", protein_array.size)

Protein sequence: MKTLLILAVIMAS
Numerical representation: [12 11 16 10 10  9 10  0 19  9 12  0 15]
Length of protein: 13


In [15]:
# Count how often each integer code occurs in the encoded protein.
# np.unique returns the distinct codes (sorted) together with their counts.
unique, counts = np.unique(protein_array, return_counts=True)
# Pair every distinct code with its number of occurrences.
amino_acid_counts = {code: count for code, count in zip(unique, counts)}

In [16]:
# Decode frequencies back to amino acids.
# The code -> letter dictionary built earlier gives a direct lookup per entry,
# instead of rebuilding the key/value lists and scanning them with .index()
# for every code.
decoded_counts = {reverse_amino_acid_map[k]: v for k, v in amino_acid_counts.items()}
print("Amino acid frequency:", decoded_counts)

Amino acid frequency: {'A': np.int64(2), 'I': np.int64(2), 'L': np.int64(3), 'K': np.int64(1), 'M': np.int64(2), 'S': np.int64(1), 'T': np.int64(1), 'V': np.int64(1)}


In [17]:
# Step 5: Simulate mutations
# Seed Python's `random` module so the positions chosen here, and every later
# draw from `random`, are the same on each Restart & Run All.
random.seed(42)
num_mutations = 3  # Number of mutations to simulate
# Sample without replacement so that no position is mutated twice.
# random.sample raises ValueError if num_mutations exceeds the protein length.
mutation_indices = random.sample(range(protein_array.size), num_mutations)


In [18]:
# Randomly mutate amino acids.
# The pool of valid codes comes from the encoding table itself, so this cell keeps
# working if the table grows, shrinks or stops using contiguous codes.
valid_codes = list(amino_acid_map.values())
# Work on a copy so the original encoded protein stays available for comparison.
mutated_array = protein_array.copy()
for idx in mutation_indices:
    original_aa = mutated_array[idx]
    # Draw once, uniformly, from every code except the current one. This always
    # produces a real substitution and cannot loop indefinitely.
    candidates = [code for code in valid_codes if code != original_aa]
    mutated_array[idx] = random.choice(candidates)


In [19]:
# Translate every integer code back to its one-letter amino acid and show
# the resulting mutated protein.
mutated_sequence = ''.join(map(reverse_amino_acid_map.__getitem__, mutated_array))
print("Mutated sequence:", mutated_sequence)

Mutated sequence: MKLLLVLAVIMCS


In [20]:
# Recount the residues now that the mutations have been applied.
unique_mut, counts_mut = np.unique(mutated_array, return_counts=True)
# Key each count by its one-letter amino acid rather than by its integer code.
mutated_counts = dict(
    zip((reverse_amino_acid_map[code] for code in unique_mut), counts_mut)
)

print("Amino acid frequency after mutation:", mutated_counts)


Amino acid frequency after mutation: {'A': np.int64(1), 'C': np.int64(1), 'I': np.int64(1), 'L': np.int64(4), 'K': np.int64(1), 'M': np.int64(2), 'S': np.int64(1), 'V': np.int64(2)}


Custom Amino Acid Mutations

In [21]:
import numpy as np

In [22]:
# Protein to work with, written as one-letter residue codes
# (M = Methionine, K = Lysine, ...).
protein_sequence = 'MKTLLILAVIMAS'
# Encoding table: each of the 20 standard amino acids is paired with the
# integers 0..19 in the order the letters appear in the string below.
amino_acid_map = dict(zip('ARNDCEQGHILKMFPSTWYV', range(20)))


In [23]:
# Inverse table (integer code -> one-letter amino acid) for decoding later on.
reverse_amino_acid_map = dict(zip(amino_acid_map.values(), amino_acid_map))

# Encode the protein as a NumPy array holding one integer code per residue.
protein_array = np.array(list(map(amino_acid_map.__getitem__, protein_sequence)))

# Report the input letters, their numeric encoding and the residue count.
print("Original protein sequence:", protein_sequence)
print("Numerical representation:", protein_array)
print("Length of protein:", protein_array.size)

Original protein sequence: MKTLLILAVIMAS
Numerical representation: [12 11 16 10 10  9 10  0 19  9 12  0 15]
Length of protein: 13


In [24]:
# Tally the occurrences of every integer code in the encoded protein.
unique, counts = np.unique(protein_array, return_counts=True)
amino_acid_counts = {code: count for code, count in zip(unique, counts)}

# Build the same tally keyed by one-letter amino acid instead of integer code.
decoded_counts = dict(
    zip(map(reverse_amino_acid_map.__getitem__, amino_acid_counts), amino_acid_counts.values())
)
print("Amino acid frequency:", decoded_counts)

Amino acid frequency: {'A': np.int64(2), 'I': np.int64(2), 'L': np.int64(3), 'K': np.int64(1), 'M': np.int64(2), 'S': np.int64(1), 'T': np.int64(1), 'V': np.int64(1)}


In [32]:
# Step 5: Apply custom mutations
# Each entry is (0-based position in the sequence, one-letter replacement residue).
custom_mutations = [(2, 'A'), (5, 'W'), (10, 'R')]

# Mutate a copy so the original encoded protein is left untouched.
mutated_array = protein_array.copy()

for pos, new_aa in custom_mutations:
    # Guard clause: report and skip entries whose position is out of range
    # or whose residue letter is not in the encoding table.
    if not (0 <= pos < mutated_array.size) or new_aa not in amino_acid_map:
        print(f"Invalid mutation: position {pos} or amino acid {new_aa}")
        continue
    mutated_array[pos] = amino_acid_map[new_aa]


In [33]:
# Decode the mutated array back into one-letter residues and print the result.
mutated_sequence = ''.join([reverse_amino_acid_map[code] for code in mutated_array])
print("Mutated sequence:", mutated_sequence)


Mutated sequence: MKALLWLAVIRAS


In [34]:
# Residue tally after the custom mutations: distinct codes and their counts.
unique_mut, counts_mut = np.unique(mutated_array, return_counts=True)
# Replace each integer code with its one-letter amino acid as the dictionary key.
decoded_letters = [reverse_amino_acid_map[code] for code in unique_mut]
mutated_counts = dict(zip(decoded_letters, counts_mut))

print("Amino acid frequency after mutation:", mutated_counts)

Amino acid frequency after mutation: {'A': np.int64(3), 'R': np.int64(1), 'I': np.int64(1), 'L': np.int64(3), 'K': np.int64(1), 'M': np.int64(1), 'S': np.int64(1), 'W': np.int64(1), 'V': np.int64(1)}
