In [12]:
import random

def introduce_dna_errors(codewords, error_prob=0.01, rng=None):
    """Return copies of ``codewords`` with random insert/delete/substitute errors.

    Every base of every codeword is independently hit by an error with
    probability ``error_prob``. When a base is hit, one error type is chosen
    uniformly at random:

    * ``insert``     - a random base from A/T/G/C is placed *before* the
      current base; the current base is kept.
    * ``delete``     - the current base is dropped.
    * ``substitute`` - the current base is replaced by a base from A/T/G/C
      that differs from it.

    Insertions and deletions change the length, so an output codeword is not
    guaranteed to be as long as its input.

    Args:
        codewords: Iterable of codewords; each codeword is iterated base by
            base and the bases are joined back with ``"".join``, so they must
            be strings.
        error_prob: Per-base probability of introducing an error.
        rng: Optional random source exposing ``random()`` and ``choice()``
            (for example ``random.Random(seed)``). When omitted, the global
            ``random`` module is used, exactly as before this parameter
            existed. Passing a dedicated generator makes a run reproducible
            without touching the global random state.

    Returns:
        A new list of strings, one per input codeword, in input order.
    """
    # Fall back to the module-level generator so existing calls are unaffected.
    if rng is None:
        rng = random

    # Define possible DNA bases and error types once, outside the loops.
    dna_bases = ['A', 'T', 'G', 'C']
    error_types = ['insert', 'delete', 'substitute']

    modified_codewords = []

    for word in codewords:
        modified_word = []

        for base in word:
            if rng.random() < error_prob:
                error_type = rng.choice(error_types)

                if error_type == 'insert':
                    # Insert a random DNA base before the current base
                    modified_word.append(rng.choice(dna_bases))
                    modified_word.append(base)
                elif error_type == 'delete':
                    # Skip the current base to delete it
                    continue
                elif error_type == 'substitute':
                    # Substitute with a random base different from the current one
                    new_base = rng.choice([b for b in dna_bases if b != base])
                    modified_word.append(new_base)
            else:
                # No error, append the original base
                modified_word.append(base)

        # Join the modified characters to form the modified codeword
        modified_codewords.append("".join(modified_word))

    return modified_codewords

# Example usage: corrupt three short codewords with a 20% per-base error rate.
# No seed is set in this cell, so the modified codewords vary from run to run.
codewords = ["AATGC", "TTGCA", "GCCAT"]
result = introduce_dna_errors(codewords, error_prob=0.2)
print("Original codewords:", codewords)
print("Modified codewords:", result)

Original codewords: ['AATGC', 'TTGCA', 'GCCAT']
Modified codewords: ['AACGC', 'TTGAA', 'GCCAT']


In [13]:
def count_differences(str1, str2):
    """Count the positions at which two equal-length strings differ.

    Args:
        str1: First sequence.
        str2: Second sequence; must have the same length as ``str1``.

    Returns:
        The number of indices ``i`` for which ``str1[i] != str2[i]``.

    Raises:
        ValueError: If the two inputs have different lengths.
    """
    if len(str1) == len(str2):
        # Walk both sequences in lockstep and tally the mismatches.
        mismatches = 0
        for left, right in zip(str1, str2):
            if left != right:
                mismatches += 1
        return mismatches

    # A position-by-position comparison is undefined for unequal lengths.
    raise ValueError("Strings must be of the same length to compare positions.")

# Example usage: the two strings differ only in their last character,
# so the expected count is 1.
string1 = "AATGCC"
string2 = "AATGCA"
# NOTE(review): `result` is also assigned by other cells in this notebook;
# this assignment replaces whatever value it held before.
result = count_differences(string1, string2)
print(f"Number of differing positions: {result}")

Number of differing positions: 1


In [14]:
import random

def introduce_sparse_errors(codewords, error_prob=1/104, rng=None):
    """Return copies of ``codewords`` with sparse random errors.

    Uses the same per-base insert/delete/substitute model as
    ``introduce_dna_errors`` earlier in this notebook, with a lower default
    rate: with ``error_prob=1/104`` an error is expected on average about
    once every 104 bases.

    For each base, with probability ``error_prob`` one error type is chosen
    uniformly at random:

    * ``insert``     - a random base from A/T/G/C is placed *before* the
      current base; the current base is kept.
    * ``delete``     - the current base is dropped.
    * ``substitute`` - the current base is replaced by a base from A/T/G/C
      that differs from it.

    Because insertions and deletions change the length, an unchanged length
    does not imply an unchanged codeword (one insertion plus one deletion
    cancel out).

    Args:
        codewords: Iterable of codewords; each codeword is iterated base by
            base and the bases are joined back with ``"".join``, so they must
            be strings.
        error_prob: Per-base probability of introducing an error.
        rng: Optional random source exposing ``random()`` and ``choice()``
            (for example ``random.Random(seed)``). When omitted, the global
            ``random`` module is used, exactly as before this parameter
            existed. Passing a dedicated generator makes a run reproducible
            without touching the global random state.

    Returns:
        A new list of strings, one per input codeword, in input order.
    """
    # Fall back to the module-level generator so existing calls are unaffected.
    source = random if rng is None else rng

    # Alphabet and error kinds are loop-invariant; build them once.
    dna_bases = ['A', 'T', 'G', 'C']
    error_types = ['insert', 'delete', 'substitute']

    modified_codewords = []

    for word in codewords:
        modified_word = []

        for base in word:
            # Decide whether to introduce an error based on the probability
            if source.random() < error_prob:
                error_type = source.choice(error_types)

                if error_type == 'insert':
                    # Insert a random base, then keep the current one
                    modified_word.append(source.choice(dna_bases))
                    modified_word.append(base)
                elif error_type == 'delete':
                    # Skip the current base to delete it
                    continue
                elif error_type == 'substitute':
                    # Substitute with a random base different from the current one
                    new_base = source.choice([b for b in dna_bases if b != base])
                    modified_word.append(new_base)
            else:
                # No error, append the original base
                modified_word.append(base)

        # Join the modified characters to form the modified codeword
        modified_codewords.append("".join(modified_word))

    return modified_codewords

# Example usage
# Generate 200 random DNA codewords of 100 bases each, every base drawn
# with equal weight from A/T/G/C. No seed is set in this cell, so the
# codewords (and the errors below) vary from run to run.
codewords = ["".join(random.choices(['A', 'T', 'G', 'C'], k=100)) for _ in range(200)]

# Apply errors with per-base probability 1/104, i.e. on average roughly one
# error every 104 nucleotides.
result = introduce_sparse_errors(codewords, error_prob=1/104)
print("Original codewords:", codewords[:5])  # Display first 5 for brevity
print("Modified codewords:", result[:5])     # Display first 5 for brevity

Original codewords: ['GTTGCTCGCATGATTTACCAGCATTCATTAAGAGCGGGACTTTCGTGCCCGTATGTGGTTGGTTTCTAGAGGAATCGGAGAAGGGGTACGAAACGGTCTT', 'GTGTAAAGGCGGCCAAATTTAGGCAGGAGACCTATCATCCCAGTGTTTACCATCTAATGAGTGGATCTTCGACCCAGCAGTGACACTTGCTCATCAGCGC', 'TTAGGTGGTTTTAATACTGGTGGCCGCTCGAACCATGTGCCCAAACTTTGATTCGACATCAGGAAGGCGACTGACACTTGGGGTTCGGAGGGCATGGCTA', 'CTGGGACGCAGCGATAAGAGCTATCGCCGTAATAGACCTTGACCGTATGGAAAAGCAAAAAACGTCCGGTCCTTGAGGACACTCATCGAGATCGGCGAAC', 'ACTGTTCCCGAGAGAGAATAGGATAATAGTCCCCGCTAGGAAGTTCCTAATGCGTTCCTTAGGCAGATGCTAGTCGTTGCACGCCATTTAGTCGCGGCAA']
Modified codewords: ['GTTGCTCGCATGATTTACCAGCATTCATTAAGAGCGGGACTTTCGTGCCCGTATGTGGTTGGTTTCTAGAGGAATCGGAGAAGGGGTACGAAACGGTCTT', 'GTGTAAAGGCGGCCAAATTTAGGCAGGAGACCTATCATCCCAGTGTTTACCATCTAATGAGTGGATCTTCGACCCAGCAGTGACACTGCTCATCAAGCGC', 'TTAGGTGGTTTTAATACCTGGTGGCCGCTCGAACCATGTGCACCAAACCTTTGATTCGACATCAGGAAGGCGACTGACACTTGGGGTTCGGAGGGCATGGCTA', 'CTGGGACGCAGCGATAAGAGCTATCGCCGTAATAGACCTTGACCGTATGGAAAAGCAAAAAACGTCCGGTCCTTGAGGACACTCGTCGAGTTCGGCGAAC', 'ACTGTTCCCGAGAGAGAA

In [20]:
# Length of the second original codeword (index 1), for comparison with
# the corresponding modified codeword.
len(codewords[1])

100

In [21]:
# Length of the second modified codeword (index 1).
# NOTE(review): presumably `result` is still the output of
# introduce_sparse_errors — verify, since the cells between In [14] and
# In [20] are not shown. An unchanged length does not prove the codeword is
# error-free: one insertion and one deletion in the same word cancel out.
len(result[1])

100