# **DNA Sequencing Analysis Project**
this project focuses on analyzing DNA sequencing data to perform various tasks such as finding patterns, identifying problematic sequencing cycles, and more. The project includes implementations of algorithms to address specific questions related to DNA sequencing.

In [1]:
import requests

url = "https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa"
response = requests.get(url)
with open("lambda_virus.fa", "wb") as f:
    f.write(response.content)



In [2]:
def reverse_complement(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement[base] for base in reversed(seq))

def count_pattern_occurrences(genome, pattern):
    count = 0
    pattern_rc = reverse_complement(pattern)
    for i in range(len(genome) - len(pattern) + 1):
        if genome[i:i+len(pattern)] == pattern or genome[i:i+len(pattern)] == pattern_rc:
            count += 1
    return count

# Read the lambda virus genome sequence
with open("lambda_virus.fa") as f:
    lines = f.readlines()[1:]
    genome = ''.join(line.strip() for line in lines)

# Define the pattern
pattern = "AGGT"

# Count the occurrences of the pattern and its reverse complement
count_aggt = count_pattern_occurrences(genome, pattern)
count_acct = count_pattern_occurrences(genome, reverse_complement(pattern))

# Calculate the total count
total_count = count_aggt + count_acct

print("Total occurrences of AGGT and its reverse complement ACCT:", total_count)


Total occurrences of AGGT and its reverse complement ACCT: 612


In [3]:
def reverse_complement(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement[base] for base in reversed(seq))

def count_pattern_occurrences(genome, pattern):
    count = 0
    for i in range(len(genome) - len(pattern) + 1):
        if genome[i:i+len(pattern)] == pattern:
            count += 1
    return count

# Read the lambda virus genome sequence
with open("lambda_virus.fa") as f:
    lines = f.readlines()[1:]
    genome = ''.join(line.strip() for line in lines)

# Define the pattern
pattern = "TTAA"

# Count the occurrences of the pattern and its reverse complement
count_ttaa = count_pattern_occurrences(genome, pattern)

# Print the count
print("Total occurrences of TTAA in the lambda virus genome:", count_ttaa)


Total occurrences of TTAA in the lambda virus genome: 195


In [4]:
def reverse_complement(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement[base] for base in reversed(seq))

# Read the lambda virus genome sequence
with open("lambda_virus.fa") as f:
    lines = f.readlines()[1:]
    genome = ''.join(line.strip() for line in lines)

# Define the pattern
pattern = "ACTAAGT"

# Find the leftmost occurrence of the pattern and its reverse complement
offset_pattern = genome.find(pattern)
offset_rc = genome.find(reverse_complement(pattern))

# Determine the minimum offset
min_offset = min(offset_pattern, offset_rc)

print("Offset of the leftmost occurrence of ACTAAGT or its reverse complement:", min_offset)


Offset of the leftmost occurrence of ACTAAGT or its reverse complement: 26028


In [5]:
def reverse_complement(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement[base] for base in reversed(seq))

# Read the lambda virus genome sequence
with open("lambda_virus.fa") as f:
    lines = f.readlines()[1:]
    genome = ''.join(line.strip() for line in lines)

# Define the pattern
pattern = "AGTCGA"

# Find the leftmost occurrence of the pattern and its reverse complement
offset_pattern = genome.find(pattern)
offset_rc = genome.find(reverse_complement(pattern))

# Determine the minimum offset
min_offset = min(offset_pattern, offset_rc)

print("Offset of the leftmost occurrence of AGTCGA or its reverse complement:", min_offset)


Offset of the leftmost occurrence of AGTCGA or its reverse complement: 450


In [6]:
def count_mismatches(seq1, seq2):
    return sum(1 for base1, base2 in zip(seq1, seq2) if base1 != base2)

def find_approximate_match(genome, pattern, max_mismatches):
    pattern_length = len(pattern)
    for i in range(len(genome) - pattern_length + 1):
        window = genome[i:i+pattern_length]
        mismatches = count_mismatches(window, pattern)
        if mismatches <= max_mismatches:
            return i
    return -1

# Read the lambda virus genome sequence
with open("lambda_virus.fa") as f:
    lines = f.readlines()[1:]
    genome = ''.join(line.strip() for line in lines)

# Define the pattern
pattern = "AGGAGGTT"

# Find the leftmost occurrence of the pattern with up to 2 mismatches
offset = find_approximate_match(genome, pattern, 2)

print("Offset of the leftmost occurrence of AGGAGGTT with up to 2 mismatches:", offset)


Offset of the leftmost occurrence of AGGAGGTT with up to 2 mismatches: 49


In [7]:
def naive_2mm(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):
        match = True
        mismatches = 0
        for j in range(len(p)):
            if t[i+j] != p[j]:
                mismatches += 1
                if mismatches > 2:
                    match = False
                    break
        if match:
            occurrences.append(i)
    return occurrences

# Read the lambda virus genome sequence
with open("lambda_virus.fa") as f:
    lines = f.readlines()[1:]
    genome = ''.join(line.strip() for line in lines)

# Define the pattern
pattern = "TTCAAGCC"

# Find the occurrences of the pattern with up to 2 mismatches
occurrences = naive_2mm(pattern, genome)

print("Occurrences of TTCAAGCC with up to 2 mismatches:", occurrences)
print("Number of occurrences:", len(occurrences))


Occurrences of TTCAAGCC with up to 2 mismatches: [45, 418, 656, 776, 975, 1311, 1346, 3166, 3265, 3292, 3544, 4237, 4378, 4481, 4518, 5240, 5369, 5597, 5773, 5871, 5971, 6350, 6365, 6564, 6572, 6684, 6731, 6764, 6765, 6929, 7026, 7487, 7783, 7790, 8222, 8251, 8284, 8749, 9078, 9197, 9506, 9942, 10734, 10969, 11211, 11331, 11453, 11640, 11973, 12711, 13670, 13750, 13884, 14687, 14884, 15745, 16293, 16817, 17107, 17349, 17541, 17779, 17838, 18792, 19067, 19719, 19779, 21614, 21854, 22174, 22462, 22722, 22956, 23126, 23522, 23677, 23683, 23820, 23885, 24369, 24584, 24787, 25116, 25554, 26046, 26665, 26887, 27237, 27258, 28166, 28276, 28531, 28639, 28920, 28963, 29021, 29144, 29733, 29921, 30176, 30278, 30362, 30395, 30536, 30721, 30967, 31049, 31379, 31449, 31474, 31482, 31641, 31732, 31749, 31829, 31867, 31872, 32523, 32640, 32754, 32779, 32857, 32977, 33079, 33107, 33371, 33403, 33653, 33686, 33863, 34061, 34194, 34466, 34588, 34733, 34785, 35009, 35156, 35525, 35971, 36074, 36989, 3711

In [8]:
def read_fastq(filename):
    sequences = []
    qualities = []
    with open(filename) as f:
        while True:
            f.readline()  # Skip the header line
            seq = f.readline().strip()  # Read the sequence
            f.readline()  # Skip the placeholder line
            qual = f.readline().strip()  # Read the quality scores
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities

def find_bad_cycle(qualities):
    cycle_scores = [0] * len(qualities[0])
    for qual in qualities:
        for i, score in enumerate(qual):
            cycle_scores[i] += ord(score) - 33  # Convert ASCII to Phred quality score
    average_scores = [score / len(qualities) for score in cycle_scores]
    return average_scores.index(min(average_scores))

# Read the FASTQ file
sequences, qualities = read_fastq("ERR037900_1.first1000.fastq")

# Find the problematic sequencing cycle
bad_cycle = find_bad_cycle(qualities)

print("Problematic sequencing cycle:", bad_cycle)


FileNotFoundError: [Errno 2] No such file or directory: 'ERR037900_1.first1000.fastq'