<a href="https://colab.research.google.com/github/Monika735503/BIOINFORMATICS/blob/main/Restriction_mapping_and_Primer_Designing_expt_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import re
from google.colab import files

# -------------------------------
# Step 1: Upload ZIP file
# -------------------------------
print("Upload your ZIP file containing the gene sequence (PSEN1_datasets.zip)")
uploaded = files.upload()
# A cancelled or empty upload would otherwise surface as a bare StopIteration
# from next(); fail with a message that tells the user what to do instead.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select the ZIP archive.")
zip_path = next(iter(uploaded))  # Name of the first uploaded file
print("Uploaded file:", zip_path)

# -------------------------------
# Step 2: Extract FASTA from ZIP
# -------------------------------
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    files_in_zip = zip_ref.namelist()
    print("Files in ZIP:", files_in_zip)

    # Locate the gene.fna file; report a clear error instead of an opaque
    # IndexError when the archive does not contain one.
    fasta_file_name = next(
        (name for name in files_in_zip if name.endswith("gene.fna")), None
    )
    if fasta_file_name is None:
        raise FileNotFoundError(
            f"No 'gene.fna' entry found in {zip_path!r}; archive contains: {files_in_zip}"
        )
    # Read the member as bytes and decode it to text in one step.
    fasta_content = zip_ref.read(fasta_file_name).decode('utf-8')

# -------------------------------
# Step 3: Parse FASTA ignoring metadata
# -------------------------------
def parse_fasta(fasta_str):
    """Parse the first record of a FASTA-formatted string.

    Blank lines and lines starting with "#" are skipped. Only the first
    record is read: parsing stops at the second ">" header, so the returned
    descriptor always belongs to the returned sequence (a multi-record file
    is no longer merged into one long sequence under the last header).

    Args:
        fasta_str: Text content of a FASTA file.

    Returns:
        A ``(descriptor, seq)`` tuple. ``descriptor`` is the header line
        including the leading ">" (``None`` if no header was found) and
        ``seq`` is the upper-cased sequence with line breaks removed.
    """
    descriptor = None
    seq_lines = []
    for raw_line in fasta_str.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue  # Skip empty lines or metadata
        if line.startswith(">"):
            if descriptor is not None:
                break  # A second header starts the next record; stop here
            descriptor = line
            continue
        seq_lines.append(line)
    return descriptor, ''.join(seq_lines).upper()

# Parse the extracted FASTA text, then report its header and sequence length
descriptor, seq = parse_fasta(fasta_content)
print(f"\nDescriptor: {descriptor}", f"Sequence length: {len(seq)}", sep="\n")

# -------------------------------
# Step 4: Restriction mapping function
# -------------------------------
def restriction_sites_with_re(seq, recog_seq):
    """Return the start index of every occurrence of a recognition site.

    Args:
        seq: Sequence to search.
        recog_seq: Recognition sequence; it is interpreted as a regular
            expression pattern, not escaped.

    Returns:
        List of 0-based start indices in ascending order, including
        occurrences that overlap each other (e.g. "GCGC" in "GCGCGC"
        yields ``[0, 2]``).
    """
    # A zero-width lookahead consumes no characters, so the scan advances one
    # base at a time and overlapping sites are not skipped.
    pattern = re.compile(f"(?=(?:{recog_seq}))")
    return [match.start() for match in pattern.finditer(seq)]

# -------------------------------
# Step 5: Define enzymes
# -------------------------------
# Enzyme name -> recognition sequence searched for in the gene sequence
enzymes = dict(
    HindIII='AAGCTT',
    EcoRI='GAATTC',
    KpnI='GGTACC',
)

# -------------------------------
# Step 6: Display restriction sites
# -------------------------------
print("\n--- Restriction Sites ---")
# One output line per enzyme listing every start index found in the sequence
for enzyme in enzymes:
    site = enzymes[enzyme]
    positions = restriction_sites_with_re(seq, site)
    print(f"{enzyme}: {positions}")

# -------------------------------
# Step 7: Extract flanking sequences around sites
# -------------------------------
flank_size = 20  # Bases of context kept on each side of a site
print("\n--- Flanking sequences around restriction sites ---")

# For every site, print the site together with up to `flank_size` bases on
# either side, clipped to the ends of the sequence.
for enzyme, site in enzymes.items():
    for pos in restriction_sites_with_re(seq, site):
        left = pos - flank_size if pos > flank_size else 0
        right = min(len(seq), pos + len(site) + flank_size)
        flanking_seq = seq[left:right]
        print(f"{enzyme} site at {pos}: {flanking_seq}")

# -------------------------------
# Step 8: Design primers for each restriction site
# -------------------------------
# Complement table for DNA bases, including IUPAC ambiguity codes in both
# cases. Symbols that are their own complement (N, S, W) need no entry.
_COMPLEMENT = str.maketrans(
    'ACGTRYKMBDHVacgtrykmbdhv',
    'TGCAYRMKVHDBtgcayrmkvhdb',
)


def design_primers(flank_seq, primer_len=20):
    """Pick a simple primer pair from the two ends of a sequence.

    Args:
        flank_seq: Sequence to take the primers from.
        primer_len: Number of bases per primer. If it exceeds the sequence
            length, both primers span the whole sequence.

    Returns:
        A ``(forward, reverse)`` tuple: ``forward`` is the first
        ``primer_len`` bases and ``reverse`` is the reverse complement of
        the last ``primer_len`` bases.

    Raises:
        ValueError: If ``primer_len`` is negative.
    """
    if primer_len < 0:
        raise ValueError(f"primer_len must be non-negative, got {primer_len}")
    forward_primer = flank_seq[:primer_len]
    # Slice from an explicit start index: flank_seq[-0:] would return the
    # whole sequence rather than an empty primer when primer_len is 0.
    tail_start = max(len(flank_seq) - primer_len, 0)
    reverse_primer = flank_seq[tail_start:]
    # Reverse complement for reverse primer
    reverse_primer_rc = reverse_primer.translate(_COMPLEMENT)[::-1]
    return forward_primer, reverse_primer_rc

print("\n--- Primer sequences for each restriction site ---")
# Rebuild the clipped window around each site and derive a primer pair from it
for enzyme, site in enzymes.items():
    site_len = len(site)
    for pos in restriction_sites_with_re(seq, site):
        window_start = max(0, pos - flank_size)
        window_end = min(len(seq), pos + site_len + flank_size)
        fwd, rev = design_primers(seq[window_start:window_end])
        print(f"{enzyme} site at {pos}: Forward: {fwd} | Reverse: {rev}")


Upload your ZIP file containing the gene sequence (PSEN1_datasets.zip)


Saving PSEN1_datasets.zip to PSEN1_datasets (2).zip
Uploaded file: PSEN1_datasets (2).zip
Files in ZIP: ['README.md', 'ncbi_dataset/data/gene.fna', 'ncbi_dataset/data/data_report.jsonl', 'ncbi_dataset/data/dataset_catalog.json', 'md5sum.txt']

Descriptor: >NC_060938.1:67341963-67429152 PSEN1 [organism=Homo sapiens] [GeneID=5663] [chromosome=14]
Sequence length: 174465

--- Restriction Sites ---
HindIII: [1279, 4583, 5822, 10668, 13777, 17053, 23031, 27826, 30896, 32883, 39463, 39640, 46563, 51368, 52308, 56912, 57239, 58231, 62316, 67397, 71650, 74354, 74362, 74659, 80037, 81361, 85782, 88554, 91853, 93092, 97939, 101048, 104319, 110302, 115100, 118167, 120154, 126734, 126911, 133834, 138589, 139529, 144133, 144459, 145450, 149534, 154589, 158841, 161545, 161553, 161850, 167225, 168549, 172972]
EcoRI: [3343, 4291, 6468, 8162, 13038, 18286, 22867, 26776, 28939, 36513, 40011, 51495, 52157, 55351, 67529, 71136, 78220, 80390, 83329, 90614, 91561, 93738, 95432, 100309, 105554, 110138, 11405