# Genome Assembly: 4. Contigs and Scaffolding

## Overview
This notebook covers the post-graph steps to turn De Bruijn graphs into assembled genomes:

1. **Contigs**: Linear sequences from unambiguous graph regions
2. **Paired-end reads**: Using read pairs to resolve ambiguities and span repeats
3. **Scaffolding**: Linking contigs using pair distance information
4. **Gap filling**: Closing gaps between scaffolded contigs

**Key insight**: The two reads of a pair come from opposite ends of the same DNA fragment, so they lie an approximately known distance (the insert size) apart in the genome, even when the region between them was never sequenced directly.

## 1. Contig Extraction from De Bruijn Graphs

In [None]:
import numpy as np
from collections import defaultdict, deque
from typing import List, Tuple, Set, Dict, Optional
import math

class ContigExtractor:
    """
    Extract contigs from a De Bruijn graph.

    Nodes are (k-1)-mers and every edge is one distinct k-mer. A contig is a
    maximal linear path: every internal node has exactly one distinct incoming
    edge and exactly one distinct outgoing edge.

    Attributes:
        k: k-mer size of the graph.
        graph: node -> [(next_node, edge_label, count)] out-edges.
        reverse_graph: node -> [(prev_node, edge_label, count)] in-edges.
        in_degree / out_degree: number of DISTINCT edges entering / leaving a
            node. Coverage lives in the edge counts, never in the degrees,
            so a well-covered linear path is not mistaken for a branch.
        coverage: per-node coverage map; kept for callers, not populated here.
    """

    def __init__(self, k: int):
        self.k = k
        self.graph = defaultdict(list)  # node -> [(next_node, edge_label, count)]
        self.reverse_graph = defaultdict(list)  # for finding in-edges
        self.in_degree = defaultdict(int)
        self.out_degree = defaultdict(int)
        self.coverage = defaultdict(float)

    def add_edge(self, prefix: str, suffix: str, base: str, count: int = 1):
        """
        Add an edge to the graph (represents a k-mer).

        Adding the same (prefix, suffix, base) edge again accumulates its
        count instead of creating a parallel edge, so the k-mers of every read
        can be fed in one occurrence at a time.

        Args:
            prefix: source node, the first k-1 bases of the k-mer.
            suffix: target node, the last k-1 bases of the k-mer.
            base: edge label, the last base of the k-mer.
            count: number of observations of this k-mer to add.
        """
        out_edges = self.graph[prefix]
        for idx, (target, label, seen) in enumerate(out_edges):
            if target == suffix and label == base:
                out_edges[idx] = (target, label, seen + count)
                # Keep the mirrored in-edge record in sync.
                in_edges = self.reverse_graph[suffix]
                for jdx, (source, rlabel, rseen) in enumerate(in_edges):
                    if source == prefix and rlabel == base:
                        in_edges[jdx] = (source, rlabel, rseen + count)
                        break
                return

        out_edges.append((suffix, base, count))
        self.reverse_graph[suffix].append((prefix, base, count))
        # Degrees count distinct edges; adding `count` here would make every
        # edge with coverage > 1 look like a branch point.
        self.out_degree[prefix] += 1
        self.in_degree[suffix] += 1

    def extract_contigs(self) -> List[Tuple[str, float]]:
        """
        Extract all contigs from the graph.

        Contigs are first started at every node that is not a simple
        pass-through (in-degree == out-degree == 1), which yields maximal
        linear paths. Edges still unvisited afterwards belong to isolated
        cycles and are emitted in a second pass.

        Returns:
            List of (sequence, avg_coverage) tuples, in an order that is
            reproducible across runs (nodes are processed in sorted order).
        """
        contigs = []
        visited_edges: Set[Tuple[str, str, str]] = set()
        all_nodes = sorted(set(self.graph) | set(self.reverse_graph))

        def trace_unvisited_edges(node: str) -> None:
            # .get() avoids creating empty defaultdict entries for sink nodes.
            for next_node, base, count in self.graph.get(node, ()):
                if (node, next_node, base) in visited_edges:
                    continue
                contig, coverage = self._trace_contig(
                    node, next_node, base, count, visited_edges
                )
                if contig:
                    contigs.append((contig, coverage))

        # Pass 1: sources, branch points and merge points.
        for node in all_nodes:
            is_pass_through = (
                self.in_degree.get(node, 0) == 1
                and self.out_degree.get(node, 0) == 1
            )
            if not is_pass_through:
                trace_unvisited_edges(node)

        # Pass 2: whatever is left lies on cycles made only of pass-through nodes.
        for node in all_nodes:
            trace_unvisited_edges(node)

        return contigs

    def _trace_contig(self, start: str, next_node: str, first_base: str, first_count: int,
                     visited: Set) -> Tuple[str, float]:
        """
        Trace a linear path (contig) through the graph.

        Starts with the edge start -> next_node and keeps extending while the
        current node has exactly one incoming and one outgoing edge. Stops at
        a dead end, a branch, a merge, or when the next edge was already used
        (a cycle).

        Args:
            start: node the contig starts from.
            next_node: target of the first edge.
            first_base: label of the first edge.
            first_count: coverage count of the first edge.
            visited: set of (source, target, label) edge keys; updated in place.

        Returns:
            (sequence, average edge count along the path).
        """
        bases = [first_base]
        counts = [first_count]
        visited.add((start, next_node, first_base))
        current = next_node

        while (self.in_degree.get(current, 0) == 1
               and self.out_degree.get(current, 0) == 1):
            following, base, count = self.graph[current][0]
            edge_key = (current, following, base)
            if edge_key in visited:
                break  # Would create a cycle

            bases.append(base)
            counts.append(count)
            visited.add(edge_key)
            current = following

        # counts always holds at least the first edge.
        return start + ''.join(bases), float(np.mean(counts))

    def filter_contigs(self, contigs: List[Tuple[str, float]],
                      min_length: int = 100, min_coverage: float = 2.0) -> List[Tuple[str, float]]:
        """
        Filter contigs by length and coverage.
        Short, low-coverage contigs are likely errors.

        Keeps a contig only when its length is at least min_length AND its
        average coverage is at least min_coverage.
        """
        return [
            (seq, cov)
            for seq, cov in contigs
            if len(seq) >= min_length and cov >= min_coverage
        ]

# Test contig extraction
# Demo cell: builds a tiny graph by hand, extracts contigs and prints them.
print("Contig Extraction from De Bruijn Graph")
print("="*70)

# Create a simple test graph
extractor = ContigExtractor(k=21)

# Add edges representing a simple genome
# The sequence is exactly 21 bases long, so with k=21 the loop below runs once
# and the graph holds a single edge (one 21-mer) between two 20-mer nodes.
test_sequence = "ATGCGATCGATCGATCGATCG"
for i in range(len(test_sequence) - 20):
    prefix = test_sequence[i:i+20]   # first k-1 bases of the k-mer
    suffix = test_sequence[i+1:i+21]  # last k-1 bases of the k-mer
    base = test_sequence[i+20]        # base appended when following the edge
    count = 5  # 5x coverage
    extractor.add_edge(prefix, suffix, base, count)

contigs = extractor.extract_contigs()
# Relaxed thresholds: the defaults (100 bp, 2x) would discard this toy contig.
contigs = extractor.filter_contigs(contigs, min_length=10, min_coverage=1)

print(f"Test sequence: {test_sequence}")
print(f"\nExtracted {len(contigs)} contig(s):")
for i, (seq, cov) in enumerate(contigs):
    print(f"  Contig {i+1}: {seq}")
    print(f"    Length: {len(seq)} bp, Avg coverage: {cov:.1f}x")
    # Substring test: a contig counts as correct if it occurs in the input.
    if seq in test_sequence:
        print(f"    ✓ Matches original sequence")

## 2. Paired-End Reads and Insert Size

In [None]:
class PairedEndRead:
    """
    Represents a pair of reads from a DNA fragment.
    Key: We know the distance between read 1 and read 2 (insert size).

    Attributes:
        read1_id, read1_seq: identifier and bases of the first read.
        read2_id, read2_seq: identifier and bases of the second read.
        insert_size_mean: expected fragment length, both reads included.
        insert_size_std: standard deviation of the fragment length.
    """

    def __init__(self, read1_id: str, read1_seq: str, read2_id: str, read2_seq: str,
                 insert_size_mean: int = 500, insert_size_std: int = 50):
        self.read1_id = read1_id
        self.read1_seq = read1_seq
        self.read2_id = read2_id
        self.read2_seq = read2_seq
        self.insert_size_mean = insert_size_mean
        self.insert_size_std = insert_size_std

    def get_span(self, read1_len: int, read2_len: int) -> Tuple[int, int]:
        """
        Get the expected minimum and maximum genomic distance between read 1 end and read 2 start.

        A window of three standard deviations around the mean insert size is used:
        span_min = max(0, insert_size - 3*std - read_lengths)
        span_max = insert_size + 3*std - read_lengths

        Only the lower bound is clamped at zero; span_max is negative when
        even the longest plausible fragment is shorter than the two reads
        together (the reads overlap).
        """
        min_span = max(0, self.insert_size_mean - 3*self.insert_size_std - read1_len - read2_len)
        max_span = self.insert_size_mean + 3*self.insert_size_std - read1_len - read2_len
        return min_span, max_span

def generate_paired_reads(sequence: str, read_length: int = 100, insert_size_mean: int = 500,
                         insert_size_std: int = 50, coverage: int = 10, error_rate: float = 0.01,
                         seed: int = 42) -> List[PairedEndRead]:
    """
    Generate paired-end reads from a sequence.

    Simplifications of this simulator:
    - every fragment is exactly insert_size_mean long; insert_size_std is only
      recorded on the returned pairs, it is not used for sampling;
    - read 2 is the forward-strand copy of the far end of the fragment, it is
      NOT reverse complemented.

    Args:
        sequence: template to sample fragments from.
        read_length: length of each read.
        insert_size_mean: fragment length. When the template is shorter, the
            whole template is used as the fragment and that effective length
            is recorded on the pairs.
        insert_size_std: stored on each pair as the insert-size spread.
        coverage: target depth; the number of pairs is
            len(sequence) * coverage // (2 * read_length).
        error_rate: per-base substitution probability.
        seed: seed of the private random generator (global RNG state is left
            untouched).

    Returns:
        List of PairedEndRead objects named read{i}/1 and read{i}/2.
    """
    import random

    # Private generator: same stream as random.seed(seed) would give, without
    # resetting the module-level RNG that other cells may rely on.
    rng = random.Random(seed)
    bases = ['A', 'T', 'G', 'C']

    def add_errors(seq, rate):
        """Substitute each base with probability `rate` by a different base."""
        seq_list = list(seq)
        for j, original in enumerate(seq_list):
            if rng.random() < rate:
                seq_list[j] = rng.choice([b for b in bases if b != original])
        return ''.join(seq_list)

    num_pairs = (len(sequence) * coverage) // (2 * read_length)
    # A fragment can never be longer than the template it is cut from.
    fragment_len = min(insert_size_mean, len(sequence))
    pairs = []

    for i in range(num_pairs):
        # Random fragment position
        fragment_start = rng.randint(0, len(sequence) - fragment_len)

        # Read 1: first read_length bases of the fragment
        read1_seq = sequence[fragment_start:fragment_start + read_length]

        # Read 2: last read_length bases of the fragment (forward strand);
        # never start before the fragment itself when reads exceed its length.
        read2_start = max(fragment_start, fragment_start + fragment_len - read_length)
        read2_seq = sequence[read2_start:read2_start + read_length]

        # Introduce errors
        read1_seq = add_errors(read1_seq, error_rate)
        read2_seq = add_errors(read2_seq, error_rate)

        pairs.append(PairedEndRead(
            f"read{i}/1", read1_seq,
            f"read{i}/2", read2_seq,
            fragment_len, insert_size_std
        ))

    return pairs

# Test paired-end generation
# Demo cell: simulates read pairs and prints the first one. `test_pairs` is
# reused by the scaffolding and full-pipeline cells further down.
print("\nPaired-End Reads")
print("="*70)

test_seq = "ATGCGATCGATCGATCGATCG" * 30  # 21 bp unit x 30 = 630 bp
test_pairs = generate_paired_reads(test_seq, read_length=100, insert_size_mean=500,
                                insert_size_std=50, coverage=5)

print(f"Generated {len(test_pairs)} paired-end read pairs")
print(f"\nExample pair:")
pair = test_pairs[0]
# Only the first 50 bases of each read are shown; the full length is in brackets.
print(f"  Read 1: {pair.read1_seq[:50]}... ({len(pair.read1_seq)}bp)")
print(f"  Read 2: {pair.read2_seq[:50]}... ({len(pair.read2_seq)}bp)")
print(f"  Insert size: {pair.insert_size_mean}bp ± {pair.insert_size_std}bp")
# Inner distance between the two reads for a 3-sigma insert-size window.
min_span, max_span = pair.get_span(len(pair.read1_seq), len(pair.read2_seq))
print(f"  Expected gap between reads: {min_span}bp to {max_span}bp")

## 3. Scaffolding: Linking Contigs with Pair Information

In [None]:
class ScaffoldLink:
    """
    Evidence from paired reads that two contigs are neighbours.

    Attributes:
        contig1_id, contig2_id: ids of the two linked contigs.
        num_pairs: number of read pairs supporting the link.
        gap_estimate: estimated size of the gap between the contigs.
        gap_std: spread of the gap estimate.
        orientation: relative orientation, one of '++', '+-', '-+', '--'.
    """

    def __init__(self, contig1_id: int, contig2_id: int,
                 num_pairs: int, gap_estimate: float, gap_std: float,
                 orientation: str = '+-'):
        # Which contigs are joined, and in which relative orientation.
        self.contig1_id, self.contig2_id = contig1_id, contig2_id
        self.orientation = orientation
        # How strong the evidence is.
        self.num_pairs = num_pairs
        # What the pairs say about the distance between the contigs.
        self.gap_estimate, self.gap_std = gap_estimate, gap_std

    def is_confident(self, min_support: int = 5) -> bool:
        """Return True when at least min_support read pairs back this link."""
        return min_support <= self.num_pairs

class Scaffolder:
    """
    Build scaffolds from contigs using paired-end information.

    Algorithm:
    1. Map reads to contigs
    2. Find pairs where read1 and read2 map to different contigs
    3. Calculate gap size from insert size and contig positions
    4. Build graph of contigs connected by pairs
    5. Find linear paths (scaffolds)

    Attributes:
        contigs: list of contig sequences; the list index is the contig id.
        contig_positions: contig id -> {'seq': sequence, 'length': length}.
        scaffold_links: ScaffoldLink objects collected by build_scaffold_links.
    """

    # Translation table for building true reverse complements.
    _COMPLEMENT = str.maketrans("ACGTNacgtn", "TGCANtgcan")

    def __init__(self, contigs: List[str]):
        self.contigs = contigs  # List of contig sequences
        self.contig_positions = self._build_contig_map()
        self.scaffold_links = []

    def _build_contig_map(self) -> Dict[int, Dict]:
        """
        Build a map of contig IDs to their sequences and lengths.
        """
        return {
            cid: {'seq': contig, 'length': len(contig)}
            for cid, contig in enumerate(self.contigs)
        }

    @classmethod
    def _reverse_complement(cls, seq: str) -> str:
        """Return the reverse complement of seq (A<->T, C<->G, N kept)."""
        return seq.translate(cls._COMPLEMENT)[::-1]

    def find_read_positions(self, read_seq: str, contig_id: int,
                           min_match: int = 20) -> Optional[Tuple[int, int, bool]]:
        """
        Find where a read maps to a contig.

        Seed-based exact matching: the first min_match bases of the read (or
        of its reverse complement) must occur in the contig. The forward
        strand is tried first; the left-most hit wins.

        Args:
            read_seq: read sequence to place.
            contig_id: id of the contig to search.
            min_match: seed length; must be at least 1.

        Returns:
            (start_position, end_position, is_reverse_complement), where
            end_position = start_position + len(read_seq) and may exceed the
            contig length when the read overhangs the contig end. None when
            the read is empty or no seed hit exists.

        Raises:
            ValueError: if min_match is smaller than 1.
            KeyError: if contig_id is unknown.
        """
        contig = self.contig_positions[contig_id]['seq']
        if min_match < 1:
            raise ValueError("min_match must be at least 1")
        if not read_seq:
            return None  # an empty read would otherwise "match" at position 0

        # A seed may not start closer than min_match bases to the contig end.
        last_start = len(contig) - min_match
        candidates = (
            (read_seq, False),
            (self._reverse_complement(read_seq), True),
        )
        for candidate, is_reverse in candidates:
            hit = contig.find(candidate[:min_match])
            if 0 <= hit <= last_start:
                return (hit, hit + len(read_seq), is_reverse)

        return None

    def _map_read(self, read_seq: str,
                  min_match: int) -> Optional[Tuple[int, Tuple[int, int, bool]]]:
        """Return (contig_id, position) of the first contig the read maps to."""
        for cid in range(len(self.contigs)):
            pos = self.find_read_positions(read_seq, cid, min_match=min_match)
            if pos is not None:
                return cid, pos
        return None

    def _estimate_gap(self, insert_size: float,
                      contig1_id: int, pos1: Tuple[int, int, bool],
                      contig2_id: int, pos2: Tuple[int, int, bool]) -> float:
        """
        Estimate the gap between two contigs bridged by one read pair.

        The fragment starts at read 1, runs off the end of contig 1, crosses
        the gap and ends with read 2 inside contig 2, so

            gap = insert_size - (bases used in contig 1) - (bases used in contig 2)

        Both reads are taken to be forward-strand copies of the two fragment
        ends; a read that mapped as reverse complement means its contig is
        flipped and the fragment leaves/enters it from the other side.
        NOTE(review): real FR libraries deliver read 2 reverse complemented;
        flip rev2 before calling this for such data.

        A negative result means the contigs are estimated to overlap.
        """
        start1, end1, rev1 = pos1
        start2, end2, rev2 = pos2
        len1 = self.contig_positions[contig1_id]['length']
        len2 = self.contig_positions[contig2_id]['length']

        used_in_contig1 = end1 if rev1 else len1 - start1
        used_in_contig2 = len2 - start2 if rev2 else end2
        return insert_size - used_in_contig1 - used_in_contig2

    def build_scaffold_links(self, pairs: List["PairedEndRead"], min_support: int = 5,
                             min_match: int = 15):
        """
        Build scaffold links by finding pairs that map to different contigs.

        Each read is assigned to the first contig it maps to. Pairs bridging
        two different contigs contribute one gap estimate to that contig
        pair; contig pairs with at least min_support estimates become a
        ScaffoldLink (mean and standard deviation of the estimates) that is
        appended to self.scaffold_links.

        Args:
            pairs: read pairs to place.
            min_support: minimum number of bridging pairs for a link.
            min_match: seed length used when mapping reads.
        """
        pair_links = defaultdict(list)  # (contig_a, contig_b), a < b -> list of gap estimates

        for pair in pairs:
            hit1 = self._map_read(pair.read1_seq, min_match)
            hit2 = self._map_read(pair.read2_seq, min_match)
            if hit1 is None or hit2 is None:
                continue

            contig1_id, positions1 = hit1
            contig2_id, positions2 = hit2
            if contig1_id == contig2_id:
                continue  # pair lies within one contig, no linking information

            gap = self._estimate_gap(pair.insert_size_mean,
                                     contig1_id, positions1,
                                     contig2_id, positions2)
            key = tuple(sorted([contig1_id, contig2_id]))
            pair_links[key].append(gap)

        # Aggregate links
        for (c1, c2), gaps in pair_links.items():
            if len(gaps) >= min_support:
                link = ScaffoldLink(c1, c2, len(gaps),
                                    float(np.mean(gaps)), float(np.std(gaps)))
                self.scaffold_links.append(link)

    def build_scaffolds(self) -> List[List[int]]:
        """
        Build scaffolds (linear sequences of contigs) from links.

        Walks are started at chain ends (contigs with exactly one neighbour)
        before any other contig, so a chain a-b-c is reported as one scaffold
        instead of being split at whichever contig has the lowest id. At
        branching contigs the first unvisited neighbour is followed (greedy).

        Returns:
            List of scaffolds, each a list of contig ids in path order.
            Contigs without any link are not reported.
        """
        # Build adjacency graph (duplicate links collapse to one neighbour entry)
        adj = defaultdict(list)
        for link in self.scaffold_links:
            a, b = link.contig1_id, link.contig2_id
            if b not in adj[a]:
                adj[a].append(b)
            if a not in adj[b]:
                adj[b].append(a)

        linked = [cid for cid in range(len(self.contigs)) if adj.get(cid)]
        chain_ends = [cid for cid in linked if len(adj[cid]) == 1]
        others = [cid for cid in linked if len(adj[cid]) != 1]

        scaffolds = []
        visited = set()

        for start in chain_ends + others:
            if start in visited:
                continue

            path = [start]
            visited.add(start)
            current = start

            while True:
                next_node = next((n for n in adj[current] if n not in visited), None)
                if next_node is None:
                    break
                path.append(next_node)
                visited.add(next_node)
                current = next_node

            scaffolds.append(path)

        return scaffolds

# Test scaffolding
# Demo cell: tries to link three hand-written contigs with `test_pairs`.
print("\nScaffolding Example")
print("="*70)

# Create test contigs (broken by repeats)
# Each contig is a short unique prefix followed by ATCG repeats.
contig1 = "ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"
contig2 = "GGCGTAGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA"
contig3 = "TTACGTACGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT"
contigs = [contig1, contig2, contig3]

print(f"Test contigs:")
for i, c in enumerate(contigs):
    print(f"  Contig {i}: {len(c)}bp")

# Scaffold them
# NOTE: test_pairs was simulated from test_seq in the paired-end cell above,
# not from these contigs, so the number of links found depends on how the
# repeat-rich reads happen to seed-match the contigs.
scaffolder = Scaffolder(contigs)
scaffolder.build_scaffold_links(test_pairs, min_support=2)

print(f"\nFound {len(scaffolder.scaffold_links)} scaffold links")
for link in scaffolder.scaffold_links:
    print(f"  Contig {link.contig1_id} <-> Contig {link.contig2_id}: "
          f"{link.num_pairs} pairs, gap={link.gap_estimate:.0f}±{link.gap_std:.0f}bp")

scaffolds = scaffolder.build_scaffolds()
print(f"\nBuilt {len(scaffolds)} scaffold(s):")
for i, scaffold in enumerate(scaffolds):
    print(f"  Scaffold {i}: Contigs {scaffold}")

## 4. Gap Filling

In [None]:
class GapFiller:
    """
    Fill gaps between scaffolded contigs.

    Strategies:
    1. Assemble reads that map across the gap
    2. Use path-finding in the De Bruijn graph
    3. Interpolate with N's if assembly fails

    This class implements strategy 2 on an implicit k-mer graph: two k-mers
    are connected when they overlap by k-1 bases and both are present in the
    reads with enough coverage.

    Attributes:
        reads: read sequences the k-mer table is built from.
        k: k-mer size.
        kmers: k-mer -> number of occurrences in the reads.
    """

    def __init__(self, reads: List[str], k: int = 21):
        self.reads = reads
        self.k = k
        self.kmers = self._build_kmer_map()

    def _build_kmer_map(self) -> Dict[str, int]:
        """
        Build a map of k-mers to their frequencies.
        Reads shorter than k contribute nothing.
        """
        k = self.k
        kmer_map = defaultdict(int)
        for read in self.reads:
            for i in range(len(read) - k + 1):
                kmer_map[read[i:i + k]] += 1
        return kmer_map

    def find_path(self, start_kmer: str, end_kmer: str, max_length: int = 1000,
                 min_coverage: int = 2) -> Optional[str]:
        """
        Find a path in the De Bruijn graph from start_kmer to end_kmer.
        Uses BFS with coverage constraints, so the shortest path is found.

        Args:
            start_kmer: k-mer the path starts with.
            end_kmer: k-mer the path must end with.
            max_length: k-mers reached after more than this many single-base
                extensions are not extended further.
            min_coverage: minimum count a k-mer needs to be used.

        Returns:
            The spelled path INCLUDING both anchors: start_kmer followed by
            one base per step, ending in end_kmer. None if either anchor is
            missing or under-covered, or no path exists within the limit.
        """
        kmers = self.kmers
        if start_kmer not in kmers or end_kmer not in kmers:
            return None

        if kmers[start_kmer] < min_coverage or kmers[end_kmer] < min_coverage:
            return None  # Low coverage k-mers, likely errors

        if start_kmer == end_kmer:
            return start_kmer  # zero-step path

        # BFS with predecessor links instead of one full path string per queue entry.
        parent = {start_kmer: None}
        queue = deque([(start_kmer, 0)])

        while queue:
            current_kmer, steps = queue.popleft()

            if len(start_kmer) + steps > max_length + self.k:
                continue  # Path getting too long

            # Generate next k-mers by extending one base
            for base in ('A', 'T', 'G', 'C'):
                next_kmer = current_kmer[1:] + base

                if next_kmer in parent:
                    continue
                # .get() keeps the probe from inserting zero-count entries
                # into the defaultdict for every k-mer that is tried.
                if kmers.get(next_kmer, 0) < min_coverage:
                    continue

                parent[next_kmer] = current_kmer

                # Check if we reached the end
                if next_kmer == end_kmer:
                    appended = []
                    node = end_kmer
                    while node != start_kmer:
                        appended.append(node[-1])
                        node = parent[node]
                    return start_kmer + ''.join(reversed(appended))

                queue.append((next_kmer, steps + 1))

        return None

    def fill_gap(self, contig1: str, contig2: str, gap_size_estimate: int,
                gap_size_std: float) -> Optional[str]:
        """
        Attempt to fill a gap between two contigs.

        The last k bases of contig1 and the first k bases of contig2 serve as
        anchors; the search is limited to roughly the estimated gap size plus
        three standard deviations.

        Args:
            contig1: contig on the left of the gap.
            contig2: contig on the right of the gap.
            gap_size_estimate: expected gap length.
            gap_size_std: uncertainty of the estimate.

        Returns:
            Only the bases BETWEEN the two contigs (anchors stripped). An
            empty string when the contigs are adjacent or their anchors
            overlap. None when a contig is shorter than k or no path exists.

        NOTE: BFS returns the shortest connection; in repetitive regions that
        can be a shortcut through repeat k-mers rather than the true gap.
        """
        k = self.k
        if len(contig1) < k or len(contig2) < k:
            return None  # cannot form a full-length anchor

        # Get end of contig1 and start of contig2 as anchors
        end_anchor = contig1[-k:]
        start_anchor = contig2[:k]

        # Check if they're directly connected
        if end_anchor[1:] == start_anchor[:-1]:
            # Anchors already overlap by k-1 bases, nothing to fill
            return ""

        # Search for a path
        max_search = int(gap_size_estimate + 3*gap_size_std + 2*k)
        path = self.find_path(end_anchor, start_anchor, max_length=max_search)
        if path is None:
            return None

        if len(path) < 2 * k:
            return ""  # anchors overlap inside the path: no bases in between

        # Strip the two anchors, they already belong to the contigs.
        return path[k:len(path) - k]

# Test gap filling
# Demo cell: simulates reads over a known sequence and tries to recover the
# stretch between two "contigs" cut out of it.
print("\nGap Filling")
print("="*70)

# Create contigs with a gap
seq1 = "ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"
seq_gap = "TTTTAAAATTTTAAAA"  # True gap sequence
seq2 = "GATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"

full_seq = seq1 + seq_gap + seq2

# Generate reads from the full sequence (once).
# The insert size must fit into the 102 bp template: the generator's default
# of 500 bp is longer than full_seq and made the simulation fail. 80 bp
# fragments with 50 bp reads also place reads across the gap region.
gap_pairs = generate_paired_reads(full_seq, read_length=50, insert_size_mean=80,
                                  insert_size_std=8, coverage=10)
gap_reads = gap_pairs[0]  # first pair, kept under its original name
all_reads_gap = [p.read1_seq for p in gap_pairs]
all_reads_gap += [p.read2_seq for p in gap_pairs]

print(f"Contig 1 (end): ...{seq1[-30:]}")
print(f"True gap:       {seq_gap}")
print(f"Contig 2 (start): {seq2[:30]}...")
print(f"\nTotal gap size: {len(seq_gap)} bp")

# NOTE(review): both flanks consist of ATCG repeats, so the k-mer search may
# connect the anchors through repeat k-mers instead of the true gap; flanks
# with unique sequence would make this a cleaner demonstration.
filler = GapFiller(all_reads_gap, k=15)
filled_gap = filler.fill_gap(seq1, seq2, len(seq_gap), len(seq_gap) * 0.1)

# None means "no path found"; an empty string is a valid (zero-length) fill.
if filled_gap is not None:
    print(f"\nFilled gap: {filled_gap}")
    if filled_gap == seq_gap:
        print("✓ Perfect match!")
    else:
        print(f"✗ Gap mismatch (got {len(filled_gap)}bp instead of {len(seq_gap)}bp)")
else:
    print("Gap filling failed - would use N's as placeholder")
    print(f"Placeholder: {seq1}{'N'*len(seq_gap)}{seq2}")

## 5. Complete Assembly Pipeline

In [None]:
def write_fasta(contigs: List[Tuple[str, str]], filename: str):
    """
    Save (header, sequence) records to `filename` in FASTA format.

    Each record becomes a ">header" line followed by the sequence wrapped at
    80 characters per line. A record with an empty sequence produces only its
    header line. An existing file is overwritten.
    """
    line_width = 80
    with open(filename, 'w') as handle:
        for header, seq in contigs:
            handle.write(f">{header}\n")
            wrapped = (
                seq[offset:offset + line_width] + '\n'
                for offset in range(0, len(seq), line_width)
            )
            handle.writelines(wrapped)

def assemble_complete(reads: List[str], pairs: List[PairedEndRead], 
                     k: int = 21, output_prefix: str = "assembly") -> dict:
    """
    Complete assembly pipeline.

    Steps:
    1. Count the k-mers of all reads, build the De Bruijn graph and extract
       contigs (kept when >= 100 bp and average coverage >= 1.0).
    2. Link contigs into scaffolds with the paired reads (>= 3 pairs per link).
    3. Join the contigs of each scaffold with a fixed run of 10 N's
       (simplified, no actual gap filling). Contigs that are part of no
       scaffold are emitted unchanged so no assembled sequence is lost.
    4. Write everything to "<output_prefix>.fasta" and print a summary.

    Args:
        reads: read sequences used for the graph.
        pairs: read pairs used for scaffolding.
        k: k-mer size.
        output_prefix: path prefix of the FASTA file that is written.

    Returns:
        dict with keys 'raw_contigs', 'filtered_contigs', 'scaffolds',
        'final_sequences', 'total_length' and 'n50_estimate' (N50 of the
        final sequences; 0 when nothing was assembled).
    """
    stats = {}
    
    print(f"Complete Assembly Pipeline")
    print("="*70)
    print(f"\nInput:")
    print(f"  Reads: {len(reads)}")
    print(f"  Pairs: {len(pairs)}")
    
    # Step 1: Build De Bruijn graph and extract contigs
    print(f"\nStep 1: Extract contigs from De Bruijn graph...")
    extractor = ContigExtractor(k=k)
    
    from collections import Counter
    # Count every k-mer once up front, then add one edge per distinct k-mer
    # carrying its multiplicity as coverage.
    kmer_counts = Counter(
        read[i:i+k]
        for read in reads
        for i in range(len(read) - k + 1)
    )
    for kmer, multiplicity in kmer_counts.items():
        extractor.add_edge(kmer[:-1], kmer[1:], kmer[-1], count=multiplicity)
    
    raw_contigs = extractor.extract_contigs()
    contigs = extractor.filter_contigs(raw_contigs, min_length=100, min_coverage=1.0)
    print(f"  Generated {len(raw_contigs)} raw contigs")
    print(f"  Filtered to {len(contigs)} high-quality contigs")
    stats['raw_contigs'] = len(raw_contigs)
    stats['filtered_contigs'] = len(contigs)
    
    # Step 2: Scaffold with paired reads
    print(f"\nStep 2: Build scaffolds with paired-end reads...")
    contig_seqs = [seq for seq, _ in contigs]
    scaffolder = Scaffolder(contig_seqs)
    scaffolder.build_scaffold_links(pairs, min_support=3)
    scaffolds = scaffolder.build_scaffolds()
    print(f"  Found {len(scaffolder.scaffold_links)} scaffold links")
    print(f"  Built {len(scaffolds)} scaffolds")
    stats['scaffolds'] = len(scaffolds)
    
    # Step 3: Gap filling
    print(f"\nStep 3: Fill gaps...")
    final_sequences = []
    placed = set()
    for scaffold in scaffolds:
        placed.update(scaffold)
        # Join contigs with N's (simplified, no actual gap filling);
        # a single-contig scaffold is returned unchanged by join().
        parts = [contig_seqs[cid] for cid in scaffold]
        final_sequences.append('NNNNNNNNNN'.join(parts))
    
    # Contigs without any scaffold link still belong to the assembly.
    for cid, seq in enumerate(contig_seqs):
        if cid not in placed:
            final_sequences.append(seq)
    
    print(f"  Generated {len(final_sequences)} final sequences")
    stats['final_sequences'] = len(final_sequences)
    
    # Write output
    fasta_output = [(f"contig_{i}", seq) for i, seq in enumerate(final_sequences)]
    fasta_file = f"{output_prefix}.fasta"
    write_fasta(fasta_output, fasta_file)
    print(f"\nOutput written to {fasta_file}")
    
    # Summary statistics
    total_assembled = sum(len(seq) for seq in final_sequences)
    
    # N50: length of the sequence at which the running total of lengths,
    # taken from longest to shortest, first reaches half of the assembly.
    n50 = 0
    running_total = 0
    for length in sorted((len(seq) for seq in final_sequences), reverse=True):
        running_total += length
        if 2 * running_total >= total_assembled:
            n50 = length
            break
    
    print(f"\nSummary:")
    print(f"  Total length: {total_assembled} bp")
    print(f"  Number of sequences: {len(final_sequences)}")
    print(f"  N50: {n50} bp")
    stats['total_length'] = total_assembled
    stats['n50_estimate'] = n50
    
    return stats

# Run complete assembly
# Uses the pairs simulated in the paired-end cell: both mates of every pair
# serve as single reads for the graph, the pairs themselves for scaffolding.
print("\n" + "="*70)
assembly_reads = [r.read1_seq for r in test_pairs] + [r.read2_seq for r in test_pairs]
# Writes test_assembly.fasta into the current working directory.
assembly_stats = assemble_complete(assembly_reads, test_pairs, k=15, output_prefix="test_assembly")
print(f"\nAssembly complete!")

## 6. Output Formats

In [None]:
# Reference text for the reader; nothing is computed in this cell.
print("\nCommon Assembly Output Formats")
print("="*70)

print("""
1. FASTA Format (.fasta or .fa)
   - Simple text format with sequences
   - Header line starts with >
   - Sequences in 80bp lines (convention)

   Example:
   >contig_1 len=5000
   ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATC
   ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG
   ...

2. FASTQ Format (.fastq or .fq)
   - Sequences together with per-base quality scores
   - Four lines per record: @header, sequence, +, quality string
   - Older pipelines store qualities in a separate .qual file next to a FASTA

3. AGP Format (.agp)
   - Documents how contigs are placed in scaffolds
   - Shows gaps and contig orientation
   - Columns (component lines): object object_beg object_end part_number
     component_type component_id component_beg component_end orientation

4. GFF/GTF (with assembly info)
   - Can annotate contigs with features
   - Tracks which reads/contigs contributed

5. SAM/BAM
   - Maps original reads back to assembly
   - Useful for coverage analysis
""")

print("\nReading assembly files:")
print("="*70)

def read_fasta(filename: str) -> List[Tuple[str, str]]:
    """
    Parse a FASTA file into (header, sequence) tuples.

    The header is the text after '>' with surrounding whitespace removed; the
    sequence is the concatenation of all stripped lines up to the next header.
    Lines that appear before the first header are ignored.
    """
    records = []
    header = None
    chunks = []

    with open(filename, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not text.startswith('>'):
                chunks.append(text)
                continue

            # A new header closes the record collected so far, if any.
            if header is not None:
                records.append((header, ''.join(chunks)))
            header, chunks = text[1:], []

    # Flush the record that was still open at end of file.
    if header is not None:
        records.append((header, ''.join(chunks)))

    return records

# Read back the FASTA file written by the pipeline cell and list its records.
try:
    read_seqs = read_fasta("test_assembly.fasta")
    print(f"\nRead {len(read_seqs)} sequences from test_assembly.fasta")
    for header, seq in read_seqs:
        print(f"  {header}: {len(seq)} bp")
except FileNotFoundError:
    # Only a missing file is tolerated; any other error still surfaces.
    print("test_assembly.fasta not found (expected if assembly step was skipped)")

## Summary

**This notebook covered:**
- ✓ Contig extraction from De Bruijn graphs
- ✓ Paired-end reads and insert size estimation
- ✓ Scaffolding: using pairs to link contigs
- ✓ Gap filling: reconstructing sequences between contigs
- ✓ Output formats (FASTA, etc.)

**Key insights:**
- Paired-end reads resolve ambiguities that single reads can't
- Scaffolding creates longer sequences from shorter contigs
- Gap filling is challenging and often results in placeholders (N's)
- Modern assemblers often produce scaffolds, not complete genomes

**Next notebook (5) will cover:**
- Assembly quality metrics (N50, L50, etc.)
- Comparing assemblies
- Common issues and troubleshooting
- When to use different assemblers