In [1]:
# The URL points to a .gz archive, while the cells below open Uniref50.fasta in
# text mode. Decompress on the fly so the saved file is plain-text FASTA.
!curl https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz | gunzip -c > Uniref50.fasta

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13.7G  100 13.7G    0     0  15.4M      0  0:15:08  0:15:08 --:--:-- 17.9M  13.6M      0  0:17:05  0:00:23  0:16:42 13.3M04  0:00:33  0:16:31 13.7M3.9M      0  0:16:44  0:00:44  0:16:00 14.7M4.8M      0  0:15:45  0:01:01  0:14:44 18.0M0  14.9M      0  0:15:42  0:01:03  0:14:39 17.9M  0:01:11  0:14:03 17.7M5.4M      0  0:15:11  0:01:14  0:13:57 17.1M  0  15.9M      0  0:14:41  0:01:47  0:12:54 18.6M 0:02:13  0:12:14 18.9M0     0  16.9M      0  0:13:47  0:02:52  0:10:55 15.2M 0     0  16.9M      0  0:13:50  0:03:28  0:10:22 19.0M 17.0M      0  0:13:41  0:03:58  0:09:43 19.3M      0  0:13:37  0:04:11  0:09:26 18.5M 0  0:13:38  0:04:22  0:09:16 16.1M.1M      0  0:13:38  0:04:31  0:09:07 17.6M  0     0  17.0M      0  0:13:43  0:05:18  0:08:25 17.5M  0:05:36  0:08:11 15.7M  17.0M      0  0:13:45  0:05:45  0:08:00 19.1M0:05:56  0:07:

In [8]:
def _iter_fasta_records(filepath):
    """Yield ``(header, sequence)`` for every record of a FASTA file, in file order.

    ``header`` is the text after ``>`` with surrounding whitespace stripped;
    ``sequence`` is the concatenation of the record's stripped sequence lines.
    Lines that appear before the first header are ignored.
    """
    header = None
    chunks = []
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('>'):
                if header is not None:
                    yield header, ''.join(chunks)
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
        if header is not None:
            yield header, ''.join(chunks)


def select_proteins_from_fasta(filepath, num_areas=5, proteins_per_area=20, max_length=100, total_proteins=100):
    """
    Select `total_proteins` short proteins spread over several regions of a FASTA file.

    Records whose sequence length is < `max_length` are split, in file order,
    into `num_areas` contiguous areas (the last area also takes the remainder
    of the integer division). Sampling runs in rounds: each round takes the
    next `proteins_per_area` not-yet-visited records from every area in turn,
    until `total_proteins` records have been collected. Windows never cross an
    area boundary, so no record is ever selected twice.

    Args:
        filepath: Path to a plain-text FASTA file.
        num_areas: Number of contiguous areas to sample from (>= 1).
        proteins_per_area: Records taken from each area per round (>= 1).
        max_length: Only records with sequence length < max_length qualify.
        total_proteins: Number of records to return.

    Returns:
        ``(names, seqs)``: two parallel lists in file order. Names are the
        header text with spaces replaced by underscores.

    Raises:
        ValueError: If `num_areas` or `proteins_per_area` is < 1, or if the
            file holds fewer than `total_proteins` qualifying records.
    """
    if num_areas < 1:
        raise ValueError("num_areas must be at least 1.")
    if proteins_per_area < 1:
        # A zero step would never advance the sampling offset.
        raise ValueError("proteins_per_area must be at least 1.")

    # First pass: remember only the record indices that pass the length filter.
    valid_indices = [
        idx
        for idx, (_, seq) in enumerate(_iter_fasta_records(filepath))
        if len(seq) < max_length
    ]
    if len(valid_indices) < total_proteins:
        raise ValueError(f"Not enough proteins < {max_length} residues in file.")

    # Partition the qualifying records into areas; the last one absorbs the
    # remainder so that every qualifying record stays reachable.
    n_valid = len(valid_indices)
    area_size = n_valid // num_areas
    area_bounds = [
        (area * area_size, n_valid if area == num_areas - 1 else (area + 1) * area_size)
        for area in range(num_areas)
    ]
    longest_area = n_valid - (num_areas - 1) * area_size

    selected_indices = []
    offset = 0
    while len(selected_indices) < total_proteins and offset < longest_area:
        for area_begin, area_end in area_bounds:
            start = area_begin + offset
            # Clamp to this area's end: spilling into the next area would
            # re-select records that area contributes itself.
            end = min(start + proteins_per_area, area_end)
            selected_indices.extend(valid_indices[start:end])
            if len(selected_indices) >= total_proteins:
                break
        offset += proteins_per_area
    selected_indices = selected_indices[:total_proteins]

    if not selected_indices:
        return [], []

    # Second pass: extract the chosen records, stopping after the last one.
    selected_set = set(selected_indices)
    last_wanted = max(selected_set)
    selected_names = []
    selected_seqs = []
    for idx, (header, seq) in enumerate(_iter_fasta_records(filepath)):
        if idx in selected_set:
            selected_names.append(header.replace(' ', '_'))
            selected_seqs.append(seq)
        if idx >= last_wanted:
            break
    return selected_names, selected_seqs

In [9]:
# Example usage: sample short proteins from the downloaded UniRef50 FASTA.
# NUM_AREAS is passed explicitly so the summary message always reports the
# value that was actually used.
NUM_AREAS = 5
selected_names, selected_seqs = select_proteins_from_fasta('Uniref50.fasta', num_areas=NUM_AREAS)
print(f"Selected {len(selected_seqs)} proteins from {NUM_AREAS} areas.")

Selected 100 proteins from 5 areas.


In [None]:
def save_to_fasta(names, seqs, out_filepath, line_width=60):
    """Write name/sequence pairs to `out_filepath` in FASTA format.

    Each record is written as a ``>name`` header line followed by the
    sequence wrapped at `line_width` characters per line. An existing file at
    `out_filepath` is overwritten.

    Args:
        names: Iterable of record names (written verbatim after ``>``).
        seqs: Iterable of sequence strings, paired with `names` via ``zip``
            (extra entries in the longer input are ignored).
        out_filepath: Destination path.
        line_width: Maximum number of sequence characters per line (>= 1).

    Raises:
        ValueError: If `line_width` is less than 1.
    """
    # Validate before opening so a bad width cannot truncate an existing file.
    if line_width < 1:
        raise ValueError("line_width must be at least 1.")
    with open(out_filepath, 'w') as f:
        for name, seq in zip(names, seqs):
            f.write(f">{name}\n")
            # Wrap the sequence into fixed-width lines for readability.
            for i in range(0, len(seq), line_width):
                f.write(seq[i:i + line_width] + "\n")

# Persist the sampled records next to the notebook.
save_to_fasta(
    names=selected_names,
    seqs=selected_seqs,
    out_filepath="selected_proteins.fasta",
)