In [1]:
# Install BLAST+ command-line tools and biopython
!apt-get install -y ncbi-blast+
!pip install biopython


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  liblmdb0 ncbi-data
The following NEW packages will be installed:
  liblmdb0 ncbi-blast+ ncbi-data
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 15.9 MB of archives.
After this operation, 71.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 liblmdb0 amd64 0.9.24-1build2 [47.6 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ncbi-data all 6.1.20170106+dfsg1-9 [3,519 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ncbi-blast+ amd64 2.12.0+ds-3build1 [12.3 MB]
Fetched 15.9 MB in 1s (12.3 MB/s)
Selecting previously unselected package liblmdb0:amd64.
(Reading database ... 121918 files and directories currently installed.)
Preparing to unpack .../liblmdb0_0.9.24-1build2_amd64.deb ...
Unpacking liblmdb0:amd64 (0.9.24-1build2) ...
Selec

In [2]:
#Download and setup a small BLAST protein database
# Here we use the example "nr" database. For practical purposes, download a smaller database if necessary.
!wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.00.tar.gz
!tar -xzvf nr.00.tar.gz

--2024-05-30 10:42:39--  ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.00.tar.gz
           => ‘nr.00.tar.gz’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 130.14.250.13, 2607:f220:41e:250::10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /blast/db ... done.
==> SIZE nr.00.tar.gz ... 32533643802
==> PASV ... done.    ==> RETR nr.00.tar.gz ... done.
Length: 32533643802 (30G) (unauthoritative)


2024-05-30 11:06:46 (21.5 MB/s) - Control connection closed.
Retrying.

--2024-05-30 11:21:48--  ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.00.tar.gz
  (try: 2) => ‘nr.00.tar.gz’
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /blast/db ... done.
==> SIZE nr.00.tar.gz ..

In [None]:
#Upload Your Protein FASTA File to Colab
from google.colab import files

# Upload the FASTA file (opens the browser's file picker)
uploaded = files.upload()

# An empty result means nothing was uploaded (e.g. the dialog was cancelled).
# Fail here with a clear message instead of a bare StopIteration below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a FASTA file.")

# Get the filename (the first entry if several files were selected)
fasta_file = next(iter(uploaded))


In [None]:
#Run the Python Script to Perform BLAST Searches
import os
import re
import subprocess
import tempfile

from Bio.Blast.Applications import NcbiblastpCommandline

def _safe_filename(seq_id):
    """Turn a FASTA header into a string that is safe to use as a file name.

    Only the first whitespace-delimited token of the header is kept, and every
    run of characters outside ``[A-Za-z0-9._-]`` is replaced by ``_``. Headers
    that contain nothing usable map to ``"sequence"``.
    """
    tokens = seq_id.split()
    first_token = tokens[0] if tokens else ""
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", first_token).strip("_")
    return cleaned or "sequence"


def batch_blast(fasta_file, output_dir, blast_db="nr"):
    """Run ``blastp`` once per record of a FASTA file.

    Each record is written to its own temporary query file and searched
    against ``blast_db``; the report is saved in BLAST XML format
    (``-outfmt 5``) as ``<output_dir>/<name>.xml``, where ``<name>`` is a
    file-system-safe form of the record's header. A failure for one record is
    printed and the remaining records are still processed.

    Args:
        fasta_file: Path of the FASTA file holding the query sequences.
        output_dir: Directory for the XML reports; created if missing.
        blast_db: Value passed to ``blastp -db``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Parse the FASTA file
    sequences = parse_fasta(fasta_file)

    used_names = set()
    for seq_id, sequence in sequences.items():
        # Headers often contain spaces, '|' or '/', none of which belong in a
        # file name. Distinct headers may sanitize to the same name, so add a
        # numeric suffix rather than overwrite an earlier report.
        base_name = _safe_filename(seq_id)
        file_stem = base_name
        duplicate_index = 1
        while file_stem in used_names:
            duplicate_index += 1
            file_stem = f"{base_name}_{duplicate_index}"
        used_names.add(file_stem)
        output_file = os.path.join(output_dir, f"{file_stem}.xml")

        # Create the query file in the system temp directory so it can never
        # collide with (and later delete) the user's input file.
        temp_fd, temp_fasta = tempfile.mkstemp(suffix=".fasta")
        try:
            with os.fdopen(temp_fd, 'w') as f:
                f.write(f">{seq_id}\n{sequence}\n")

            # Run BLAST. Passing an argument list (no shell) means header or
            # path characters are never interpreted as shell syntax.
            blastp_cline = [
                "blastp",
                "-query", temp_fasta,
                "-db", blast_db,
                "-out", output_file,
                "-outfmt", "5",
            ]
            try:
                process = subprocess.run(
                    blastp_cline, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except OSError as exc:
                # e.g. the blastp executable is not installed / not on PATH
                print(f"Error running BLAST for {seq_id}: {exc}")
                continue
            if process.returncode != 0:
                print(f"Error running BLAST for {seq_id}: {process.stderr.decode('utf-8', errors='replace')}")
        finally:
            # Clean up the temporary FASTA file even if BLAST or the write failed
            os.remove(temp_fasta)

def parse_fasta(input_file):
    """Read a FASTA file into a dict mapping header text to sequence.

    The key is everything after the leading ``>`` of a header line (with
    surrounding whitespace stripped); the value is the concatenation of the
    stripped lines that follow it, up to the next header. Lines that appear
    before the first header are ignored, and when a header occurs more than
    once the last record wins.
    """
    records = {}
    header = None
    chunks = []
    with open(input_file, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                # Sequence (or blank) line: accumulate for the current record
                chunks.append(text)
                continue
            # A new header closes the record collected so far, if any
            if header is not None:
                records[header] = ''.join(chunks)
            header = text[1:]
            chunks = []
    # Flush the final record, which has no following header to close it
    if header is not None:
        records[header] = ''.join(chunks)
    return records

# Specify the output directory (batch_blast creates it if it does not exist;
# one XML report per query sequence is written there)
output_dir = "blast_results"

# Run the batch BLAST function on the uploaded FASTA file, using the
# function's default database ("nr")
batch_blast(fasta_file, output_dir)

# Zip the results directory for download (-r recurses into the directory)
!zip -r blast_results.zip blast_results


In [None]:
# Step 5: Download the BLAST Results
# The previous cell already ran batch_blast() and created blast_results.zip,
# so this cell only sends that archive to the browser. Calling batch_blast()
# again here would repeat the entire (slow) search for every sequence.
files.download("blast_results.zip")
