<a href="https://colab.research.google.com/github/Gargi28-sketch/Gargi28-sketch/blob/main/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!apt-get install -y ncbi-blast+ mafft iqtree
!pip install biopython


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-lato libauthen-sasl-perl libclone-perl libdata-dump-perl
  libencode-locale-perl libfile-listing-perl libfont-afm-perl
  libhtml-form-perl libhtml-format-perl libhtml-parser-perl
  libhtml-tagset-perl libhtml-tree-perl libhttp-cookies-perl
  libhttp-daemon-perl libhttp-date-perl libhttp-message-perl
  libhttp-negotiate-perl libio-html-perl libio-socket-ssl-perl
  liblwp-mediatypes-perl liblwp-protocol-https-perl libmailtools-perl
  libnet-http-perl libnet-smtp-ssl-perl libnet-ssleay-perl libruby3.0
  libtry-tiny-perl liburi-perl libwww-perl libwww-robotrules-perl lynx
  lynx-common ncbi-data netbase perl-openssl-defaults rake ruby
  ruby-net-telnet ruby-rubygems ruby-webrick ruby-xmlrpc ruby3.0
  rubygems-integration
Suggested packages:
  libdigest-hmac-perl libgssapi-perl libcrypt-ssleay-perl libsub-name-perl
  libbusiness-isbn-p

In [2]:
import os
import tarfile
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO
from shutil import copyfile
from pathlib import Path


Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [3]:
# Step 2: Extract genome.tar
tar = tarfile.open("/content/drive/MyDrive/gnoms.tar")
tar.extractall("genomes")
tar.close()

print("Extracted genome contigs to 'genomes/'")


Extracted genome contigs to 'genomes/'


In [4]:
# Step 3: Create BLAST database from genome contigs
genome_dir = "/content/genomes"
blast_db = "genomes_db.fasta"

# Concatenate all fasta files into one for BLAST
with open(blast_db, "w") as outfile:
    for f in Path(genome_dir).rglob("*.fasta"):
        outfile.write(open(f).read())
    for f in Path(genome_dir).rglob("*.fa"):
        outfile.write(open(f).read())

!makeblastdb -in genomes_db.fasta -dbtype nucl -out genomes_db
print("BLAST database created.")




Building a new DB, current time: 07/07/2025 10:36:59
New DB name:   /content/genomes_db
New DB title:  genomes_db.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 29 sequences in 0.375354 seconds.


BLAST database created.


In [5]:
# Step 4: Run BLASTn for gene matching
query_file = "/content/drive/MyDrive/Gene_seq.txt"
output_file = "blast_output.xml"

blastn_cline = NcbiblastnCommandline(query=query_file, db="genomes_db", evalue=1e-5,
                                     outfmt=5, out=output_file, task="blastn", num_threads=2)
stdout, stderr = blastn_cline()
print("BLAST search completed.")


BLAST search completed.


In [6]:
# Step 5: Parse BLAST output and filter results
matches = {}
for result in SearchIO.parse(output_file, "blast-xml"):
    gene_id = result.id
    for hit in result.hits:
        hsp = hit.hsps[0]
        identity = (hsp.ident_num / hsp.aln_span) * 100
        coverage = (hsp.query_span / result.seq_len) * 100
        if identity >= 95 and coverage == 100:
            matches.setdefault(gene_id, []).append({
                "hit_id": hit.id,
                "identity": identity,
                "coverage": coverage,
                "query_start": hsp.query_start,
                "query_end": hsp.query_end,
                "hit_start": hsp.hit_start,
                "hit_end": hsp.hit_end
            })
print("Filtered high-identity matches found:")
for gene, hits in matches.items():
    print(f"{gene}: {len(hits)} hits")


Filtered high-identity matches found:
gapA: 4 hits
infB: 5 hits
mdh: 5 hits
pgi: 5 hits
phoE: 5 hits
rpoB: 5 hits
tonB: 5 hits


In [7]:
# Step 6: Extract matched sequences from genome
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

extracted_dir = "extracted_genes"
os.makedirs(extracted_dir, exist_ok=True)

genome_records = SeqIO.to_dict(SeqIO.parse("genomes_db.fasta", "fasta"))

for gene, hits in matches.items():
    records = []
    for idx, hit in enumerate(hits):
        seq = genome_records[hit['hit_id']].seq[hit['hit_start']:hit['hit_end']]
        if hit['hit_start'] > hit['hit_end']:
            seq = seq.reverse_complement()
        rec_id = f"{gene}_{idx}"
        records.append(SeqRecord(seq, id=rec_id, description=""))
    if records:
        SeqIO.write(records, os.path.join(extracted_dir, f"{gene}.fasta"), "fasta")
print(f"Extracted matching gene sequences saved in '{extracted_dir}/'")


Extracted matching gene sequences saved in 'extracted_genes/'


In [9]:
# Step 7: Align each gene using MAFFT
aligned_dir = "/content/extracted_genes"
os.makedirs(aligned_dir, exist_ok=True)

for gene_file in os.listdir(extracted_dir):
    input_path = os.path.join(extracted_dir, gene_file)
    output_path = os.path.join(aligned_dir, gene_file)
    !mafft --auto {input_path} > {output_path}
print("Gene alignments completed using MAFFT.")


outputhat23=16
treein = 0
compacttree = 0
minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16

Strategy:
 L-INS-i (Probably most accurate, very slow)
 Iterative refinement method (<16) with LOCAL pairwise alignment information

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.

outputhat23=16
treein = 0
compacttree = 0
minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16

Strategy:
 L-INS-i (Probably most accurate, very slow)
 Iterative refinement method (<16) with LOCAL pairw

In [10]:
# Step 8: Concatenate aligned genes for phylogeny
from Bio.Align import MultipleSeqAlignment

gene_order = ['gapA', 'infB', 'mdh', 'pgi', 'phoE', 'rpoB', 'tonB']
combined_records = {}

for gene in gene_order:
    align_path = os.path.join(aligned_dir, f"{gene}.fasta")
    alignment = list(SeqIO.parse(align_path, "fasta"))
    for rec in alignment:
        if rec.id not in combined_records:
            combined_records[rec.id] = ""
        combined_records[rec.id] += str(rec.seq)

# Save concatenated alignment
concat_file = "concatenated_alignment.fasta"
with open(concat_file, "w") as out_f:
    for id, seq in combined_records.items():
        out_f.write(f">{id}\n{seq}\n")
print(f"Concatenated alignment saved as {concat_file}")


Concatenated alignment saved as concatenated_alignment.fasta


In [11]:
!apt-get install -y fasttree


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fasttree
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 174 kB of archives.
After this operation, 499 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fasttree amd64 2.1.11-2 [174 kB]
Fetched 174 kB in 0s (942 kB/s)
Selecting previously unselected package fasttree.
(Reading database ... 130373 files and directories currently installed.)
Preparing to unpack .../fasttree_2.1.11-2_amd64.deb ...
Unpacking fasttree (2.1.11-2) ...
Setting up fasttree (2.1.11-2) ...
Processing triggers for man-db (2.10.2-1) ...


In [12]:
# Build ML tree from your concatenated alignment
!FastTree -nt /content/concatenated_alignment.fasta > ml_tree.nwk


FastTree Version 2.1.11 Double precision (No SSE3)
Alignment: /content/concatenated_alignment.fasta
Nucleotide distances: Jukes-Cantor Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (2 rounds range 10) +ML-NNI opt-each=1
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jukes-Cantor, CAT approximation with 20 rate categories
Error reading header line


In [13]:
#Q1) Are all 7 genes present across all the genomes provided?
from collections import defaultdict

# Get a list of genome IDs (based on contig headers)
genome_ids = set()
for record in SeqIO.parse("/content/genomes_db.fasta", "fasta"):
    genome_ids.add(record.id.split("_")[0])  # adjust if ID format differs

# Check if each gene is found in each genome
gene_presence = defaultdict(set)

for gene, hits in matches.items():
    for hit in hits:
        genome_id = hit["hit_id"].split("_")[0]
        gene_presence[gene].add(genome_id)

print("Gene presence summary:")
for gene in ['gapA', 'infB', 'mdh', 'pgi', 'phoE', 'rpoB', 'tonB']:
    genomes_with_gene = gene_presence[gene]
    print(f"{gene}: Present in {len(genomes_with_gene)} / {len(genome_ids)} genomes")


Gene presence summary:
gapA: Present in 4 / 5 genomes
infB: Present in 5 / 5 genomes
mdh: Present in 5 / 5 genomes
pgi: Present in 5 / 5 genomes
phoE: Present in 5 / 5 genomes
rpoB: Present in 5 / 5 genomes
tonB: Present in 5 / 5 genomes


In [14]:
#Q2) Can you comment upon the presence of paralogs for these genes?
# Count number of hits per gene per genome
paralog_check = defaultdict(lambda: defaultdict(int))

for gene, hits in matches.items():
    for hit in hits:
        genome_id = hit["hit_id"].split("_")[0]
        paralog_check[gene][genome_id] += 1

# Report genomes with multiple copies of any gene
print("Paralog presence:")
for gene in paralog_check:
    for genome_id, count in paralog_check[gene].items():
        if count > 1:
            print(f"{gene} has {count} copies in genome {genome_id}")


Paralog presence:


In [16]:
#Can you write a program to identify all genes other than the
#7 genes listed that are conserved and occur only once across the provided genomes?
#How many genes were you able to list?
from Bio import SeqIO
from collections import defaultdict

# 7 known genes to exclude
seven_genes = {'gapA', 'infB', 'mdh', 'pgi', 'phoE', 'rpoB', 'tonB'}

# Step 1: Read all genes
all_genes = list(SeqIO.parse("/content/concatenated_alignment.fasta", "fasta"))

# Step 2: Cluster genes by sequence
clusters = defaultdict(list)
genomes = set()

for record in all_genes:
    try:
        gene_name, genome_id = record.id.split("|")
    except ValueError:
        print(f"Invalid record ID format: {record.id}")
        continue
    if gene_name in seven_genes:
        continue  # Skip known marker genes
    clusters[str(record.seq)].append(genome_id)
    genomes.add(genome_id)

# Step 3: Identify conserved, single-copy clusters
conserved_genes = []
for seq, genome_list in clusters.items():
    genome_set = set(genome_list)
    if len(genome_set) == len(genomes) and all(genome_list.count(g) == 1 for g in genome_set):
        conserved_genes.append(seq)

print(f"\n✅ Total conserved single-copy genes (excluding the 7 known genes): {len(conserved_genes)}")



✅ Total conserved single-copy genes (excluding the 7 known genes): 0
