In [6]:
import os
import pandas as pd
from Bio import Phylo
from pathlib import Path
if not Path('jw_utils').exists():
    !git clone https://github.com/JonWinkelman/jw_utils.git
from jw_utils import itol_annotations as ia
from jw_utils import ncbi_utils as nu
from jw_utils import parse_fasta as pfa
if not Path('orthofinder_utils').exists():
    !git clone https://github.com/JonWinkelman/orthofinder_utils.git
from orthofinder_utils import dash_ortho_parser as dop
from orthofinder_utils import run_orthofinder as ru
import external_functions as ef

### Download proteomes and GFF files from NCBI datasets

In [7]:
# Genomes shown in the figure: NCBI assembly accession -> organism / strain label.
figure_genome_assemblies = {
    # Acinetobacter baumannii strains
    'GCF_001077675.1': 'Acinetobacter baumannii ATCC 17978-mff',
    'GCF_000018445.1': 'Acinetobacter baumannii ACICU',
    'GCF_000021245.2': 'Acinetobacter baumannii AB0057',
    'GCF_000770605.1': 'Acinetobacter baumannii AB5075',
    # Other Acinetobacter species
    'GCF_005281455.1': 'Acinetobacter nosocomialis M2',
    'GCF_002055515.1': 'Acinetobacter calcoaceticus CA16',
    'GCF_001682515.1': 'Acinetobacter gyllenbergii FMP01',
    'GCF_000413935.1': 'Acinetobacter colistiniresistens NIPH 2036',
    'GCF_000046845.1': 'Acinetobacter baylyi ADP1',
    'GCF_000368805.1': 'Acinetobacter johnsonii ANC 3681',
    'GCF_004208515.1': 'Acinetobacter halotolerans JCM 31009',
    # Non-Acinetobacter genome (presumably the outgroup -- confirm)
    'GCF_001105265.1': 'Yersinia enterocolitica SC9312-78',
}

# Unpacking a dict yields its keys in insertion order.
figure_accs = [*figure_genome_assemblies]

#### Download assembly summaries

In [8]:
# Fetch the NCBI assembly summaries for every accession used in the figure.
# The recorded output shows the helper creating a temporary file while it runs.
nu.download_assembly_summaries_from_list(figure_accs)
# Load the summaries into a DataFrame; 'summaries.json' is presumably the file
# written by the download call above -- TODO confirm against jw_utils.ncbi_utils.
summary_df = nu.make_summary_df('summaries.json')

temp_file created at /var/folders/vw/7lg51dfd3ql9g_j55xz_3dqh0000gn/T/tmpbjrkrvx7


## Download proteomes/GFFs and run OrthoFinder 2.5.5 on the proteomes

In [18]:
# Pull the genome packages for every figure accession from NCBI.
nu.download_genomes_from_acclist(figure_accs)

# Directory layout for the OrthoFinder analysis (all under orthofinder_analysis/data).
orthofinder_base_dir = Path('orthofinder_analysis')
orthofinder_data_dir = orthofinder_base_dir / 'data'
orthofinder_output_dir = orthofinder_data_dir / 'orthofinder_results'
proteomes_dir = orthofinder_data_dir / 'proteomes'
proteomes_dir.mkdir(parents=True, exist_ok=True)

# Copy the downloaded files into the proteomes folder. This stays the last
# expression so the cell displays the returned value (in the recorded run, the
# list of '<accession>.faa' file names).
nu.copy_ncbi_files('./ncbi_dataset/ncbi_dataset/data/', proteomes_dir)

temp_file created at /var/folders/vw/7lg51dfd3ql9g_j55xz_3dqh0000gn/T/tmp3gahml9y
Error: Collecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
[1A[2KCollecting 12 genome records [------------------------------------------------]   0% 0/12
Downloading: ncbi_dataset.zip    847B

['GCF_000046845.1.faa',
 'GCF_002055515.1.faa',
 'GCF_000021245.2.faa',
 'GCF_000770605.1.faa',
 'GCF_001077675.1.faa',
 'GCF_000018445.1.faa',
 'GCF_000413935.1.faa',
 'GCF_001105265.1.faa',
 'GCF_000368805.1.faa',
 'GCF_005281455.1.faa',
 'GCF_004208515.1.faa',
 'GCF_001682515.1.faa']

In [19]:
# Run OrthoFinder on the downloaded proteomes.
#
# Both directories are derived from `orthofinder_base_dir` instead of reading
# `proteomes_dir` / `orthofinder_output_dir`, because this notebook rebinds both
# of those names (here and in the parsing cell). Deriving them keeps this cell
# idempotent: re-running it always uses the same input folder and never nests
# 'Results_<name>/Results_<name>'.
append = "Acinetobacter"
orthofinder_input_dir = orthofinder_base_dir / 'data/proteomes'
orthofinder_parent_dir = orthofinder_base_dir / 'data/orthofinder_results'
ru.run_orthofinder(str(orthofinder_input_dir), o=orthofinder_parent_dir, n=append)

# Per the recorded log, results are written to '<o>/Results_<n>'; from here on
# `orthofinder_output_dir` points at that results folder.
orthofinder_output_dir = orthofinder_parent_dir / f'Results_{append}'

Running: orthofinder -f orthofinder_analysis/data/proteomes -o orthofinder_analysis/data/orthofinder_results -n Acinetobacter

OrthoFinder version 2.5.5 Copyright (C) 2014 David Emms

2025-09-24 10:57:25 : Starting OrthoFinder 2.5.5
14 thread(s) for highly parallel tasks (BLAST searches etc.)
1 thread(s) for OrthoFinder algorithm

Checking required programs are installed
----------------------------------------
Test can run "mcl -h" - ok
Test can run "fastme -i orthofinder_analysis/data/orthofinder_results/Results_Acinetobacter/WorkingDirectory/dependencies/SimpleTest.phy -o orthofinder_analysis/data/orthofinder_results/Results_Acinetobacter/WorkingDirectory/dependencies/SimpleTest.tre" - ok

Dividing up work for BLAST for parallel processing
--------------------------------------------------
2025-09-24 10:57:25 : Creating diamond database 1 of 12
2025-09-24 10:57:25 : Creating diamond database 2 of 12
2025-09-24 10:57:25 : Creating diamond database 3 of 12
2025-09-24 10:57:25 : Creati

### Parse OrthoFinder output to find HOGs (hierarchical orthogroups)

In [20]:
# Sub-folders of the OrthoFinder results directory that are filled in below.
summary_dir = orthofinder_output_dir / 'summary_data'
proteomes_dir = orthofinder_output_dir / 'proteomes'
gffs_dir = orthofinder_output_dir / 'gffs'

summary_dir.mkdir(exist_ok=True)
proteomes_dir.mkdir(exist_ok=True)
gffs_dir.mkdir(parents=True, exist_ok=True)

# Save the organism names as JSON (presumably keyed by assembly accession, as
# the file name suggests -- confirm against summary_df's index).
organism_names = summary_df['organism_name']
organism_names.to_json(summary_dir / 'AssemblyAccession_to_SpeciesName.json')

# Move the GFFs and proteomes out of the NCBI download folder into the results tree.
ncbi_data_dir = './ncbi_dataset/ncbi_dataset/data/'
nu.move_gffs(data_dir=ncbi_data_dir, new_gff_dir=gffs_dir)
nu.move_proteomes(ncbi_data_dir, proteomes_dir)

# Parser object used by the following cells to query the OrthoFinder results.
dop_obj = dop.DashOrthoParser(orthofinder_output_dir)



In [21]:
def assembly_acc_from_protein_id(protein_id, acc_len=15):
    """Return the assembly accession encoded at the start of a protein ID.

    Takes the first ``acc_len`` characters of ``protein_id`` and turns the last
    underscore in that prefix into a dot (e.g. 'GCF_001077675_1...' ->
    'GCF_001077675.1'). A prefix without an underscore is returned unchanged.
    """
    return '.'.join(protein_id[:acc_len].rsplit('_', 1))


# Annotation table for GCF_001077675.1 (A. baumannii ATCC 17978-mff).
genome_annotation_df = dop_obj.make_genome_annotation_df('GCF_001077675.1', get_common_names=True)

# Gene IDs of the ast genes in that genome.
astA = 'gene-ACX60_RS01825'

astR = 'gene-ACX60_RS12905'
astN = 'gene-ACX60_RS12900'
astO = 'gene-ACX60_RS12895'
astP = 'gene-ACX60_RS12890'

# Look up the HOG via the `astR` variable rather than repeating the literal,
# so changing the gene ID above actually changes the analysis.
astR_HOG = genome_annotation_df.set_index('Parents').loc[astR, 'HOGs']
astR_orthologs = dop_obj.all_prots_in_HOG(astR_HOG)

# Assembly accession of the genome each ortholog comes from.
astR_accs = [assembly_acc_from_protein_id(prot_id) for prot_id in astR_orthologs]
astR_proteins = dop_obj.get_HOG_protein_seqs(astR_HOG)
# Mapping of the protein table's index to the sequence string.
astR_seqs_d = astR_proteins['Protein_seq'].to_dict()

### Align with MUSCLE and run RAxML

In [25]:
# Working folder for the alignment and tree files.
tree_dir = Path('raxML_tree')
tree_dir.mkdir(exist_ok=True)

# Write the AstR ortholog sequences to a FASTA file inside that folder.
astR_seqs_fp = tree_dir.joinpath('astR_orthologs.faa')
pfa.write_to_fasta(astR_seqs_d, astR_seqs_fp)

In [26]:
# Alignment path: same location and stem as the FASTA, with '.muscle.aln' as
# the extension. `with_suffix` changes only the final suffix, whereas
# `str.replace('.faa', ...)` would also rewrite any '.faa' occurring elsewhere
# in the path (e.g. in a directory name).
muscle_out = str(astR_seqs_fp.with_suffix('.muscle.aln'))

# Align the sequences with MUSCLE. This stays the last expression so the cell
# displays the returned value (the alignment path in the recorded run).
ef.run_muscle(astR_seqs_d, muscle_out)

Running: muscle -in /var/folders/vw/7lg51dfd3ql9g_j55xz_3dqh0000gn/T/muscle_vy7fb4qb.fa -out raxML_tree/astR_orthologs.muscle.aln
MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

muscle_vy7fb4qb 12 seqs, lengths min 140, max 153, avg 141
00:00:00      2 MB(0%)  Iter   1    1.28%  K-mer dist pass 1
00:00:00      2 MB(0%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00      2 MB(0%)  Iter   1    1.28%  K-mer dist pass 2
00:00:00      2 MB(0%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00      2 MB(0%)  Iter   1    9.09%  Align node       
00:00:00      3 MB(0%)  Iter   1   18.18%  Align node
00:00:00      3 MB(0%)  Iter   1   27.27%  Align node
00:00:00      3 MB(0%)  Iter   1   36.36%  Align node
00:00:00      3 MB(0%)  Iter   1   45.45%  Align node
00:00:00      3 MB(0%)  Iter   1   54.55%  Align node
00:00:00      3 MB(0%)  Iter   1   63.64%  Align node
00:00:00 

'raxML_tree/astR_orthologs.muscle.aln'

In [27]:
# Output folder for the RAxML run.
raxml_out_dir = tree_dir / 'raxML_output'
# Build the tree from the MUSCLE alignment. Per the recorded log this expands to
# `raxmlHPC -T 8 -m PROTGAMMALG -p 12345 -x 12345 -# 100 -f a -s <alignment> -n AstR -w <out dir>`.
# NOTE(review): the same log reports that -T has no effect with the sequential/MPI
# version, so threads=8 appears to be ignored by this binary; a Pthreads build
# would be needed for multithreading -- confirm which RAxML binaries are installed.
ef.run_raxml(muscle_out, raxml_out_dir, prefix='AstR', threads=8, model='LG', raxml_bin='raxmlHPC')

Running: raxmlHPC -T 8 -m PROTGAMMALG -p 12345 -x 12345 -# 100 -f a -s /Users/jonathanwinkelman/Trestle/Palmer_lab/Geary_et_al_2025/raxML_tree/astR_orthologs.muscle.aln -n AstR -w /Users/jonathanwinkelman/Trestle/Palmer_lab/Geary_et_al_2025/raxML_tree/raxML_output
Option -T does not have any effect with the sequential or parallel MPI version.
It is used to specify the number of threads for the Pthreads-based parallelization
Keep in mind that RAxML only accepts absolute path names, not relative ones!

RAxML can't, parse the alignment file as phylip file 
it will now try to parse it as FASTA file




Found 1 sequence that is exactly identical to other sequences in the alignment.
Normally they should be excluded from the analysis.

Just in case you might need it, an alignment file with 
sequence duplicates removed is printed to file /Users/jonathanwinkelman/Trestle/Palmer_lab/Geary_et_al_2025/raxML_tree/astR_orthologs.muscle.aln.reduced


This is RAxML version 8.2.12 released by Alexandro

In [28]:
# Folder that collects the iTOL annotation files.
itol_annotations_dir = tree_dir / 'itol_annotations'
itol_annotations_dir.mkdir(exist_ok=True)

# Load the RAxML tree written by the previous step.
tree = Phylo.read(f'{raxml_out_dir}/RAxML_bipartitionsBranchLabels.AstR', 'newick')

# Map every leaf name to an organism name. The accession is recovered from the
# first 15 characters of the leaf name by turning the last underscore of that
# prefix into a dot (unchanged if the prefix has no underscore).
relable_d = {
    leaf.name: dop_obj.accession_to_name['.'.join(leaf.name[:15].rsplit('_', 1))]
    for leaf in tree.get_terminals()
}

# Write the iTOL relabelling file for the tree leaves.
itol_relable_out = itol_annotations_dir / 'RELABLE_RAxML_bipartitionsBranchLabels.astR'
ia.relabel_itol_treeleafs(tree, relable_d, itol_relable_out)