Skip to content

Commit

Permalink
Add Needleman-Wunsch alignment (#14)
Browse files Browse the repository at this point in the history
This is a special case in which no gaps are allowed in one of the
"sequences." The intent is to use this for aligning the TM helices of
GPCRs: the "reference sequence" is just the 14 residues at the start and
end of the reference structure's TM helices, and the "query sequence" is
the full sequence of another 7TM.

The core algorithm is supplied with a pairwise cost matrix. Convenience
methods allow you to compute the cost from sequence. Currently, cost can
be computed either from distance alone or distance + orientation of the
residues.
  • Loading branch information
timholy committed May 15, 2024
1 parent 76cb349 commit 490429f
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 35 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Rotations = "6038ab10-8711-5258-84ad-4b1120ba62dc"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Expand All @@ -37,6 +38,7 @@ LinearAlgebra = "1"
MIToS = "2"
MultivariateStats = "0.9, 0.10"
MutableConvexHulls = "0.2"
OffsetArrays = "1"
ProgressMeter = "1"
Rotations = "1"
StaticArrays = "1"
Expand Down
3 changes: 2 additions & 1 deletion src/GPCRAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ using MIToS.PDB

using MultivariateStats
using Distances
using OffsetArrays
using Hungarian
using ProgressMeter
using StaticArrays
Expand All @@ -32,7 +33,7 @@ export @res_str
export SequenceMapping, AccessionCode, MSACode
export species, uniprotX, query_uniprot_accession
export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain, findall_subseq
export align_to_axes, align_to_membrane
export align_to_axes, align_to_membrane, align_nw, align_ranges
export filter_species!, filter_long!, sortperm_msa, chimerax_script
export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates, alphacarbon_coordinates_matrix, mapclosest, chargelocations, positive_locations, negative_locations
export StructAlign, residueindex, ismapped
Expand Down
139 changes: 139 additions & 0 deletions src/align.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,41 @@
## Minimum-distance alignment

"""
align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping)
align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping)
Return a rotated and shifted version of `moving` so that the α-carbons of residues `moving[sm]` have least
mean square error deviation from positions `fixedpos` or those of residues `fixed`.
"""
function align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; seqname=nothing)
length(sm) == size(fixedpos, 2) || throw(DimensionMismatch("reference has $(size(fixedpos, 2)) positions, but `sm` has $(length(sm))"))
movres = moving[sm]
keep = map(!isnothing, movres)
if !all(keep)
@warn "missing $(length(keep)-sum(keep)) anchors for sequence $seqname"
end
movpos = residue_centroid_matrix(movres[keep])
fixedpos = fixedpos[:,keep]
refmean = mean(fixedpos; dims=2)
movmean = mean(movpos; dims=2)
R = MIToS.PDB.kabsch((fixedpos .- refmean)', (movpos .- movmean)')
return change_coordinates(moving, (coordinatesmatrix(moving) .- movmean') * R .+ refmean')
end
align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; kwargs...) =
align(residue_centroid_matrix(fixed), moving, sm; kwargs...)

function mapclosest(mapto::AbstractVector{PDBResidue}, mapfrom::AbstractVector{PDBResidue})
refcenters = residue_centroid_matrix(mapto)
seqcenters = residue_centroid_matrix(mapfrom)
D = pairwise(Euclidean(), refcenters, seqcenters; dims=2)
assignment, _ = hungarian(D)
return collect(zip(assignment, [j == 0 ? convert(eltype(D), NaN) : D[i, j] for (i, j) in enumerate(assignment)]))
# d, m = findmin(D; dims=1)
# return [mi[1] => di for (mi, di) in zip(vec(m), vec(d))]
end

## Align to membrane

"""
newchain = align_to_membrane(chain::AbstractVector{PDBResidue}, tms; extracellular=true)
Expand Down Expand Up @@ -81,3 +119,104 @@ function collect_leaflet_residues(chain, tms, extracellular)
end
return leaflet_e, leaflet_i
end

## One-sided Needleman-Wunsch alignment

@enum NWParent NONE DIAG LEFT
Base.show(io::IO, p::NWParent) = print(io, p == NONE ? 'X' :
p == DIAG ? '' :
p == LEFT ? '' : '?')
Base.show(io::IO, ::MIME"text/plain", p::NWParent) = show(io, p)

"""
ϕ = align_nw(D)
Given a penalty matrix `D` (e.g., a pairwise distance matrix), find
the optimal `ϕ` that minimizes the sum of `D[i, ϕ[i]]` for all `i`, subject
to the constraint that `ϕ[i+1] > ϕ[i]`. No gaps in `i` are allowed.
We require `size(D, 1) <= size(D, 2)`.
"""
function align_nw(D::AbstractMatrix)
m, n = size(D)
S, P = score_nw(D)
# Trace the path
ϕ = Vector{Int}(undef, m)
i, j = m, n
while i > 0
P[i,j] == DIAG && (ϕ[i] = j; i -= 1; j -= 1)
P[i,j] == LEFT && (j -= 1)
P[i,j] == NONE && break
end
return ϕ
end

"""
ϕ = align_nw(refseq, srcseq; mode=:distance)
Find the optimal `ϕ` matching `refseq[i]` to `srcseq[ϕ[i]]` for all `i`, subject
to the constraint that `ϕ[i+1] > ϕ[i]`.
"""
function align_nw(refseq::AbstractVector{PDBResidue}, srcseq::AbstractVector{PDBResidue}; mode::Symbol=Symbol("distance+scvector"))
supported_modes = (:distance, Symbol("distance+scvector"))
mode supported_modes || throw(ArgumentError("supported modes are $(supported_modes), got $mode"))

refcenters = residue_centroid_matrix(refseq)
srccenters = residue_centroid_matrix(srcseq)
D = pairwise(Euclidean(), refcenters, srccenters; dims=2)
if mode == Symbol("distance+scvector")
refsc = reduce(hcat, [scvector(r) for r in refseq])
srcsc = reduce(hcat, [scvector(r) for r in srcseq])
D = sqrt.(D.^2 + pairwise(SqEuclidean(), refsc, srcsc; dims=2))
end
return align_nw(D)
end

"""
seqtms = align_ranges(seq, ref, refranges::AbstractVector{<:AbstractUnitRange})
Transfer `refranges`, a list of reside index spans in `ref`, to `seq`. `seq` and
`ref` must be spatially aligned, and the assignment is made by minimizing
inter-chain distance subject to the constraint of preserving sequence order.
"""
function align_ranges(seq::AbstractVector{PDBResidue}, ref::AbstractVector{PDBResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
anchoridxs = sizehint!(Int[], length(refranges)*2)
for r in refranges
push!(anchoridxs, first(r), last(r))
end
issorted(anchoridxs) || throw(ArgumentError("`refranges` must be strictly increasing spans, got $refranges"))
ϕ = align_nw(ref[anchoridxs], seq; kwargs...)
return [ϕ[i]:ϕ[i+1] for i in 1:2:length(ϕ)]
end

function score_nw(D::AbstractMatrix{T}) where T
Base.require_one_based_indexing(D)
m, n = size(D)
m <= n || throw(ArgumentError("First dimension cannot be longer than the second"))
S = OffsetMatrix{T}(undef, 0:m, 0:n)
P = OffsetMatrix{NWParent}(undef, 0:m, 0:n)
# Initialize the scoring. We need to use all rows of D while always incrementing the columns.
# Give maximum badness to paths that don't do this.
S[:,0] .= typemax(T); S[0,:] .= zero(T)
P[:,0] .= NONE; P[0,:] .= NONE
for i = 1:m
for j = 1:i-1
S[i,j], P[i,j] = typemax(T), NONE
end
for j = n-m+i+1:n
S[i,j], P[i,j] = typemax(T), NONE
end
end
# Compute the score of each possible path
for i = 1:m, j = i:n-m+i
d = D[i,j]
sd, sl = S[i-1,j-1], S[i, j-1]
if sd == typemax(T)
S[i,j], P[i,j] = sl, LEFT
else
diag = sd + d
left = sl
S[i,j], P[i,j] = diag < left ? (diag, DIAG) : (left, LEFT)
end
end
return S, P
end
34 changes: 0 additions & 34 deletions src/analyze.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,40 +78,6 @@ function alphacarbon_coordinates_matrix(seq)
return hcat([[r.atoms[findfirst(a -> a.atom === "CA", r.atoms)].coordinates...] for r in seq]...)
end

"""
align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping)
align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping)
Return a rotated and shifted version of `moving` so that the α-carbons of residues `moving[sm]` have least
mean square error deviation from positions `fixedpos` or those of residues `fixed`.
"""
function align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; seqname=nothing)
length(sm) == size(fixedpos, 2) || throw(DimensionMismatch("reference has $(size(fixedpos, 2)) positions, but `sm` has $(length(sm))"))
movres = moving[sm]
keep = map(!isnothing, movres)
if !all(keep)
@warn "missing $(length(keep)-sum(keep)) anchors for sequence $seqname"
end
movpos = residue_centroid_matrix(movres[keep])
fixedpos = fixedpos[:,keep]
refmean = mean(fixedpos; dims=2)
movmean = mean(movpos; dims=2)
R = MIToS.PDB.kabsch((fixedpos .- refmean)', (movpos .- movmean)')
return change_coordinates(moving, (coordinatesmatrix(moving) .- movmean') * R .+ refmean')
end
align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; kwargs...) =
align(residue_centroid_matrix(fixed), moving, sm; kwargs...)

function mapclosest(mapto::AbstractVector{PDBResidue}, mapfrom::AbstractVector{PDBResidue})
refcenters = residue_centroid_matrix(mapto)
seqcenters = residue_centroid_matrix(mapfrom)
D = pairwise(Euclidean(), refcenters, seqcenters; dims=2)
assignment, _ = hungarian(D)
return collect(zip(assignment, [j == 0 ? convert(eltype(D), NaN) : D[i, j] for (i, j) in enumerate(assignment)]))
# d, m = findmin(D; dims=1)
# return [mi[1] => di for (mi, di) in zip(vec(m), vec(d))]
end

"""
chargelocations(pdb::AbstractVector{PDBResidue}; include_his::Bool=false)
Expand Down
27 changes: 27 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,33 @@ using Test
@test !occursin("matchmaker #2-2 to #1", script)
@test occursin("marker #2 position 5.0,4.0,3.0 radius 0.5 color blue", script)
end
@testset "Needleman-Wunsch" begin
D = [0 1 2 3 4; 1 0 1 2 3; 2 1 0 1 2; 3 2 1 0 1]
S, P = GPCRAnalysis.score_nw(D .+ 7)
@test S[end, end] == 28
@test sprint(show, MIME("text/plain"), P[1:4, 1:5]) == """
4×5 Matrix{GPCRAnalysis.NWParent}:
↖ ← X X X
X ↖ ← X X
X X ↖ ← X
X X X ↖ ←"""
ϕ = align_nw(D)
@test ϕ == [1, 2, 3, 4]
D = [0 1 2 3 4; 1 0 1 2 3; 2 1 0 1 2; 4 3 2 1 0]
ϕ = align_nw(D)
@test ϕ == [1, 2, 3, 5]
D = [0 1 2 3 4; 1 0 1 2 3; 3 2 1 0 1; 4 3 2 1 0]
ϕ = align_nw(D)
@test ϕ == [1, 2, 4, 5]
D = [1 0 1 2 3; 3 2 1 0 1; 4 3 2 1 0; 5 4 3 2 1]
ϕ = align_nw(D)
@test ϕ == [2, 3, 4, 5]
@test_throws "First dimension cannot be longer than the second" align_nw(rand(5, 4))

opsd = read("AF-P15409-F1-model_v4.pdb", PDBFile)
opsd_tms = [37:61, 74:96, 111:133, 153:173, 203:224, 253:274, 287:308]
@test align_ranges(opsd, opsd, opsd_tms) == opsd_tms
end
@testset "Pocket residues and features" begin
opsd = read("AF-P15409-F1-model_v4.pdb", PDBFile)
opsdr = [three2residue(r.id.name) for r in opsd]
Expand Down

0 comments on commit 490429f

Please sign in to comment.