Skip to content

Commit

Permalink
Merge pull request #64 from KatyBrown/roman3
Browse files Browse the repository at this point in the history
finished function for plotting residue frequencies
  • Loading branch information
KatyBrown committed May 16, 2024
2 parents 8667a73 + 2dabc36 commit 198a822
Show file tree
Hide file tree
Showing 77 changed files with 2,708 additions and 46 deletions.
32 changes: 32 additions & 0 deletions CIAlign/argP.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,28 @@ def getParser():
identical to or differ from the consensus. \
Default: %(default)s""")

optional.add("--plot_consensus_similarity",
dest='plot_consensus_similarity', action='store_true',
help="""Plot mini alignment showing \
positions based on their score when \
when compared via a substitution matrix to the consensus. \
Default: %(default)s""")

optional.add("--plot_substitution_matrix", dest='plot_substitution_matrix',
type=str, default='B',
help="""Substitution matrix to use for similarity plots. \
Default: %(default)s""")

optional.add("--plot_identity_palette", dest='plot_identity_palette',
type=str, default='terrain_r',
help="""Matplotlib palette name for identity mini alignments. \
Default: %(default)s""")

optional.add("--plot_similarity_palette", dest='plot_similarity_palette',
type=str, default='summer_r',
help="""Matplotlib palette name for similarity mini \
alignments. Default: %(default)s""")

optional.add("--plot_dpi", dest="plot_dpi",
type=int, default=300, metavar="(int)",
help="DPI for mini alignments. Default: %(default)s")
Expand Down Expand Up @@ -687,6 +709,16 @@ def getParser():
help="Width for statistics plots (inches). Default: \
%(default)s")

optional.add("--plot_stats_height_bar", dest="plot_stats_height_bar",
type=int, default=5, metavar="(int)",
help="Height for statistics bar plots (inches). Default: \
%(default)s")

optional.add("--plot_stats_width_bar", dest="plot_stats_width_bar",
type=int, default=3, metavar="(int)",
help="Width for statistics bar plots (inches). Default: \
%(default)s")

optional.add("--plot_stats_colour", dest="plot_stats_colour",
type=str, default='#007bf5', metavar="(string)",
help="Colour for statistics plots (hex code or name). \
Expand Down
233 changes: 198 additions & 35 deletions CIAlign/consensusSeq.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
import utilityFunctions
import os
import scipy.stats
import copy
import operator
import pandas as pd
matplotlib.use('Agg')


Expand Down Expand Up @@ -692,43 +695,203 @@ def calcConservationAli(alignment, typ):
return (heights, ents)


def compareAlignmentConsensus(arr):
def compareAlignmentConsensus(arr, typ, booleanOrSimilarity="Boolean",
MatrixName="B"):
'''
Compares the alignment of the inputted array to the consensus of that
array, and will either output a boolean array, or will use a matrix
(can be specified) to output an array containing the scores of the
sequence compared to the consensus.
Parameters
----------
arr: np.array
the sequence thats is to be aligned
typ: str
nt or aa
booleanOrSimilarity: str
boolean or similarity (default = 'Boolean')
MatrixName: str
the specified matrix name (default = 'B')
Returns
-------
new_arr: np.array
A boolean array containing the values of the
sequence compared to the consensus
new_Sarr: np.array
A integer array containing the values of the
sequence compared to the consensus via a matrix
'''
consensus, _ = np.array(findConsensus(arr, '',
consensus_type='majority_nongap') )
ci_dir = os.path.dirname(utilityFunctions.__file__)
matrix_dir = "%s/similarity_matrices" % (ci_dir)

if booleanOrSimilarity == "Boolean":
bool_array = np.array([])
bool_arrL = np.empty(dtype=bool, shape=(0, len(consensus)))
# declares the numpy arrays
for e in range(1, (len(arr[:,0])+1)):
# iterates over the rows of the sequences
z = e-1
for i in range(1, (len(arr[0,:])+1)):
# iterates over the columns of the sequences
x = i-1
if arr[z,x] == consensus[x]:
# verifies if the current value being iterated is equal to the
#equivalent value inline with the consensus
bool_array = np.append(bool_array, [True], axis=None)
else:
bool_array = np.append(bool_array, [False], axis=None)
bool_arrL = np.vstack([bool_arrL, bool_array])
bool_array = np.array([])

new_arr = copy.deepcopy(bool_arrL)
new_arr = bool_arrL.astype(bool)
# returns the new boolean array containing the verified alignment to
#the consensus
return new_arr
else:
# generates the consensus
Sarray = np.array([])
SarrL = np.empty(dtype=int, shape=(0, len(consensus)))
# declares the numpy arrays
tab = pd.read_csv("%s/matrices.txt" % ci_dir, sep="\t", index_col=0)
if typ == "aa":
# verifies if the typ is amino acid or nucleotide
if MatrixName != "B":
if tab.loc[MatrixName][0] != typ:
raise RuntimeError("This matrix is not valid")
# verifies if the matrix is valid
else:
# verifies if the user would like to use the default matrix or
#their own
mat = pd.read_csv("%s/%s" % (matrix_dir, MatrixName),
comment="#", sep="\s+")
elif MatrixName == "B":
mat = pd.read_csv("%s/BLOSUM62" % (matrix_dir),
comment="#", sep="\s+")
elif typ == "nt":
if MatrixName != "B":
if tab.loc[MatrixName][0] != typ:
raise RuntimeError("This matrix is not valid")
# verifies if the matrix is valid
else:
# verifies if the user would like to use the default
# matrix or their own
mat = pd.read_csv("%s/%s" % (matrix_dir, MatrixName),
comment="#", sep="\s+")
elif MatrixName == "B":
mat = pd.read_csv("%s/NUC.4.4" % (matrix_dir), comment="#",
sep="\s+")
for e in range(1, (len(arr[:,0])+1)):
# iterates over the rows of the sequences
z = e-1
for i in range(1, (len(arr[0,:])+1)):
# iterates over the columns of the sequences
x = i-1
if not arr[z, x] == "-":
if arr[z, x] == "U" and typ == 'nt':
thischar = "T"
else:
thischar = arr[z, x]
if consensus[x] == "U" and typ == 'nt':
conschar = "T"
else:
conschar = consensus[x]
score = mat.loc[thischar, conschar]
Sarray = np.append(Sarray, [score])
elif arr[z,x] == "-":
# sets the value of '-' as 0
Sarray = np.append(Sarray, 0)
SarrL = np.vstack([SarrL, Sarray])
Sarray = np.array([])
new_Sarr = copy.deepcopy(SarrL)
new_Sarr = SarrL.astype(int)
# returns the new similarity array containing the verified alignment
# to the consensus
return new_Sarr


def plotResidueFrequencies(arr, typ, outfile, dpi=300, width=3, height=5):
'''
Compares the alignment of the input array to the consensus of that array,
and outputs a boolean array.
Plots a bar chart/graph of the percantage of the occurance of bases in the
given sequence, also plots a graph of the comparison between the amount
of C/G bases present to the amount of A/T bases.
Parameters
----------
alignment: np.array
The alignment stored as a numpy array
-----------
arr: np.array
The sequence stored as a numpy array
Returns
-------
A numpy array stored as new_arr, which is a boolean array
comparing the arr to its consensus.
typ: str
nt or aa
Outputs
-----------
the bar charts/graphs
'''
consensus, _ = np.array(findConsensus(arr, '',
consensus_type='majority_nongap'))
bool_array = np.array([])
bool_arrL = np.empty(dtype=bool, shape=(0, len(consensus)))
# declares the numpy arrays
for e in range(1, (len(arr[:, 0])+1)):
# iterates over the rows of the sequences
z = e - 1
for i in range(1, (len(arr[0, :])+1)):
# iterates over the columns of the sequences
x = i - 1
if arr[z, x] == consensus[x]:
# verifies if the current value being iterated is equal to
# the equivalent value inline with the consensus
bool_array = np.append(bool_array, [True], axis=None)
else:
bool_array = np.append(bool_array, [False], axis=None)
bool_arrL = np.vstack([bool_arrL,
bool_array])
bool_array = np.array([])
new_arr = copy.deepcopy(bool_arrL)
new_arr = bool_arrL.astype(bool)
# returns the new boolean array containing the verified alignment
# to the consensus
return new_arr
if typ == "nt":
Slist = list(utilityFunctions.getNtColours().keys())[0:5]
Scols = list(utilityFunctions.getNtColours().values())[0:5]
elif typ == "aa":
Slist = list(utilityFunctions.getAAColours().keys())[0:21]
Scols = list(utilityFunctions.getAAColours().values())[0:21]
data = np.array([])
baseT = np.array([])
baseN = np.array([])
data2 = float(0)
data3 = float(0)
if typ == 'aa':
width = width * 3
f = plt.figure(figsize=(width, height))
plt.subplots_adjust(hspace=0.5)
for i in range(1, len(Slist)):
q = i - 1
if Slist[q] == 'C' or Slist[q] == 'G':
data2 = data2 + np.sum(arr == Slist[q])
if Slist[q] == 'A' or Slist[q] == 'T' or Slist[q] == 'U':
data3 = data3 + np.sum(arr == Slist[q])
if typ == 'nt' and Slist[q] == 'T':
thischar = "U"
else:
thischar = Slist[q]
data = (np.sum(arr == thischar) / np.size(arr))
baseT = np.append(baseT, data)
baseN = np.append(baseN, str(thischar))
if typ == 'nt':
subplot1 = f.add_subplot(2, 1, 1)
subplot1.set_xlabel("Nucleotide")
else:
subplot1 = f.add_subplot(1, 1, 1)
subplot1.set_xlabel("Amino Acid")
subplot1.set_ylabel("Proportion")
subplot1.bar(baseN, baseT, color=Scols, width=0.4)
subplot1.spines['right'].set_visible(False)
subplot1.spines['top'].set_visible(False)
subplot1.set_title("Proportion of Residues in Alignment")

if typ == 'nt':
subplot2 = f.add_subplot(2, 1, 2)
if "U" in arr:
label = "A or U"
subplot2.set_title("CG and AU Proportion")
else:
label = "A or T"
subplot2.set_title("CG and AT Proportion")
subplot2.bar(["C or G", label], [(data2 / (data3 + data2)),
(data3 / (data3 + data2))],
color="Orange", width=0.4)
subplot2.spines['right'].set_visible(False)
subplot2.spines['top'].set_visible(False)
subplot2.set_ylabel("Percentage")
subplot2.set_xlabel("Nucleotides")

plt.savefig(outfile, dpi=dpi, bbox_inches='tight')
plt.close()
73 changes: 73 additions & 0 deletions CIAlign/matrices.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
Matrix Type
BLOSUM100 aa
BLOSUM30 aa
BLOSUM35 aa
BLOSUM40 aa
BLOSUM45 aa
BLOSUM50 aa
BLOSUM55 aa
BLOSUM60 aa
BLOSUM62 aa
BLOSUM65 aa
BLOSUM70 aa
BLOSUM75 aa
BLOSUM80 aa
BLOSUM85 aa
BLOSUM90 aa
BLOSUMN aa
DAYHOFF aa
GONNET aa
IDENTITY aa
MATCH aa
NUC.4.4 nt
NUC.4.2 nt
PAM10 aa
PAM100 aa
PAM110 aa
PAM120 aa
PAM130 aa
PAM140 aa
PAM150 aa
PAM160 aa
PAM170 aa
PAM180 aa
PAM190 aa
PAM20 aa
PAM200 aa
PAM210 aa
PAM220 aa
PAM230 aa
PAM240 aa
PAM250 aa
PAM260 aa
PAM270 aa
PAM280 aa
PAM290 aa
PAM30 aa
PAM300 aa
PAM310 aa
PAM320 aa
PAM330 aa
PAM340 aa
PAM350 aa
PAM360 aa
PAM370 aa
PAM380 aa
PAM390 aa
PAM40 aa
PAM400 aa
PAM410 aa
PAM420 aa
PAM430 aa
PAM440 aa
PAM450 aa
PAM460 aa
PAM470 aa
PAM480 aa
PAM490 aa
PAM50 aa
PAM500 aa
PAM60 aa
PAM70 aa
PAM80 aa
PAM90 aa
16 changes: 13 additions & 3 deletions CIAlign/miniAlignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,10 @@ def drawMarkUpLegend(outfile, palette="CBS"):
def drawMiniAlignment(arr, nams, log, outfile, typ, plot_type='standard',
dpi=300, title=None, width=5, height=3, markup=False,
markupdict=None, ret=False, orig_nams=[],
keep_numbers=False, force_numbers=False, palette="CBS"):
keep_numbers=False, force_numbers=False, palette="CBS",
sub_matrix_name='B', plot_identity_palette='terrain_r',
plot_similarity_palette='summer_r',
plot_substitution_matrix='B'):
'''
Draws a "mini alignment" image showing a small representation of the
whole alignment so that gaps and poorly aligned regions are visible.
Expand Down Expand Up @@ -322,8 +325,15 @@ def drawMiniAlignment(arr, nams, log, outfile, typ, plot_type='standard',
if plot_type == 'standard':
arr2, cm = arrNumeric(arr, typ, palette)
elif plot_type == 'boolean':
arr2 = consensusSeq.compareAlignmentConsensus(arr)
cm = matplotlib.colormaps['jet']
arr2 = consensusSeq.compareAlignmentConsensus(
arr, typ=typ, booleanOrSimilarity="Boolean")
cm = matplotlib.colormaps[plot_identity_palette]
elif plot_type == 'similarity':
arr2 = consensusSeq.compareAlignmentConsensus(
arr, typ=typ, booleanOrSimilarity="similarity",
MatrixName=sub_matrix_name)
cm = matplotlib.colormaps[plot_similarity_palette]

# display it on the axis
a.imshow(arr2, cmap=cm, aspect='auto', interpolation='nearest')

Expand Down

0 comments on commit 198a822

Please sign in to comment.