Merge pull request #64 from KatyBrown/roman3

finished function for plotting residue frequencies
KatyBrown · May 16, 2024 · 198a822 · 198a822
2 parents 8667a73 + 2dabc36
commit 198a822
Show file tree

Hide file tree

Showing 77 changed files with 2,708 additions and 46 deletions.
diff --git a/CIAlign/argP.py b/CIAlign/argP.py
@@ -518,6 +518,28 @@ def getParser():
                  identical to or differ from the consensus. \
                  Default: %(default)s""")
 
+    optional.add("--plot_consensus_similarity",
+                  dest='plot_consensus_similarity', action='store_true',
+                 help="""Plot mini alignment showing \
+                 positions based on their score when \
+                 when compared via a substitution matrix to the consensus. \
+                 Default: %(default)s""")
+
+    optional.add("--plot_substitution_matrix", dest='plot_substitution_matrix',
+                 type=str, default='B',
+                 help="""Substitution matrix to use for similarity plots. \
+                 Default: %(default)s""")
+
+    optional.add("--plot_identity_palette", dest='plot_identity_palette',
+                 type=str, default='terrain_r',
+                 help="""Matplotlib palette name for identity mini alignments. \
+                 Default: %(default)s""")
+
+    optional.add("--plot_similarity_palette", dest='plot_similarity_palette',
+                 type=str, default='summer_r',
+                 help="""Matplotlib palette name for similarity mini \
+                 alignments. Default: %(default)s""")
+
     optional.add("--plot_dpi", dest="plot_dpi",
                  type=int, default=300, metavar="(int)",
                  help="DPI for mini alignments. Default: %(default)s")
@@ -687,6 +709,16 @@ def getParser():
                  help="Width for statistics plots (inches). Default: \
                        %(default)s")
 
+    optional.add("--plot_stats_height_bar", dest="plot_stats_height_bar",
+                 type=int, default=5, metavar="(int)",
+                 help="Height for statistics bar plots (inches). Default: \
+                       %(default)s")
+
+    optional.add("--plot_stats_width_bar", dest="plot_stats_width_bar",
+                 type=int, default=3, metavar="(int)",
+                 help="Width for statistics bar plots (inches). Default: \
+                       %(default)s")
+
     optional.add("--plot_stats_colour", dest="plot_stats_colour",
                  type=str, default='#007bf5', metavar="(string)",
                  help="Colour for statistics plots (hex code or name). \

diff --git a/CIAlign/consensusSeq.py b/CIAlign/consensusSeq.py
@@ -16,6 +16,9 @@
     import utilityFunctions
 import os
 import scipy.stats
+import copy
+import operator
+import pandas as pd
 matplotlib.use('Agg')
 
 
@@ -692,43 +695,203 @@ def calcConservationAli(alignment, typ):
     return (heights, ents)
 
 
-def compareAlignmentConsensus(arr):
+def compareAlignmentConsensus(arr, typ, booleanOrSimilarity="Boolean",
+                              MatrixName="B"):
+    '''
+    Compares the alignment of the inputted array to the consensus of that
+    array, and will either output a boolean array, or will use a matrix 
+    (can be specified) to output an array containing the scores of the 
+    sequence compared to the consensus.
+
+
+
+        Parameters
+        ----------
+        arr: np.array
+            the sequence thats is to be aligned
+
+        typ: str
+            nt or aa
+        
+        booleanOrSimilarity: str
+            boolean or similarity (default = 'Boolean')
+
+        MatrixName: str
+            the specified matrix name (default = 'B')
+            
+        Returns
+        -------
+        new_arr: np.array
+            A boolean array containing the values of the 
+                        sequence compared to the consensus
+        new_Sarr: np.array
+            A integer array containing the values of the 
+            sequence compared to the consensus via a matrix
+    '''
+    consensus, _ = np.array(findConsensus(arr, '',
+                                          consensus_type='majority_nongap')  )
+    ci_dir = os.path.dirname(utilityFunctions.__file__)
+    matrix_dir = "%s/similarity_matrices" % (ci_dir)
+
+    if booleanOrSimilarity == "Boolean":
+        bool_array = np.array([])
+        bool_arrL = np.empty(dtype=bool, shape=(0, len(consensus)))
+        # declares the numpy arrays
+        for e in range(1, (len(arr[:,0])+1)):
+            # iterates over the rows of the sequences
+            z = e-1
+            for i in range(1, (len(arr[0,:])+1)):
+                # iterates over the columns of the sequences
+                x = i-1
+                if arr[z,x] == consensus[x]:
+                    # verifies if the current value being iterated is equal to the
+                    #equivalent value inline with the consensus
+                    bool_array = np.append(bool_array, [True], axis=None)
+                else:
+                    bool_array = np.append(bool_array, [False], axis=None)
+            bool_arrL = np.vstack([bool_arrL, bool_array])
+            bool_array = np.array([])
+
+        new_arr = copy.deepcopy(bool_arrL)
+        new_arr = bool_arrL.astype(bool)
+        # returns the new boolean array containing the verified alignment to
+        #the consensus
+        return new_arr
+    else:
+        # generates the consensus
+        Sarray = np.array([])
+        SarrL = np.empty(dtype=int, shape=(0, len(consensus)))
+        # declares the numpy arrays
+        tab = pd.read_csv("%s/matrices.txt" % ci_dir, sep="\t", index_col=0)
+        if typ == "aa":
+          # verifies if the typ is amino acid or nucleotide
+          if MatrixName != "B":
+            if tab.loc[MatrixName][0] != typ:
+              raise RuntimeError("This matrix is not valid")
+              # verifies if the matrix is valid
+            else:
+              # verifies if the user would like to use the default matrix or
+              #their own
+              mat = pd.read_csv("%s/%s" % (matrix_dir, MatrixName),
+                                comment="#", sep="\s+")
+          elif MatrixName == "B":
+            mat = pd.read_csv("%s/BLOSUM62" % (matrix_dir),
+                              comment="#", sep="\s+")
+        elif typ == "nt":
+            if MatrixName != "B":
+                if tab.loc[MatrixName][0] != typ:
+                    raise RuntimeError("This matrix is not valid")
+                    # verifies if the matrix is valid
+                else:
+                    # verifies if the user would like to use the default
+                    # matrix or their own
+                    mat = pd.read_csv("%s/%s" % (matrix_dir, MatrixName),
+                                      comment="#", sep="\s+")
+            elif MatrixName == "B":
+                mat = pd.read_csv("%s/NUC.4.4" % (matrix_dir), comment="#",
+                                  sep="\s+")
+        for e in range(1, (len(arr[:,0])+1)):
+            # iterates over the rows of the sequences
+            z = e-1
+            for i in range(1, (len(arr[0,:])+1)):
+                #  iterates over the columns of the sequences
+                x = i-1
+                if not arr[z, x] == "-":
+                    if arr[z, x] == "U" and typ == 'nt':
+                        thischar = "T"
+                    else:
+                        thischar = arr[z, x]
+                    if consensus[x] == "U" and typ == 'nt':
+                        conschar = "T"
+                    else:
+                        conschar = consensus[x]
+                    score = mat.loc[thischar, conschar]
+                    Sarray = np.append(Sarray, [score])
+                elif arr[z,x] == "-":
+                      # sets the value of '-' as 0
+                      Sarray = np.append(Sarray, 0)
+            SarrL = np.vstack([SarrL, Sarray])
+            Sarray = np.array([])
+        new_Sarr = copy.deepcopy(SarrL)
+        new_Sarr = SarrL.astype(int)
+        # returns the new similarity array containing the verified alignment
+        # to the consensus
+        return new_Sarr
+
+
+def plotResidueFrequencies(arr, typ, outfile, dpi=300, width=3, height=5):
     '''
-    Compares the alignment of the input array to the consensus of that array,
-    and outputs a boolean array.
+    Plots a bar chart/graph of the percantage of the occurance of bases in the
+    given sequence, also plots a graph of the comparison between the amount
+    of C/G bases present to the amount of A/T bases.
 
     Parameters
-    ----------
-    alignment: np.array
-        The alignment stored as a numpy array
+    -----------
+    arr: np.array
+        The sequence stored as a numpy array
 
-    Returns
-    -------
-    A numpy array stored as new_arr, which is a boolean array
-    comparing the arr to its consensus.
+    typ: str
+        nt or aa
+
+    Outputs
+    -----------
+    the bar charts/graphs
     '''
-    consensus, _ = np.array(findConsensus(arr, '',
-                                          consensus_type='majority_nongap'))
-    bool_array = np.array([])
-    bool_arrL = np.empty(dtype=bool, shape=(0, len(consensus)))
-    # declares the numpy arrays
-    for e in range(1, (len(arr[:, 0])+1)):
-        # iterates over the rows of the sequences
-        z = e - 1
-        for i in range(1, (len(arr[0, :])+1)):
-            # iterates over the columns of the sequences
-            x = i - 1
-            if arr[z, x] == consensus[x]:
-                # verifies if the current value being iterated is equal to
-                # the equivalent value inline with the consensus
-                bool_array = np.append(bool_array, [True], axis=None)
-            else:
-                bool_array = np.append(bool_array, [False], axis=None)
-        bool_arrL = np.vstack([bool_arrL,
-                               bool_array])
-        bool_array = np.array([])
-    new_arr = copy.deepcopy(bool_arrL)
-    new_arr = bool_arrL.astype(bool)
-    # returns the new boolean array containing the verified alignment
-    # to the consensus
-    return new_arr
+    if typ == "nt":
+        Slist = list(utilityFunctions.getNtColours().keys())[0:5]
+        Scols = list(utilityFunctions.getNtColours().values())[0:5]
+    elif typ == "aa":
+        Slist = list(utilityFunctions.getAAColours().keys())[0:21]
+        Scols = list(utilityFunctions.getAAColours().values())[0:21]
+    data = np.array([])
+    baseT = np.array([])
+    baseN = np.array([])
+    data2 = float(0)
+    data3 = float(0)
+    if typ == 'aa':
+        width = width * 3
+    f = plt.figure(figsize=(width, height))
+    plt.subplots_adjust(hspace=0.5)
+    for i in range(1, len(Slist)):
+        q = i - 1
+        if Slist[q] == 'C' or Slist[q] == 'G':
+            data2 = data2 + np.sum(arr == Slist[q])
+        if Slist[q] == 'A' or Slist[q] == 'T' or Slist[q] == 'U':
+            data3 = data3 + np.sum(arr == Slist[q])
+        if typ == 'nt' and Slist[q] == 'T':
+            thischar = "U"
+        else:
+            thischar = Slist[q]
+        data = (np.sum(arr == thischar) / np.size(arr))
+        baseT = np.append(baseT, data)
+        baseN = np.append(baseN, str(thischar))
+    if typ == 'nt':
+        subplot1 = f.add_subplot(2, 1, 1)
+        subplot1.set_xlabel("Nucleotide")
+    else:
+        subplot1 = f.add_subplot(1, 1, 1)
+        subplot1.set_xlabel("Amino Acid")
+    subplot1.set_ylabel("Proportion")
+    subplot1.bar(baseN, baseT, color=Scols, width=0.4)
+    subplot1.spines['right'].set_visible(False)
+    subplot1.spines['top'].set_visible(False)
+    subplot1.set_title("Proportion of Residues in Alignment")
+
+    if typ == 'nt':
+        subplot2 = f.add_subplot(2, 1, 2)
+        if "U" in arr:
+            label = "A or U"
+            subplot2.set_title("CG and AU Proportion")
+        else:
+            label = "A or T"
+            subplot2.set_title("CG and AT Proportion")
+        subplot2.bar(["C or G", label], [(data2 / (data3 + data2)),
+                                         (data3 / (data3 + data2))],
+                     color="Orange", width=0.4)
+        subplot2.spines['right'].set_visible(False)
+        subplot2.spines['top'].set_visible(False)
+        subplot2.set_ylabel("Percentage")
+        subplot2.set_xlabel("Nucleotides")
+
+    plt.savefig(outfile, dpi=dpi, bbox_inches='tight')
+    plt.close()
diff --git a/CIAlign/matrices.txt b/CIAlign/matrices.txt
@@ -0,0 +1,73 @@
+Matrix	Type
+BLOSUM100	aa
+BLOSUM30	aa
+BLOSUM35	aa
+BLOSUM40	aa
+BLOSUM45	aa
+BLOSUM50	aa
+BLOSUM55	aa
+BLOSUM60	aa
+BLOSUM62	aa
+BLOSUM65	aa
+BLOSUM70	aa
+BLOSUM75	aa
+BLOSUM80	aa
+BLOSUM85	aa
+BLOSUM90	aa
+BLOSUMN	aa
+DAYHOFF	aa
+GONNET	aa
+IDENTITY	aa
+MATCH	aa
+NUC.4.4	nt
+NUC.4.2	nt
+PAM10	aa
+PAM100	aa
+PAM110	aa
+PAM120	aa
+PAM130	aa
+PAM140	aa
+PAM150	aa
+PAM160	aa
+PAM170	aa
+PAM180	aa
+PAM190	aa
+PAM20	aa
+PAM200	aa
+PAM210	aa
+PAM220	aa
+PAM230	aa
+PAM240	aa
+PAM250	aa
+PAM260	aa
+PAM270	aa
+PAM280	aa
+PAM290	aa
+PAM30	aa
+PAM300	aa
+PAM310	aa
+PAM320	aa
+PAM330	aa
+PAM340	aa
+PAM350	aa
+PAM360	aa
+PAM370	aa
+PAM380	aa
+PAM390	aa
+PAM40	aa
+PAM400	aa
+PAM410	aa
+PAM420	aa
+PAM430	aa
+PAM440	aa
+PAM450	aa
+PAM460	aa
+PAM470	aa
+PAM480	aa
+PAM490	aa
+PAM50	aa
+PAM500	aa
+PAM60	aa
+PAM70	aa
+PAM80	aa
+PAM90	aa
diff --git a/CIAlign/miniAlignments.py b/CIAlign/miniAlignments.py
@@ -250,7 +250,10 @@ def drawMarkUpLegend(outfile, palette="CBS"):
 def drawMiniAlignment(arr, nams, log, outfile, typ, plot_type='standard',
                       dpi=300, title=None, width=5, height=3, markup=False,
                       markupdict=None, ret=False, orig_nams=[],
-                      keep_numbers=False, force_numbers=False, palette="CBS"):
+                      keep_numbers=False, force_numbers=False, palette="CBS",
+                      sub_matrix_name='B', plot_identity_palette='terrain_r',
+                      plot_similarity_palette='summer_r',
+                      plot_substitution_matrix='B'):
     '''
     Draws a "mini alignment" image showing a small representation of the
     whole alignment so that gaps and poorly aligned regions are visible.
@@ -322,8 +325,15 @@ def drawMiniAlignment(arr, nams, log, outfile, typ, plot_type='standard',
     if plot_type == 'standard':
         arr2, cm = arrNumeric(arr, typ, palette)
     elif plot_type == 'boolean':
-        arr2 = consensusSeq.compareAlignmentConsensus(arr)
-        cm = matplotlib.colormaps['jet']
+        arr2 = consensusSeq.compareAlignmentConsensus(
+            arr, typ=typ, booleanOrSimilarity="Boolean")
+        cm = matplotlib.colormaps[plot_identity_palette]
+    elif plot_type == 'similarity':
+        arr2 = consensusSeq.compareAlignmentConsensus(
+            arr, typ=typ, booleanOrSimilarity="similarity",
+            MatrixName=sub_matrix_name)
+        cm = matplotlib.colormaps[plot_similarity_palette]
+
     # display it on the axis
     a.imshow(arr2, cmap=cm, aspect='auto', interpolation='nearest')