incorporating new similarity matrix plotting function

KatyBrown · May 16, 2024 · 9dca5d9 · 9dca5d9
1 parent 67c139c
commit 9dca5d9
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 39 deletions.
diff --git a/CIAlign/argP.py b/CIAlign/argP.py
@@ -516,11 +516,30 @@ def getParser():
                  action="store_true",
                  help="""Plot a mini alignment showing positions which are \
                  identical to or differ from the consensus. \
-                 Also has the option to plot mini alignment showing \
-                 positions based on their numerical values when \
-                 when compared via a matrix to the consensus. \
                  Default: %(default)s""")
 
+    optional.add("--plot_consensus_similarity",
+                  dest='plot_consensus_similarity', action='store_true',
+                 help="""Plot mini alignment showing \
+                 positions based on their score when \
+                 when compared via a substitution matrix to the consensus. \
+                 Default: %(default)s""")
+
+    optional.add("--plot_substitution_matrix", dest='plot_substitution_matrix',
+                 type=str, default='B',
+                 help="""Substitution matrix to use for similarity plots. \
+                 Default: %(default)s""")
+
+    optional.add("--plot_identity_palette", dest='plot_identity_palette',
+                 type=str, default='terrain_r',
+                 help="""Matplotlib palette name for identity mini alignments. \
+                 Default: %(default)s""")
+
+    optional.add("--plot_similarity_palette", dest='plot_similarity_palette',
+                 type=str, default='summer_r',
+                 help="""Matplotlib palette name for similarity mini \
+                 alignments. Default: %(default)s""")
+
     optional.add("--plot_dpi", dest="plot_dpi",
                  type=int, default=300, metavar="(int)",
                  help="DPI for mini alignments. Default: %(default)s")

diff --git a/CIAlign/consensusSeq.py b/CIAlign/consensusSeq.py
@@ -18,6 +18,7 @@
 import scipy.stats
 import copy
 import operator
+import pandas as pd
 matplotlib.use('Agg')
 
 
@@ -694,79 +695,105 @@ def calcConservationAli(alignment, typ):
     return (heights, ents)
 
 
-def compareAlignmentConsensus(arr, typ, booleanOrSimilarity="Boolean", MatrixName="B"):
-    consensus, _ = np.array(findConsensus(arr, '', consensus_type='majority_nongap')  )
+def compareAlignmentConsensus(arr, typ, booleanOrSimilarity="Boolean",
+                              MatrixName="B"):
+    consensus, _ = np.array(findConsensus(arr, '',
+                                          consensus_type='majority_nongap')  )
+    ci_dir = os.path.dirname(utilityFunctions.__file__)
+    matrix_dir = "%s/similarity_matrices" % (ci_dir)
+
     if booleanOrSimilarity == "Boolean":
         '''
-        Compares the alignment of the inputted array to the consensus of that array, and outputs a boolean array.
+        Compares the alignment of the inputted array to the consensus of that
+        array, and outputs a boolean array.
 
         alignment: arr
           The alignment stored as a numpy array
 
         return:
-        a numpy array stored as new_arr, which is a boolean array comparing the arr to the consensus of it.
+        a numpy array stored as new_arr, which is a boolean array comparing the
+        arr to the consensus of it.
         '''
         bool_array = np.array([])
         bool_arrL = np.empty(dtype=bool, shape=(0, len(consensus)))
         # declares the numpy arrays
         for e in range(1, (len(arr[:,0])+1)):
             # iterates over the rows of the sequences
             z = e-1
-        for i in range(1, (len(arr[0,:])+1)):
-            # iterates over the columns of the sequences
-            x = i-1
-            if arr[z,x] == consensus[x]:
-                # verifies if the current value being iterated is equal to the equivalent value inline with the consensus
-                bool_array = np.append(bool_array, [True], axis=None)
-            else:
-                bool_array = np.append(bool_array, [False], axis=None)
-        bool_arrL = np.vstack([bool_arrL, bool_array])
-        bool_array = np.array([])
+            for i in range(1, (len(arr[0,:])+1)):
+                # iterates over the columns of the sequences
+                x = i-1
+                if arr[z,x] == consensus[x]:
+                    # verifies if the current value being iterated is equal to the
+                    #equivalent value inline with the consensus
+                    bool_array = np.append(bool_array, [True], axis=None)
+                else:
+                    bool_array = np.append(bool_array, [False], axis=None)
+            bool_arrL = np.vstack([bool_arrL, bool_array])
+            bool_array = np.array([])
+
         new_arr = copy.deepcopy(bool_arrL)
         new_arr = bool_arrL.astype(bool)
-        # returns the new boolean array containing the verified alignment to the consensus
+        # returns the new boolean array containing the verified alignment to
+        #the consensus
         return new_arr
     else:
         # generates the consensus
         Sarray = np.array([])
         SarrL = np.empty(dtype=int, shape=(0, len(consensus)))
         # declares the numpy arrays
-        tab = pd.read_csv("roman_work_experience/matrices.txt", sep="\t", index_col=0)
+        tab = pd.read_csv("%s/matrices.txt" % ci_dir, sep="\t", index_col=0)
         if typ == "aa":
           # verifies if the typ is amino acid or nucleotide
           if MatrixName != "B":
             if tab.loc[MatrixName][0] != typ:
               raise RuntimeError("This matrix is not valid")
               # verifies if the matrix is valid
             else:
-              # verifies if the user would like to use the default matrix or their own
-              mat = pd.read_csv(("%s/similarity_matrices/"+MatrixName) % mydir, comment="#", sep="\s+")
+              # verifies if the user would like to use the default matrix or
+              #their own
+              mat = pd.read_csv("%s/%s" % (matrix_dir, MatrixName),
+                                comment="#", sep="\s+")
           elif MatrixName == "B":
-            mat = pd.read_csv("%s/similarity_matrices/BLOSUM62" % mydir, comment="#", sep="\s+")
+            mat = pd.read_csv("%s/BLOSUM62" % (matrix_dir),
+                              comment="#", sep="\s+")
         elif typ == "nt":
             if MatrixName != "B":
                 if tab.loc[MatrixName][0] != typ:
                     raise RuntimeError("This matrix is not valid")
                     # verifies if the matrix is valid
                 else:
-                    # verifies if the user would like to use the default matrix or their own
-                    mat = pd.read_csv(("%s/similarity_matrices/"+MatrixName) % mydir, comment="#", sep="\s+")
+                    # verifies if the user would like to use the default
+                    # matrix or their own
+                    mat = pd.read_csv("%s/%s" % (matrix_dir, MatrixName),
+                                      comment="#", sep="\s+")
             elif MatrixName == "B":
-                mat = pd.read_csv("%s/similarity_matrices/NUC.4.4" % mydir, comment="#", sep="\s+")
+                mat = pd.read_csv("%s/NUC.4.4" % (matrix_dir), comment="#",
+                                  sep="\s+")
         for e in range(1, (len(arr[:,0])+1)):
             # iterates over the rows of the sequences
             z = e-1
             for i in range(1, (len(arr[0,:])+1)):
                 #  iterates over the columns of the sequences
                 x = i-1
-                if not arr[z,x] == "-":
-                      Sarray = np.append(Sarray,[int(mat.loc[arr[z,x],consensus[x]])])
+                if not arr[z, x] == "-":
+                    if arr[z, x] == "U" and typ == 'nt':
+                        thischar = "T"
+                    else:
+                        thischar = arr[z, x]
+                    if consensus[x] == "U" and typ == 'nt':
+                        conschar = "T"
+                    else:
+                        conschar = consensus[x]
+                    score = mat.loc[thischar, conschar]
+                    Sarray = np.append(Sarray, [score])
                 elif arr[z,x] == "-":
                       # sets the value of '-' as 0
                       Sarray = np.append(Sarray, 0)
             SarrL = np.vstack([SarrL, Sarray])
             Sarray = np.array([])
         new_Sarr = copy.deepcopy(SarrL)
         new_Sarr = SarrL.astype(int)
-        # returns the new similarity array containing the verified alignment to the consensus
+        # returns the new similarity array containing the verified alignment
+        # to the consensus
         return new_Sarr
diff --git a/CIAlign/miniAlignments.py b/CIAlign/miniAlignments.py
@@ -250,7 +250,10 @@ def drawMarkUpLegend(outfile, palette="CBS"):
 def drawMiniAlignment(arr, nams, log, outfile, typ, plot_type='standard',
                       dpi=300, title=None, width=5, height=3, markup=False,
                       markupdict=None, ret=False, orig_nams=[],
-                      keep_numbers=False, force_numbers=False, palette="CBS"):
+                      keep_numbers=False, force_numbers=False, palette="CBS",
+                      sub_matrix_name='B', plot_identity_palette='terrain_r',
+                      plot_similarity_palette='summer_r',
+                      plot_substitution_matrix='B'):
     '''
     Draws a "mini alignment" image showing a small representation of the
     whole alignment so that gaps and poorly aligned regions are visible.
@@ -322,8 +325,15 @@ def drawMiniAlignment(arr, nams, log, outfile, typ, plot_type='standard',
     if plot_type == 'standard':
         arr2, cm = arrNumeric(arr, typ, palette)
     elif plot_type == 'boolean':
-        arr2 = consensusSeq.compareAlignmentConsensus(arr)
-        cm = matplotlib.colormaps['jet']
+        arr2 = consensusSeq.compareAlignmentConsensus(
+            arr, typ=typ, booleanOrSimilarity="Boolean")
+        cm = matplotlib.colormaps[plot_identity_palette]
+    elif plot_type == 'similarity':
+        arr2 = consensusSeq.compareAlignmentConsensus(
+            arr, typ=typ, booleanOrSimilarity="similarity",
+            MatrixName=sub_matrix_name)
+        cm = matplotlib.colormaps[plot_similarity_palette]
+
     # display it on the axis
     a.imshow(arr2, cmap=cm, aspect='auto', interpolation='nearest')
 

diff --git a/CIAlign/runCIAlign.py b/CIAlign/runCIAlign.py
@@ -162,6 +162,7 @@ def whichFunctions(args):
             args.plot_output,
             args.plot_markup,
             args.plot_consensus_identity,
+            args.plot_consensus_similarity,
             args.visualise,
             args.all_options]):
         which_functions.append("mini_alignments")
@@ -892,14 +893,33 @@ def runMiniAlignments(args, log, orig_arr, orig_nams, arr, nams,
             print("Plotting identity to consensus")
         outf = "%s_consensus_identity.%s" % (args.outfile_stem,
                                              args.plot_format)
-        miniAlignments.drawMiniAlignment(arr, nams, log,
-                                         outf, typ,
-                                         plot_type='boolean',
-                                         dpi=args.plot_dpi,
-                                         width=args.plot_width,
-                                         height=args.plot_height,
-                                         force_numbers=fn,
-                                         palette=args.palette)
+        miniAlignments.drawMiniAlignment(
+            arr, nams, log,
+            outf, typ,
+            plot_type='boolean',
+            dpi=args.plot_dpi,
+            width=args.plot_width,
+            height=args.plot_height,
+            force_numbers=fn,
+            palette=args.palette,
+            plot_identity_palette=args.plot_identity_palette)
+    if args.plot_consensus_similarity:
+        log.info("Plotting similarity to consensus")
+        if not args.silent:
+            print("Plotting similarity to consensus")
+        outf = "%s_consensus_similarity.%s" % (args.outfile_stem,
+                                             args.plot_format)
+        miniAlignments.drawMiniAlignment(
+            arr, nams, log,
+            outf, typ,
+            plot_type='similarity',
+            dpi=args.plot_dpi,
+            width=args.plot_width,
+            height=args.plot_height,
+            force_numbers=fn,
+            palette=args.palette,
+            plot_similarity_palette=args.plot_similarity_palette,
+            plot_substitution_matrix=args.plot_substitution_matrix)
 
 
 def runConsensus(args, log, orig_arr, orig_nams, arr, nams, removed_seqs):