Paste in new compareAlignmentConsensus function with similarity-array option

KatyBrown · May 16, 2024 · 74de398
1 parent 94e20e7
commit 74de398
Showing 1 changed file with 71 additions and 33 deletions.
diff --git a/CIAlign/consensusSeq.py b/CIAlign/consensusSeq.py
@@ -16,6 +16,8 @@
     import utilityFunctions
 import os
 import scipy.stats
+import copy
+import operator
 matplotlib.use('Agg')
 
 
@@ -692,43 +694,79 @@ def calcConservationAli(alignment, typ):
     return (heights, ents)
 
 
-def compareAlignmentConsensus(arr):
-    '''
-    Compares the alignment of the input array to the consensus of that array,
-    and outputs a boolean array.
+def compareAlignmentConsensus(arr, typ, booleanOrSimilarity="Boolean", MatrixName="B"):
+    consensus, _ = np.array(findConsensus(arr, '', consensus_type='majority_nongap')  )
+    if booleanOrSimilarity == "Boolean":
+        '''
+        Compares the alignment of the inputted array to the consensus of that array, and outputs a boolean array.
 
-    Parameters
-    ----------
-    alignment: np.array
-        The alignment stored as a numpy array
+        alignment: arr
+          The alignment stored as a numpy array
 
-    Returns
-    -------
-    A numpy array stored as new_arr, which is a boolean array
-    comparing the arr to its consensus.
-    '''
-    consensus, _ = np.array(findConsensus(arr, '',
-                                          consensus_type='majority_nongap'))
-    bool_array = np.array([])
-    bool_arrL = np.empty(dtype=bool, shape=(0, len(consensus)))
-    # declares the numpy arrays
-    for e in range(1, (len(arr[:, 0])+1)):
-        # iterates over the rows of the sequences
-        z = e - 1
-        for i in range(1, (len(arr[0, :])+1)):
+        return:
+        a numpy array stored as new_arr, which is a boolean array comparing the arr to the consensus of it.
+        '''
+        bool_array = np.array([])
+        bool_arrL = np.empty(dtype=bool, shape=(0, len(consensus)))
+        # declares the numpy arrays
+        for e in range(1, (len(arr[:,0])+1)):
+            # iterates over the rows of the sequences
+            z = e-1
+        for i in range(1, (len(arr[0,:])+1)):
             # iterates over the columns of the sequences
-            x = i - 1
-            if arr[z, x] == consensus[x]:
-                # verifies if the current value being iterated is equal to
-                # the equivalent value inline with the consensus
+            x = i-1
+            if arr[z,x] == consensus[x]:
+                # verifies if the current value being iterated is equal to the equivalent value inline with the consensus
                 bool_array = np.append(bool_array, [True], axis=None)
             else:
                 bool_array = np.append(bool_array, [False], axis=None)
-        bool_arrL = np.vstack([bool_arrL,
-                               bool_array])
+        bool_arrL = np.vstack([bool_arrL, bool_array])
         bool_array = np.array([])
-    new_arr = copy.deepcopy(bool_arrL)
-    new_arr = bool_arrL.astype(bool)
-    # returns the new boolean array containing the verified alignment
-    # to the consensus
-    return new_arr
+        new_arr = copy.deepcopy(bool_arrL)
+        new_arr = bool_arrL.astype(bool)
+        # returns the new boolean array containing the verified alignment to the consensus
+        return new_arr
+    else:
+        # generates the consensus
+        Sarray = np.array([])
+        SarrL = np.empty(dtype=int, shape=(0, len(consensus)))
+        # declares the numpy arrays
+        tab = pd.read_csv("roman_work_experience/matrices.txt", sep="\t", index_col=0)
+        if typ == "aa":
+          # verifies if the typ is amino acid or nucleotide
+          if MatrixName != "B":
+            if tab.loc[MatrixName][0] != typ:
+              raise RuntimeError("This matrix is not valid")
+              # verifies if the matrix is valid
+            else:
+              # verifies if the user would like to use the default matrix or their own
+              mat = pd.read_csv(("%s/similarity_matrices/"+MatrixName) % mydir, comment="#", sep="\s+")
+          elif MatrixName == "B":
+            mat = pd.read_csv("%s/similarity_matrices/BLOSUM62" % mydir, comment="#", sep="\s+")
+        elif typ == "nt":
+            if MatrixName != "B":
+                if tab.loc[MatrixName][0] != typ:
+                    raise RuntimeError("This matrix is not valid")
+                    # verifies if the matrix is valid
+                else:
+                    # verifies if the user would like to use the default matrix or their own
+                    mat = pd.read_csv(("%s/similarity_matrices/"+MatrixName) % mydir, comment="#", sep="\s+")
+            elif MatrixName == "B":
+                mat = pd.read_csv("%s/similarity_matrices/NUC.4.4" % mydir, comment="#", sep="\s+")
+        for e in range(1, (len(arr[:,0])+1)):
+            # iterates over the rows of the sequences
+            z = e-1
+            for i in range(1, (len(arr[0,:])+1)):
+                #  iterates over the columns of the sequences
+                x = i-1
+                if not arr[z,x] == "-":
+                      Sarray = np.append(Sarray,[int(mat.loc[arr[z,x],consensus[x]])])
+                elif arr[z,x] == "-":
+                      # sets the value of '-' as 0
+                      Sarray = np.append(Sarray, 0)
+            SarrL = np.vstack([SarrL, Sarray])
+            Sarray = np.array([])
+        new_Sarr = copy.deepcopy(SarrL)
+        new_Sarr = SarrL.astype(int)
+        # returns the new similarity array containing the verified alignment to the consensus
+        return new_Sarr