In [1]:
import math
import os
import queue
import sys
import time

import anytree
import numpy as np

# Constants
MotifLen = 60  # maximum length of the candidate motifs
MaxSeqNum = 100  # maximum number of sequences in the datasets
MaxSeqLen = 5000  # maximum length of sequences in the datasets
SizeofAB = 4  # size of the alphabet (ACGT/acgt)

# Search parameters (defaults; CmdlineCheck parses -L, -d and -nm)
L = 15  # default motif length
d = 4  # default mutation allowed
d0 = 2 * d  # threshold that HammingDistanceCalc2 compares against
nm = 3  # number of motifs to output, nm should be larger than 0; default as 3
N4 = 0  # number of vertices of length 4, N4 = L // 4
Nrem = 0  # length of the remainder vertex, Nrem = L % 4
# Lookup table indexed by the integer codes of digit-encoded substrings
# (codes go up to 4444); see InitializeHD for how entries are computed.
HD = [[0]*4445 for _ in range(4445)]
RealSequenceNumber = 0  # the real number of sequences in corresponding file which will be changed after each call of function: ReadinData()
ConsensusCount = [[] for _ in range(SizeofAB)]  # one row of per-position counts for each letter
CONSENSUS = None  # length L
CONSENSUS2 = None  # length L
CONSENSUS3 = [[None]*L for _ in range(nm)]  # one consensus (length L) per reported motif
string1 = [[None]*MaxSeqLen for _ in range(MaxSeqNum)]  # per-sequence name lines; ReadinData overwrites the leading rows
string2 = [[None]*MaxSeqLen for _ in range(MaxSeqNum)]  # per-sequence contents; ReadinData overwrites the leading rows
string3 = [None]*100
Filename = [None]  # name of the data file (single-slot list so functions can set it in place)

# Output file handles and switches
fp_tree = None  # pointer to file of tree
fp_time = None  # pointer to file of time
fp_consen = None  # pointer to file of OOPS cliques
fp_merge_clique = None  # pointer to file of EMOPS/MOOPs cliques
fp_standard = None  # pointer to file of standard format (to use web-tool)
isAppending_level_i = False  # if appending happened at a certain level
isValid_vt_r = False  # if the reference vertex is valid for tree construction
isSorted = False
isOOPS = False  # if the clique to find is exactly one occurrence per sequence, default: false
isEMOPS = True  # if the clique to find can be equal or more than one occurrence per sequence, default: true
isFIRSTtree = True  # if the tree is the first one for a reference combination
isOTree = False  # if the trees constructed need to be output
isStandard = False  # if the standard output file is in need
OutputFileFullName = None  # tree
RecanFileName = None  # OOPS cliques rescanned from the consensus
MergeFileName = None  # EMOPS/MOOPS cliques rescanned from the consensus
StardFileName = None  # standard output format according to the requirements by paper: "improved benchmark for computational motif discovery"
OutputTimeFileName = "TreeMotif_Time.txt"  # time

# Working state shared between the search routines
Vertex = None  # per-sequence list of length-L substrings (see Find_vertices)
str2 = None  # the re-translated string
new_leaf_node = None  # a new leaf node to be appended
startT, finishT = None, None
IC3 = None  # information content
int_4_Vertex = None  # vertex of size 4
int_rem_Vertex = None  # vertex of size L mod 4 (remainder)
# Built by Prepare_variables once L is known.  It used to be pre-filled with a
# MaxSeqLen x MaxSeqLen grid of None (25 million slots) that nothing could use:
# HammingDistanceCalc2 needs integer offsets here, never None.
idsVertex = []
i, i2, vt_r, vt_r2 = 0, 0, 0, 0  # sequence id and vertex id frequently used
seq_i, vposition_j, vt_j = 0, 0, 0
ref_seq, ref_seq2 = 0, 0  # sequence ids of references
STPNUM = None  # number of current found connections to the given reference vertex for each sequence 1 to m-1
STP = None  # real position in the sequence for the jth connection
STPNUM2 = None  # number of current found connections to the given reference vertex for each sequence 2 to m-1
STP2 = None  # real position in the sequence for the jth connection
STPNUM3 = None  # per-motif counterpart of STPNUM
STP3 = None  # per-motif counterpart of STP
Dtemp = 0  # temporary variable of Hamming distance
rootid = 0  # id of node to be used as a tree root
treerootk = 0  # id of a tree root
checknodei = 0  # the node to be checked for appending onto a tree
temploc_depth = 0  # the depth of a node in the tree
num_diff_consensus = 0  # number of different consensuses have been found
num_rescan_consensus = 0  # number of different consensuses rescanned have been found
i_diff_clique = 0  # number of different EMOPS/MOOPS cliques: all motif instances have less than 50% percent positions in common
number_of_current_branch = 0  # number of OOPS cliques

class Sequence:
    """One input sequence: its name line, its residues and their count.

    All arguments are optional so ``Sequence()`` still yields the empty
    record the rest of the file expects.

    Args:
        name: Name/header line of the sequence.
        content: The residues of the sequence.
        length: Number of residues; when omitted it is taken from
            ``content`` (0 when there is no content).
    """

    def __init__(self, name=None, content=None, length=None):
        self.name = name
        self.content = content
        if length is None:
            length = len(content) if content is not None else 0
        self.length = length

    def __repr__(self):
        return f"{type(self).__name__}(name={self.name!r}, length={self.length!r})"

# Spare Sequence record; no function in the visible part of this file reads it.
Seq_temp = Sequence()
# Fixed pool of MaxSeqNum empty records; ReadinData fills the leading entries.
Seq = [Sequence() for _ in range(MaxSeqNum)]

class InstInfo:
    """A single motif instance: where it starts and what it spells."""

    def __init__(self):
        # Starting position of the instance inside its sequence.
        self.startingP = 0
        # The substring itself, one slot per character, sized for the
        # longest motif the program accepts.
        self.string = ['' for _ in range(MotifLen)]

class MotifInfo:
    """A motif: the tree/branch it came from, its score and its instances."""

    def __init__(self):
        # Identifiers of the tree and of the branch inside that tree.
        self.TREEID, self.BRANID = 0, 0
        # Information content of the motif.
        self.INFO = 0.0
        # One instance record per possible sequence.
        instances = []
        for _ in range(MaxSeqNum):
            instances.append(InstInfo())
        self.INSTANCE = instances

# Motif holders.  SortMotifResult indexes Motif_result like a sequence and
# Prepare_variables builds a list under the same name, so the single instance
# created here only serves as an initial placeholder.
Motif_temp = MotifInfo()
Motif_result = MotifInfo()

# Consensus string -> int (the note mirrors a C++ hash_map<std::string, int>);
# Output_rescan_cliques iterates over its keys.
CONsensus = {}                         # hash_map<std::string, int>
Traversing_queue = []                  # queue<tree<char*>::iterator>
# The names below are tree cursors/iterators kept as placeholders: no function
# in the visible part of this file reads or writes them.
# NOTE(review): presumably used by the tree routines whose bodies are missing
# (Construct_trees, Traverse_existed_tree, ...) - confirm when those are ported.
ti = None                              # placeholder, see note above
myTop2, myRoot2 = None, None          # placeholders, see note above
tb, te = None, None                   # placeholders, see note above
newtree_temploc = None                # placeholder, see note above
temploc = None                        # placeholder, see note above
post_itr = None                       # placeholder, see note above
child_beginloc, child_endloc = None, None  # placeholders, see note above
tree0_vt_r = [[] for _ in range(MaxSeqLen)]  # MaxSeqLen independent empty lists
consensus_tree = {}                     # the tree to store the consensuses



In [None]:
def CmdlineCheck(argv):
    """Parse the command line and store the settings in the module globals.

    ``argv[1]`` is the data file; options start at ``argv[2]``.  Recognised
    options are ``-L``/``-l``, ``-d``, ``-nm``, ``-toe`` and ``-s``; anything
    else is skipped.  The parsed values are written to the module-level
    ``L``, ``d``, ``d0``, ``nm``, ``isOTree``, ``isOOPS``, ``isEMOPS`` and
    ``isStandard`` (previously they were assigned to function locals and
    therefore lost), and ``Filename[0]`` receives the data file name.

    Args:
        argv: Full argument vector, program name first.

    Raises:
        SystemExit: with status 1 after printing the usage text (no data
            file given) or an error message (bad or missing option value).
    """
    global L, d, d0, nm, isOTree, isOOPS, isEMOPS, isStandard

    if len(argv) < 2:
        print("\n********************************************************** USAGE **************************************************|")
        print("COMMAND: TreeMotif1.0 data.fas [Options]                                                                             |")
        print("   1. OPTIONS FORMAT: -param <meaning of the param> [default value].                                                 |")
        print("   |     1) PARAMS ON MOTIF: -L INT1 -d INT2 -nm INT3. CONSTRAINTS: INT1*(1/3)>=INT2>1, INT3>0.                      |")
        print("   |     |     -L  <length of motif to find> [15]                                                                    |")
        print("   |     |     -d  <maximum number of mutations on a consensus motif> [4]                                            |")
        print("   |     |     -nm <expected number of motifs to output> [3]                                                         |")
        print("   |     2) PARAMS ON DATA RESULTED: -toe 3-BIT-DIGIT, WHERE 3-BIT-DIGIT is in {000, 001, ..., 111}.                 |")
        print("   |     |      t  <output trees or not> [0]                                                                         |")
        print("   |     |      o  <output cliques as OOPS: one occurence of motif instance per sequence> [0]                        |")
        print("   |     |      e  <output cliques as EMOPS/MOOPS: more than one occurence of motif isntance per sequence> [1]       |")
        print("   |     |     -toe 000: output -------------------------------------------  execution time.                         |")
        print("   |     |     -toe 001: output -------------------------- MOOPS cliques and execution time.                         |")
        print("   |     |     -toe 010: output --------- OOPS cliques ----------------- and execution time.                         |")
        print("   |     |     -toe 011: output --------- OOPS cliques and MOOPS cliques and execution time.                         |")
        print("   |     |     -toe 100: output trees ---------------------------------- and execution time.                         |")
        print("   |     |     -toe 101: output trees ---------------- and MOOPS cliques and execution time.                         |")
        print("   |     |     -toe 110: output trees and OOPS cliques ----------------- and execution time.                         |")
        print("   |     |     -toe 111: output trees and OOPS cliques and MOOPS cliques and execution time.                         |")
        print("   |     3) PARAMS TO OBTAIN STANDARD OUTPUT FORMAT FOR RESULTS EVALUATION: -s [0]                                   |")
        print("   |           -s  <standard format: #Sequence, #start(inclusive), #end(inclusive), #nucleotides> [0]                |")
        print("   |                                                                                                                 |")
        print("   2. EXAMPLE: TreeMotif1.0 data.fas -L 15 -d 4 -nm 3 -toe 101 -s 0                                                  |")
        print("         1): produce 3 cliques of motif instances with motif length as 15 and maximum mutations as 4;                |")
        print("         2): output trees; no cliques of OOPS needed; output merged cliques;                                         |")
        print("         3): no standard files needed.                                                                               |")
        print("\n********************************************************** END of USAGE *******************************************|")
        sys.exit(1)

    def int_value_at(pos, message):
        """Return argv[pos] as an int; print *message* and exit(1) if it is missing or not a number."""
        if pos >= len(argv):
            print(message)
            sys.exit(1)
        try:
            return int(argv[pos])
        except ValueError:
            print(message)
            sys.exit(1)

    # The two families of options use slightly different wording; both are kept.
    incomplete_dot = "ERROR: Illegal or incomplete command line argument."
    incomplete = "ERROR: Illegal or incomplete command line argument"

    ic = 2
    d_in = False
    while ic < len(argv):
        option = argv[ic]
        if option in ("-l", "-L"):
            # d is validated against L, so L has to be known first.
            if d_in:
                print("ERROR: Illegal command line argument. Motif length L must be input before d.")
                print("Please input again accordingly.")
                sys.exit(1)
            ic += 1
            L = int_value_at(ic, incomplete_dot)
            if L > MotifLen:
                print("ERROR: Motif Length should be less than", MotifLen)
                print("Maximum Motif Length can be reset using '#define MotifLen int_value'")
                sys.exit(1)
        elif option == "-d":
            ic += 1
            d = int_value_at(ic, incomplete_dot)
            if d > L/2:
                print("ERROR: d must be less than", L/2, "(L/2);")
                print("proper d can be less than 1/3*L.")
                sys.exit(1)
            d_in = True
            d0 = 2*d
        elif option == "-nm":
            ic += 1
            nm = int_value_at(ic, incomplete_dot)
            if nm <= 0:
                print("ERROR: nm should be larger than 0.")
                sys.exit(1)
        elif option == "-toe":
            ic += 1
            toe = int_value_at(ic, incomplete)
            # Decimal digits act as flags: hundreds = trees, tens = OOPS,
            # units = EMOPS.  Floor division keeps the operands integral
            # ("/" yields a float, which cannot be and-ed with 1).
            isOTree = bool((toe // 100) & 1)
            isOOPS = bool(((toe % 100) // 10) & 1)
            isEMOPS = bool((toe % 10) & 1)
        elif option == "-s":
            ic += 1
            isStandard = bool(int_value_at(ic, incomplete) & 1)
        ic += 1

    Filename[0] = argv[1]


In [None]:
def ReadinData():
    """Load the sequences of ``Filename[0]`` into the module-level tables.

    The file is read two lines at a time (name line, then content line) until
    either line of a pair is empty.  Each pair is stored in ``string1`` /
    ``string2`` and in the matching ``Seq`` record.  Afterwards the filled
    part of ``Seq`` is reversed in place and ``RealSequenceNumber`` is set to
    the number of pairs read (it used to be written to a local and lost).

    When ``isOTree`` is set, progress messages go to the already-open
    ``fp_tree`` handle; the handle used to be passed to ``open()`` as if it
    were a path.

    Raises:
        SystemExit: with status 1 when the data file cannot be opened.
    """
    global RealSequenceNumber

    # Never index past the pre-allocated tables.
    capacity = min(len(Seq), len(string1), len(string2))
    count = 0

    try:
        with open(Filename[0], "r") as fp:
            while True:
                line1 = fp.readline().strip()
                line2 = fp.readline().strip()

                if not line1 or not line2:
                    break
                if count >= capacity:
                    print("\n    WARNING: only the first %d sequences are used." % capacity)
                    break

                string1[count] = line1
                string2[count] = line2

                record = Seq[count]
                record.name = line1
                record.length = len(line2)
                record.content = line2

                count += 1

    except OSError:
        print("\n    Cannot open this data file!!!")
        if isOTree and fp_tree is not None:
            fp_tree.write("Cannot open this data file!!!\n")
        sys.exit(1)

    RealSequenceNumber = count

    if isOTree and fp_tree is not None:
        fp_tree.write(f"There are {count} sequences in this file!\nTrees constructed:\n")

    # reverse the order of sequences
    Seq[:count] = Seq[:count][::-1]

In [None]:
def Prepare_variables(rmlos):
    """(Re)build the module-level work tables for the current parameters.

    Every table is published through ``global``; previously they were bound
    to function locals and thrown away on return.

    Sets:
        STP, STP2: ``RealSequenceNumber + 1`` rows of ``rmlos + 1`` zeros.
        STPNUM, STPNUM2: ``RealSequenceNumber + 1`` zeros.
        STP3, STPNUM3, IC3: the same shapes repeated for ``nm + 1`` motifs.
        N4, Nrem: quotient and remainder of ``L`` divided by 4.
        idsVertex: for every start offset ``j`` below ``MaxSeqLen``, the
            offsets ``j + 4``, ``j + 8``, ... of the following chunks
            (``N4`` entries when there is a remainder, else ``N4 - 1``).
        Motif_result, CONSENSUS, CONSENSUS2: only when ``isOOPS`` is set.
        ConsensusCount: ``SizeofAB`` rows of ``L`` zeros.

    Args:
        rmlos: Largest index needed in each STP row.
    """
    global STP, STPNUM, STP2, STPNUM2, STP3, STPNUM3, IC3
    global N4, Nrem, idsVertex, Motif_result, CONSENSUS, CONSENSUS2, ConsensusCount

    rows = RealSequenceNumber + 1
    width = rmlos + 1

    STP = [[0] * width for _ in range(rows)]
    STPNUM = [0] * rows
    STP2 = [[0] * width for _ in range(rows)]
    STPNUM2 = [0] * rows
    STP3 = [[[0] * width for _ in range(rows)] for _ in range(nm + 1)]
    STPNUM3 = [[0] * rows for _ in range(nm + 1)]
    IC3 = [0.0] * (nm + 1)

    N4, Nrem = divmod(L, 4)

    # With a remainder the last entry addresses the remainder chunk.
    chunks = N4 if Nrem > 0 else N4 - 1
    idsVertex = [[j + 4 * (k + 1) for k in range(chunks)] for j in range(MaxSeqLen)]

    if nm > 0 and isOOPS:
        # SortMotifResult reads ``.INFO`` attributes, so store MotifInfo
        # objects rather than dicts.
        Motif_result = []
        for _ in range(nm):
            motif = MotifInfo()
            motif.INFO = -100
            Motif_result.append(motif)
        CONSENSUS = [''] * L
        CONSENSUS2 = [''] * L

    ConsensusCount = [[0] * L for _ in range(SizeofAB)]

In [None]:
def HammingDistanceCalc(x1, x2):
    """Threshold-limited mismatch count between two strings.

    Starts from ``len(x1)`` and takes one off for every aligned pair of equal
    characters.  Scanning stops as soon as the running value is at or below
    the module-level ``d``, so a result ``<= d`` only says "within the
    threshold" and is not necessarily the exact distance.

    Args:
        x1: First string; its length is the starting value.
        x2: Second string; pairs are formed up to the shorter length.

    Returns:
        The running value when the scan ended.
    """
    remaining = len(x1)
    for left, right in zip(x1, x2):
        if left == right:
            remaining -= 1
        if remaining <= d:
            # Already within the allowed number of mutations.
            return remaining
    return remaining

In [None]:
def HammingDistanceCalc2(i1, j1, i2, j2):
    """Table-driven distance between two length-L windows.

    The window starting at ``j1`` in sequence ``i1`` is compared with the one
    starting at ``j2`` in sequence ``i2`` chunk by chunk: the value starts at
    ``L`` and the ``HD`` entry for each pair of chunk codes is subtracted.
    Chunk offsets come from ``idsVertex``.  The loop stops once the value is
    at or below ``d0``; the remainder chunk (only when ``Nrem`` is non-zero)
    is consulted only while the value is still above ``d0``.

    Args:
        i1, j1: Sequence index and start offset of the first window.
        i2, j2: Sequence index and start offset of the second window.

    Returns:
        The value left after the subtractions described above.
    """
    # Leading 4-character chunk of both windows.
    remaining = L - HD[int_4_Vertex[i1][j1]][int_4_Vertex[i2][j2]]

    # Following full chunks, with early exit at the threshold.
    for chunk in range(N4 - 1):
        code_a = int_4_Vertex[i1][idsVertex[j1][chunk]]
        code_b = int_4_Vertex[i2][idsVertex[j2][chunk]]
        remaining -= HD[code_a][code_b]
        if remaining <= d0:
            break

    # Trailing remainder chunk, only worth checking above the threshold.
    if Nrem != 0 and remaining > d0:
        code_a = int_rem_Vertex[i1][idsVertex[j1][N4 - 1]]
        code_b = int_rem_Vertex[i2][idsVertex[j2][N4 - 1]]
        remaining -= HD[code_a][code_b]

    return remaining

In [None]:
def Find_vertices():
    """Cut every sequence into its sliding-window vertices.

    For each of the first ``RealSequenceNumber`` records in ``Seq`` this
    fills, at every valid start offset:

    * ``Vertex[s][j]``: the length-``L`` substring starting at ``j``;
    * ``int_4_Vertex[s][j]``: the 4-character substring as an ``int``;
    * ``int_rem_Vertex[s][j]``: the ``Nrem``-character substring as an
      ``int`` (only when ``Nrem`` is non-zero, otherwise the table is None).

    Rows have one slot per residue; offsets too close to the end of the
    sequence for a full window stay ``None``.  The content must already be
    digit-encoded (see Translate2intSeq) for the ``int`` conversions to work.

    The tables are plain lists published through ``global``.  The previous
    version built NumPy arrays with ``np.object`` (removed from NumPy) and
    bound them to locals, so nothing reached the other routines.
    """
    global Vertex, int_4_Vertex, int_rem_Vertex

    def windows(content, length, width, convert):
        """Return a row of *length* slots holding convert(window) at each valid start."""
        row = [None] * length
        for start in range(length - width + 1):
            row[start] = convert(content[start:start + width])
        return row

    records = [Seq[s] for s in range(RealSequenceNumber)]

    Vertex = [windows(r.content, r.length, L, str) for r in records]
    int_4_Vertex = [windows(r.content, r.length, 4, int) for r in records]
    if Nrem > 0:
        int_rem_Vertex = [windows(r.content, r.length, Nrem, int) for r in records]
    else:
        int_rem_Vertex = None

In [None]:
def Find_vt_r_connections_level_i():
    """Collect the windows of sequence ``i`` that connect to the reference vertex.

    The reference is window ``vt_r`` of sequence 0.  Every start offset of
    sequence ``i`` whose HammingDistanceCalc2 value against it is at most
    ``d0`` is appended to ``STP[i]``; ``STPNUM[i]`` ends up holding how many
    were found.  ``i`` and ``vt_r`` are read from the module globals.
    """
    STPNUM[i] = 0
    last_start = Seq[i].length - L
    for candidate in range(last_start + 1):
        if HammingDistanceCalc2(0, vt_r, i, candidate) > d0:
            continue
        STPNUM[i] += 1
        STP[i][STPNUM[i] - 1] = candidate

In [None]:
def Find_vt_r_connections_level_i2():
    """Narrow the connections of sequence ``i2`` against a second reference.

    The second reference is the window at ``STP[1][vt_r2]`` in sequence 1.
    Among the ``STPNUM[i2]`` positions already stored in ``STP[i2]``, those
    whose HammingDistanceCalc2 value against it is at most ``d0`` are copied
    to ``STP2[i2]`` and counted in ``STPNUM2[i2]``.  ``i2`` and ``vt_r2`` are
    read from the module globals.
    """
    STPNUM2[i2] = 0
    for slot in range(STPNUM[i2]):
        if HammingDistanceCalc2(1, STP[1][vt_r2], i2, STP[i2][slot]) > d0:
            continue
        STPNUM2[i2] += 1
        STP2[i2][STPNUM2[i2] - 1] = STP[i2][slot]

In [1]:
def Delete_illegal_branch():
    """Placeholder: the body of this routine is absent from this file.

    A ``def`` without a body is a syntax error, so the stub fails loudly
    instead of silently doing nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Delete_illegal_branch has no implementation yet")


In [None]:
def Check_vertices_level_i():
    """Placeholder: the body of this routine is absent from this file.

    A ``def`` without a body is a syntax error, so the stub fails loudly
    instead of silently doing nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Check_vertices_level_i has no implementation yet")

In [None]:
def Traverse_existed_tree():
    """Placeholder: the body of this routine is absent from this file.

    A ``def`` without a body is a syntax error, so the stub fails loudly
    instead of silently doing nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Traverse_existed_tree has no implementation yet")

In [None]:
def Traverse_children_of_a_node():
    """Placeholder: the body of this routine is absent from this file.

    A ``def`` without a body is a syntax error, so the stub fails loudly
    instead of silently doing nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Traverse_children_of_a_node has no implementation yet")

In [None]:
def Construct_trees():
    """Placeholder: the body of this routine is absent from this file.

    ``main`` calls this for every valid reference vertex.  A ``def`` without
    a body is a syntax error, so the stub fails loudly instead of letting the
    search finish with empty results.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Construct_trees has no implementation yet")

In [None]:
def InitializeHD():
    """Build the chunk-comparison table and install it as the global ``HD``.

    Chunks are strings of 1 to 4 digits drawn from 1..4, read as decimal
    integers (so codes range from 1 to 4444).  For two codes of the same
    width, ``HD[x][y]`` is the number of digit positions at which they agree;
    every other entry (different widths, or a code containing a digit
    outside 1..4) stays 0.

    Only valid codes are visited (4 + 16 + 64 + 256 per width) instead of
    scanning every integer pair in each range.  The table is also stored in
    the module-level ``HD`` so callers that ignore the return value still
    get an initialised table.

    Returns:
        The 5000 x 5000 table (list of lists of ints).
    """
    global HD

    size = 5000
    table = [[0] * size for _ in range(size)]

    # Grow the set of valid codes one digit at a time, keeping each code
    # together with its tuple of digits.
    level = [((), 0)]
    for _ in range(4):
        level = [
            (digits + (digit,), code * 10 + digit)
            for digits, code in level
            for digit in (1, 2, 3, 4)
        ]
        for digits_a, code_a in level:
            row = table[code_a]
            for digits_b, code_b in level:
                row[code_b] = sum(a == b for a, b in zip(digits_a, digits_b))

    HD = table
    return table


In [None]:
def Translate2intSeq():
    """Replace the letters of every loaded sequence by digits.

    ``a/c/g/t`` (either case) become ``1/2/3/4`` in the ``content`` of the
    first ``RealSequenceNumber`` records of ``Seq``.  Strings are immutable,
    so the encoded content is built separately and assigned back; the
    previous per-character assignment raised ``TypeError``.

    Raises:
        SystemExit: with status 1 after reporting the first character that
            is not one of the four letters.
    """
    digit_of = {'a': '1', 'c': '2', 'g': '3', 't': '4'}
    for ri in range(RealSequenceNumber):
        record = Seq[ri]
        encoded = []
        for li, letter in enumerate(record.content):
            digit = digit_of.get(letter.lower())
            if digit is None:
                print(f"Illegal letter: {letter} found in {record.name[1:]} at position {li}!!!")
                sys.exit(1)
            encoded.append(digit)
        record.content = "".join(encoded)


In [None]:
def ReTranslate2charString(str):
    """Turn a digit-encoded string back into letters.

    ``1/2/3/4`` map to ``a/c/g/t``.  Any other character prints
    "Illegal letters!!!" and ends the program with status 1.

    Args:
        str: The digit string to decode.

    Returns:
        The decoded lower-case string.
    """
    letter_of = {'1': 'a', '2': 'c', '3': 'g', '4': 't'}
    decoded = []
    for symbol in str:
        if symbol not in letter_of:
            print("Illegal letters!!!")
            exit(1)
        decoded.append(letter_of[symbol])
    return ''.join(decoded)

In [None]:
def Check_clique():
    """Placeholder: the body of this routine is absent from this file.

    A ``def`` without a body is a syntax error, so the stub fails loudly
    instead of silently doing nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Check_clique has no implementation yet")

In [None]:
def Base_Count(STP0):
    """Count, per motif position, how often each base occurs.

    The first ``L`` columns of the four ``ConsensusCount`` rows are reset,
    then one window per sequence (``Vertex[s][STP0[s]]``, visited from the
    last sequence to the first) is tallied.  Row 0 counts ``a``/``1``,
    row 1 ``c``/``2``, row 2 ``g``/``3`` and row 3 ``t``/``4``; any other
    character is ignored.

    Args:
        STP0: For every sequence index, the start offset of its window.
    """
    row_of = {
        'a': 0, '1': 0,
        'c': 1, '2': 1,
        'g': 2, '3': 2,
        't': 3, '4': 3,
    }

    for column in range(L):
        for row in range(4):
            ConsensusCount[row][column] = 0

    for seq_id in reversed(range(RealSequenceNumber)):
        # The extra NUL keeps the indexing below in range for a window that
        # is one character short; it is never counted.
        window = Vertex[seq_id][STP0[seq_id]] + '\0'
        for column in range(L):
            row = row_of.get(window[column])
            if row is not None:
                ConsensusCount[row][column] += 1

In [None]:
def InfoContent_Calc(STP0):
    """Information content of the windows selected by ``STP0``.

    Base_Count refreshes ``ConsensusCount`` first.  Each of the ``L``
    positions then contributes ``2 + sum(p * log2(p))`` where ``p`` is a
    base count divided by ``RealSequenceNumber``; bases with a zero count
    are skipped.

    Args:
        STP0: For every sequence index, the start offset of its window.

    Returns:
        The sum of the per-position contributions.
    """
    Base_Count(STP0)

    IC_allp = 0
    for cc in range(L):
        IC_singlep = 0
        for rc in range(SizeofAB):
            count = ConsensusCount[rc][cc]
            if count > 0:
                freq = count / RealSequenceNumber
                IC_singlep += freq * math.log2(freq)
        IC_allp += 2 + IC_singlep

    return IC_allp

In [None]:
def Consensus_Calc():
    """Derive the consensus string of every reported clique.

    For each of the first ``min(nm, i_diff_clique)`` cliques, the instances
    listed in ``STP3``/``STPNUM3`` are tallied per motif position.  A
    position gets the letter whose count exceeds half of
    ``RealSequenceNumber`` (the last such letter in a/c/g/t order if several
    qualify) and ``'x'`` otherwise.  The result, a list of ``L`` one-character
    strings, replaces ``CONSENSUS3[ci]``.

    Counts are now kept per position; before, one set of four counters was
    shared by all positions, so every column received the same letter, and
    the trailing ``CONSENSUS3[ci][L] = '\\0'`` indexed past the end of the row.
    """
    letters = 'acgt'
    row_of = {
        'a': 0, '1': 0,
        'c': 1, '2': 1,
        'g': 2, '3': 2,
        't': 3, '4': 3,
    }
    majority = 0.5 * RealSequenceNumber

    for ci in range(min(nm, i_diff_clique)):
        counts = [[0] * L for _ in letters]

        for seq_id in range(RealSequenceNumber - 1, -1, -1):
            for slot in range(STPNUM3[ci][seq_id]):
                window = Vertex[seq_id][STP3[ci][seq_id][slot]]
                for pos, symbol in enumerate(window[:L]):
                    row = row_of.get(symbol)
                    if row is not None:
                        counts[row][pos] += 1

        consensus = []
        for pos in range(L):
            letter = 'x'
            for row, base in enumerate(letters):
                if counts[row][pos] > majority:
                    letter = base
            consensus.append(letter)

        # Assign the whole row so a row created for another L is replaced.
        CONSENSUS3[ci] = consensus

In [None]:
def Output_trees():
    """Placeholder: the body of this routine is absent from this file.

    A ``def`` without a body is a syntax error, so the stub fails loudly
    instead of silently doing nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Output_trees has no implementation yet")

In [None]:
def SortMotifResult():
    """Order the found motifs by decreasing information content.

    Selection sort over the first ``number_of_current_branch`` entries of
    ``Motif_result``, in place: each slot is swapped with the first entry of
    highest ``INFO`` found at or after it.
    """
    total = number_of_current_branch
    for slot in range(total):
        # max() returns the first maximal candidate, matching a scan that
        # only moves on a strictly greater value.
        best = max(range(slot, total), key=lambda idx: Motif_result[idx].INFO)
        if best != slot:
            Motif_result[slot], Motif_result[best] = Motif_result[best], Motif_result[slot]

In [None]:
def Output_cliques():
    """Placeholder: the body of this routine is absent from this file.

    ``main`` calls this when OOPS output is requested.  A ``def`` without a
    body is a syntax error, so the stub fails loudly instead of silently
    writing nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Output_cliques has no implementation yet")

In [None]:
def Eliminate_Cinstance(STPNUMc, STPc, RealSeqNum):
    """Drop clique instances that are too far from their consensus.

    For each of the first ``min(nm, i_diff_clique)`` cliques and each
    sequence (last to first), an instance is kept when its letters differ
    from ``CONSENSUS3[idc]`` in at most ``d`` positions; an ``'x'`` on either
    side counts as a match.  If a sequence loses all of its instances, its
    windows are rescanned for ones within ``d`` of the consensus; if that
    finds nothing either, the last instance examined is kept so the sequence
    still has one entry.  Sequences that start with no instance are left
    alone.

    The previous version could not run: it appended ``'\\0'`` to each window
    (which makes ReTranslate2charString exit), ignored the translated text in
    favour of the unset global ``str2``, relied on ``isp -= 1`` inside a
    ``for`` loop, called ``len()`` on a Sequence and appended rescan hits
    after the fixed-size row instead of at its front.

    Args:
        STPNUMc: Per clique and sequence, the number of instances; updated
            in place.
        STPc: Per clique and sequence, the instance start offsets; the
            leading entries are rewritten in place.
        RealSeqNum: Number of sequences to process.
    """
    def close_enough(consensus, window):
        """Tell whether the decoded *window* is within d mismatches of *consensus*."""
        text = ReTranslate2charString(window)
        agreeing = sum(
            1
            for cons_ch, inst_ch in zip(consensus[:L], text[:L])
            if cons_ch == 'x' or inst_ch == 'x' or cons_ch == inst_ch
        )
        return L - agreeing <= d

    for idc in range(min(nm, i_diff_clique)):
        consensus = CONSENSUS3[idc]
        for seq_id in range(RealSeqNum - 1, -1, -1):
            count = STPNUMc[idc][seq_id]
            if count == 0:
                continue

            row = STPc[idc][seq_id]
            starts = list(row[:count])
            kept = [p for p in starts if close_enough(consensus, Vertex[seq_id][p])]

            if not kept:
                # NOTE(review): the rescan starts at offset ``seq_id``, as the
                # code being replaced did; starting at 0 looks more natural -
                # confirm against the reference implementation.
                last_start = Seq[seq_id].length - L
                kept = [
                    p
                    for p in range(seq_id, last_start + 1)
                    if close_enough(consensus, Vertex[seq_id][p])
                ]

            if not kept:
                # Nothing qualifies: keep one instance so the clique still
                # covers this sequence.
                kept = [starts[-1]]

            row[:len(kept)] = kept
            STPNUMc[idc][seq_id] = len(kept)

In [None]:
def Output_Merged_Cliques():
    """Write the merged (EMOPS/MOOPS) cliques to ``fp_merge_clique``.

    One ``MOTIF`` header per clique (information content, consensus, motif
    length), followed by one line per instance: sequence name without its
    first character, start offset and the instance decoded to letters.
    Sequences are listed from the last one to the first.

    Fixes over the previous version: the name is sliced instead of adding 1
    to a string, the window is decoded as-is (the appended NUL made the
    decoder exit), the decoded text is actually written (not the unset global
    ``str2``) and the consensus is joined into a string.
    """
    for idc in range(min(nm, i_diff_clique)):
        # NUL and empty cells are skipped in case the row still carries a
        # C-style terminator or unset slots.
        consensus = "".join(ch for ch in CONSENSUS3[idc] if ch and ch != '\0')
        separator = "" if idc == 0 else "\n\n"
        fp_merge_clique.write(
            separator + "MOTIF INFO_%.4f_bits_%s length %d_bp\n" % (IC3[idc], consensus, L)
        )
        for seq_id in range(RealSequenceNumber - 1, -1, -1):
            for slot in range(STPNUM3[idc][seq_id]):
                start = STP3[idc][seq_id][slot]
                instance = ReTranslate2charString(Vertex[seq_id][start])
                fp_merge_clique.write("\n    %-16s " % Seq[seq_id].name[1:])
                fp_merge_clique.write("%5d   %s" % (start, instance))

In [None]:
def Output_Standard_Cliques():
    """Write the cliques to ``fp_standard`` in the evaluation format.

    Each instance becomes ``name, start, end, letters`` with 1-based
    inclusive ``start``/``end`` and the name stripped of its first character.
    Records are separated by a newline, which is omitted only before the
    very first instance of the first clique's last sequence.

    Fixes over the previous version: the name is sliced instead of adding 1
    to a string, the window is decoded as-is (the appended NUL made the
    decoder exit) and the decoded text is actually written (not the unset
    global ``str2``).
    """
    for idc in range(min(nm, i_diff_clique)):
        for ise in range(1, RealSequenceNumber + 1):
            seq_id = RealSequenceNumber - ise
            for isp in range(STPNUM3[idc][seq_id]):
                start = STP3[idc][seq_id][isp]
                lead = "" if (idc == 0 and ise == 1 and isp == 0) else "\n"
                fp_standard.write("%s%s, " % (lead, Seq[seq_id].name[1:]))
                instance = ReTranslate2charString(Vertex[seq_id][start])
                fp_standard.write("%d, %d, %s" % (start + 1, start + L, instance))

In [None]:
def Output_rescan_cliques():
    """Rescan the stored connections with every consensus and report cliques.

    Does nothing unless ``num_diff_consensus`` and ``nm`` are both positive.
    For each key of ``CONsensus`` (its first ``L`` characters), the positions
    in ``STP``/``STPNUM`` whose decoded window is within ``d`` of the
    consensus (HammingDistanceCalc) are copied to ``STP2``/``STPNUM2``,
    going from the last sequence to the first and stopping at the first
    sequence without a hit.  When every sequence has a hit, the clique is
    written to ``fp_consen`` and ``num_rescan_consensus`` is incremented.
    A summary line is printed at the end.

    Fixes over the previous version: the decoded window is used for the
    comparison and the output (not the unset global ``str2``), and cliques
    are numbered from 1 (the counter was incremented and then printed with
    another +1).
    """
    global num_rescan_consensus

    if not (num_diff_consensus > 0 and nm > 0):
        return

    for consensus_key in CONsensus:
        target = consensus_key[:L]

        for instancei in range(RealSequenceNumber - 1, -1, -1):
            STPNUM2[instancei] = 0
            for vj in range(STPNUM[instancei]):
                start = STP[instancei][vj]
                instance = ReTranslate2charString(Vertex[instancei][start])
                if HammingDistanceCalc(target, instance) <= d:
                    STP2[instancei][STPNUM2[instancei]] = start
                    STPNUM2[instancei] += 1
            if STPNUM2[instancei] == 0:
                break  # no need to check the following sequences
        else:
            # Reached only when no sequence came up empty.
            num_rescan_consensus += 1
            fp_consen.write("\nMOTIF %d/%d_%s length  %d_bp\n\n" % (num_rescan_consensus, num_diff_consensus, consensus_key, L))

            for instancei in range(RealSequenceNumber - 1, -1, -1):
                for vj in range(STPNUM2[instancei]):
                    start = STP2[instancei][vj]
                    instance = ReTranslate2charString(Vertex[instancei][start])
                    fp_consen.write("    %15s%6d   %s\n" % (Seq[instancei].name, start, instance))

    print("      %d clique(s) have been obtained by rescanning (size >= %d)." % (num_rescan_consensus, RealSequenceNumber))

In [None]:
def _open_or_exit(path, mode, message):
    """Open *path* in *mode*; print *message* and exit(1) if that fails."""
    try:
        return open(path, mode)
    except OSError:
        print(message)
        sys.exit(1)


def main(argv):
    """Run the whole motif search for the data file named on the command line.

    Steps: parse the options, open the requested output files, read and
    digit-encode the sequences, prepare the work tables and the chunk table,
    extract the vertices, build trees for every valid reference vertex of
    sequence 0, then write the requested clique reports and the elapsed time.

    All state shared with the other routines is declared ``global`` here;
    before, the assignments created locals, so the helpers kept seeing the
    module defaults (including the loop variables ``vt_r`` and ``i`` that
    Find_vt_r_connections_level_i reads).  Output files are closed even when
    a step raises.

    Args:
        argv: Full argument vector, program name first.

    Returns:
        0 on completion.

    Raises:
        SystemExit: from argument parsing, or when a file cannot be opened.
    """
    global startT, finishT
    global fp_time, fp_tree, fp_consen, fp_merge_clique, fp_standard
    global OutputFileFullName, RecanFileName, MergeFileName, StardFileName
    global CONSENSUS3, RealSequenceNumber, HD
    global ref_seq, ref_seq2, isSorted, isValid_vt_r, num_rescan_consensus
    global i, vt_r

    # read in parameters from the cmd line
    CmdlineCheck(argv)

    startT = time.time()

    fp_time = fp_tree = fp_consen = fp_merge_clique = fp_standard = None
    try:
        # file to record execution time
        fp_time = _open_or_exit(OutputTimeFileName, "a+", "Cannot open file to write in running time!!!")

        # file to record trees
        if isOTree:
            print("\nBegin searching trees for file: %s, " % Filename[0])
            OutputFileFullName = "Tree_of_" + Filename[0]
            fp_tree = _open_or_exit(OutputFileFullName, "w", "Cannot open output file!!!")

        # file to record cliques of size == m
        if isOOPS:
            RecanFileName = "R_of_" + Filename[0]
            fp_consen = _open_or_exit(RecanFileName, "w", "Cannot open output file!!!")

        # file to record cliques of size >= m
        if isEMOPS:
            MergeFileName = "R_MC_of_" + Filename[0]  # results of merged clique
            fp_merge_clique = _open_or_exit(MergeFileName, "w", "Cannot open output file!!!")
            CONSENSUS3 = [[''] * L for _ in range(nm + 1)]

        # file to record cliques with required format
        if isStandard:
            # Everything before the first '.' of the data file name (the
            # whole name when it has no dot).
            StardFileName = Filename[0].split('.', 1)[0] + "_pred.txt"
            fp_standard = _open_or_exit(StardFileName, "w", "Cannot open output file!!!")

        # read in data from the data file
        RealSequenceNumber = 0
        ReadinData()

        # translate the data into forms of 1234
        Translate2intSeq()

        # prepare related variables
        Prepare_variables(MaxSeqLen)

        # prepare the distance array (keep the returned table)
        HD = InitializeHD()

        # set the first two sequences as the reference sequences
        ref_seq = 0
        ref_seq2 = 1
        print("Selected reference sequences are %s and %s...\n" % (Seq[ref_seq].name[1:], Seq[ref_seq2].name[1:]))

        # find all l-mer substrings using a sliding window
        Find_vertices()

        # start tree construction
        isSorted = False
        STPNUM[0] = Seq[0].length - L + 1
        num_rescan_consensus = 0
        for vt_r in range(STPNUM[0]):
            STP[0][vt_r] = vt_r
            isValid_vt_r = True
            for i in range(1, RealSequenceNumber):
                Find_vt_r_connections_level_i()
                if STPNUM[i] == 0:
                    # a reference vertex needs a connection in every sequence
                    isValid_vt_r = False
                    break
            if isValid_vt_r:
                Construct_trees()
            # Output_rescan_cliques() is not used in this version.

        # output results of cliques
        if isStandard:
            # output cliques under standard format
            Output_Standard_Cliques()
        if isEMOPS:
            # get the consensuses
            Consensus_Calc()

            # eliminate instances
            Eliminate_Cinstance(STPNUM3, STP3, RealSequenceNumber)

            # output the merged cliques
            Output_Merged_Cliques()
            print("   Total number of different cliques found: %d; expected: %d.\n" % (i_diff_clique, nm))
        if isOOPS:
            # output the cliques of size m
            Output_cliques()

        # output execution time; time.time() already counts seconds, so the
        # difference is reported as-is (it used to be divided by 1000).
        finishT = time.time()
        elapsed = finishT - startT
        print("   Total time for the file %s: %.4f seconds.\n" % (Filename[0], elapsed))
        fp_time.write("%s_L%d_d%d_nm%d: %.4f s\n" % (Filename[0], L, d, nm, elapsed))
        if isOTree:
            fp_tree.write("Total time for the file %s: %.4f seconds.\n" % (Filename[0], elapsed))
    finally:
        for handle in (fp_consen, fp_tree, fp_merge_clique, fp_standard, fp_time):
            if handle is not None:
                handle.close()

    return 0

# Script entry point: when this file is executed directly (rather than
# imported), run main() with the full argument vector, program name included.
if __name__ == "__main__":
    main(sys.argv)