# GOR III secondary structure prediction
Name: Yanfang Guo   
Department : MACS   
Email: Yanfang.Guo@vub.be

## 1.Some preprocessing methods

- amino_dict{}, it is used to translate, such as Aly --> A,Cys --> C
- struc_dict{}, it is used to translate, such as Coli -->C
- pre_process(), translate the dssp_info.txt,stride_info.txt using the dictionary
    - for some amino acids which are not in the 20 normal amino acids, translate it into '?'
- read_txt(), read a txt file and return a 2-dimension list.

In [1]:
f = open("./txtfile/amino_dict.txt")
amino_dict = {}
for i in f.readlines():
    temp = i.strip().split()
    amino_dict[temp[0].upper()] = temp[1]
f.close()



f = open("./txtfile/stru_dict.txt")
stru_dict = {}
for i in f.readlines():
    temp = i.strip().split()
    stru_dict[temp[0].upper()] = temp[1]
f.close()

amino = []
for i in amino_dict.keys():
    amino.append(amino_dict[i])
amino.insert(0,"?")

def pre_process(file, newfile):
    '''
    do preprocessing, only for dssp_info and stride_info
    do with uncertain values
    :param list_2: 2-d list
    :return: 1-d list
    '''
    list_2 = read_txt(file)

    fw = open(newfile, 'w')
    for i in list_2:
        if i[3].upper() in amino_dict.keys():
            i[3] = amino_dict[i[3].upper()]
        else:
            i[3] = "?"
        if i[4].upper() in stru_dict.keys():
            i[4] = stru_dict[i[4].upper()]
        s = ''
        for j in i:
            s += j + ' '
        s += '\n'
        fw.write(s)

    return list_2

In [2]:
def read_txt(filename):
    '''
    :use to read structured txt file seperated with ' '
    :param filename: the position of the file
    :return: a two demension list
    '''
    f = open(filename)
    result = []
    for i in f.readlines():
        temp = i.strip().split()
        result.append(temp)
    f.close()
    return result

## 2. Calculate fSR
![](https://ww2.sinaimg.cn/large/006tNc79gy1ferkelr9yjj30je03w3ye.jpg)


In [3]:
from numpy import log
from numpy import sqrt


def fSR(list_2):
    '''
    calculate the fSR used in the GOR III Aalgorithm
    :param list_2:  list from the read_txt(filename)
    :return: a 2 d dictionary
    '''
    result = {}
    fS = {}
    fR = {}

    # initialize result
    for i in ['C','E','H']:
        temp ={}
        for j in amino:
            # user the 0.001 to avoid the divide by 0 issue
            temp[j] =0.001
        result[i] = temp

    # initialize fs

    for i in ['C', 'E', 'H']:
        fS[i] =0.001
    for i in amino:
        fR[i] = 0.001

    for i in list_2:
        # calculate fSR
        result[i[4]][i[3]] +=1
        fS[i[4]] +=1
        fR[i[3]] +=1

    return result, fS, fR

In [4]:
dssp2 = pre_process("./txtfile/dssp_info.txt", "./txtfile/dssp.txt")
stride = pre_process("./txtfile/stride_info.txt", "./txtfile/stride.txt")

fsr, fs, fr = fSR(dssp2)

# fsr2 is the parameter of stride
fsr2, fs2, fr2 = fSR(stride)

total = fs['C'] + fs['E'] + fs['C']
total2 = fs2['C'] + fs2['E'] + fs2['C']

## Calculate fSRM
- fsrm is a 3\*20\*20\*8 dictionary
- the initial value is 0.001, in order to solve the divided by 0 issue.
![](https://ww4.sinaimg.cn/large/006tNc79gy1ferkenz2ihj30jk09st8u.jpg)

In [5]:
f = open("./txtfile/dssp_protein")

list = f.readlines()
list2 = []
for i in range(0, len(list), 3):
    temp = []
    temp.append(list[i + 1].strip())
    temp.append(list[i + 2].strip())
    temp.append(list[i].strip())
    list2.append(temp)
f.close()

# dealing the statistics
fsrm = {}

# initialize
for i in ['C', 'E', 'H']:
    temp3 = {}
    for j in amino:
        temp2 = {}
        for k in amino:
            temp = {}
            for n in range(1, 9):
                temp[n] = 0.001

            temp2[k] = temp

        temp3[j] = temp2
    fsrm[i] = temp3

for i in list2:
    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm[i[1][j]][i[0][j]][i[0][k + j]][k] += 1

## GOR III algorithm

In [6]:
# gor3 algorithm use the dssp file
def gor3(alist):
    result = []
    for i in range(len(alist)):

        helix = log(fsr['H'][alist[i]] / (fsr['C'][alist[i]] + fsr['E'][alist[i]])) + log(
            (fs['E'] + fs['C']) / fs['H'])
        coil = log(fsr['C'][alist[i]] / (fsr['H'][alist[i]] + fsr['E'][alist[i]])) + log(
            (fs['H'] + fs['E']) / fs['C'])
        sheet = log(fsr['E'][alist[i]] / (fsr['C'][alist[i]] + fsr['H'][alist[i]])) + log(
            (fs['H'] + fs['C']) / fs['E'])

        for j in range(-8, 9):
            t = i + j
            if t > 0 and t < len(alist):
                if j != 0:
                    helix += log(fsrm['H'][alist[i]][alist[t]][abs(j)] / (
                        fsrm['E'][alist[i]][alist[t]][abs(j)] + fsrm['C'][alist[i]][alist[t]][abs(j)])) + log(
                        (fsr['E'][alist[i]] + fsr['C'][alist[i]]) / fsr['H'][alist[i]])

                    coil += log(fsrm['C'][alist[i]][alist[t]][abs(j)] / (
                        fsrm['E'][alist[i]][alist[t]][abs(j)] + fsrm['H'][alist[i]][alist[t]][abs(j)])) + log(
                        (fsr['E'][alist[i]] + fsr['H'][alist[i]]) / fsr['C'][alist[i]])
                    sheet += log(fsrm['E'][alist[i]][alist[t]][abs(j)] / (
                        fsrm['H'][alist[i]][alist[t]][abs(j)] + fsrm['C'][alist[i]][alist[t]][abs(j)])) + log(
                        (fsr['H'][alist[i]] + fsr['C'][alist[i]]) / fsr['E'][alist[i]])

        if coil == max(sheet, coil, helix):
            result.append('C')
        elif sheet == max(sheet, coil, helix):
            result.append('E')

        else:
            result.append('H')

    return ''.join(result)

## Validation procedure
- how to leave one out efficiently?
    - we simply use the (overall fsrm - the sequence fsrm)
        ```
    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm[i[1][j]][i[0][j]][i[0][k + j]][k] -= 1
    for j in range(len(i[0])):
        fsr[i[1][j]][i[0][j]] -=1
        fs[i[1][j]] -=1
        fr[i[0][j]] -=1
       ```
    - after we run the fsrm for the specific sequence, we should restore the state
    
       ```
    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm[i[1][j]][i[0][j]][i[0][k + j]][k] += 1
    for j in range(len(i[0])):
        fsr[i[1][j]][i[0][j]] +=1
        fs[i[1][j]] +=1
        fr[i[0][j]] +=1
       ```
       
    - for convenience, we write the result into a dssp_predict file, you can simply open the txt file and see the result

In [7]:
f = open("./txtfile/dssp_predict.txt", 'w')

predict = ""
right = ""

for i in list2:
    
    # leave one out procedure
    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm[i[1][j]][i[0][j]][i[0][k + j]][k] -= 1
    for j in range(len(i[0])):
        fsr[i[1][j]][i[0][j]] -=1
        fs[i[1][j]] -=1
        fr[i[0][j]] -=1
        
    


    f.write(i[2] + '\n')
    f.write(i[0] + '\n')
    # write the right structure
    f.write(i[1] + '\n')
    right += i[1]

    # write the predict structure
    temp = gor3(i[0])
    predict += temp
    f.write(temp + '\n')

    # restore

    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm[i[1][j]][i[0][j]][i[0][k + j]][k] += 1
    for j in range(len(i[0])):
        fsr[i[1][j]][i[0][j]] +=1
        fs[i[1][j]] += 1
        fr[i[0][j]] += 1


f.close()

enum = 0
hnum = 0
cnum = 0
for i in right:
    if i == 'C':
        cnum += 1
    elif i == 'E':
        enum += 1
    else:
        hnum += 1

ewrong = 0
hwrong = 0
cwrong = 0
num = 0
for i in range(len(predict)):
    if predict[i] == right[i]:
        num += 1
    elif right[i] == 'E':
        ewrong += 1
    elif right[i] == 'C':
        cwrong += 1
    else:
        hwrong += 1

print('the q3 score:',num / len(predict))


def mcc(predict, right, stru):
    TP, FP, FN, TN = 0.0001, 0.0001, 0.0001, 0.0001
    for i in range(len(predict)):
        if predict[i] == stru:
            if right[i] == stru:
                TP += 1
            else:
                FP += 1
        else:
            if right[i] == stru:
                FN += 1
            else:
                TN += 1
    MCC = (TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    TRP = TP / (TP + FN)
    SPC = TN / (FP + TN)

    return MCC
print("MCC SCORE FOR 'H' ",mcc(predict,right,'H'))
print("MCC SCORE FOR 'E' ",mcc(predict,right,'E'))
print("MCC SCORE FOR 'C' ",mcc(predict,right,'C'))

the q3 score: 0.5775657996105952
MCC SCORE FOR 'H'  0.389297018193
MCC SCORE FOR 'E'  0.318795051232
MCC SCORE FOR 'C'  0.37510051448


##  do gor3 for stride_info.txt

In [8]:
f = open("./txtfile/stride_protein")

list = f.readlines()
list2 = []
for i in range(0, len(list), 3):
    temp = []
    temp.append(list[i + 1].strip())
    temp.append(list[i + 2].strip())
    temp.append(list[i].strip())
    list2.append(temp)
f.close()
# print(list2)
# dealing the statistics
fsrm2 = {}
# initialize
amino = []
for i in amino_dict.keys():
    amino.append(amino_dict[i])
amino.insert(0,"?")
# print(amino)
for i in ['C', 'E', 'H']:
    temp3 = {}
    for j in amino:
        temp2 = {}
        for k in amino:
            temp = {}
            for n in range(1, 9):
                temp[n] = 0.001

            temp2[k] = temp

        temp3[j] = temp2
    fsrm2[i] = temp3

for i in list2:
    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm2[i[1][j]][i[0][j]][i[0][k + j]][k] += 1

In [9]:
def gor32(alist):
    result = []
    for i in range(len(alist)):

        helix = log(fsr2['H'][alist[i]] / (fsr2['C'][alist[i]] + fsr2['E'][alist[i]])) + log(
            (fs2['E'] + fs2['C']) / fs2['H'])
        coil = log(fsr2['C'][alist[i]] / (fsr2['H'][alist[i]] + fsr2['E'][alist[i]])) + log(
            (fs2['H'] + fs2['E']) / fs2['C'])
        sheet = log(fsr2['E'][alist[i]] / (fsr2['C'][alist[i]] + fsr2['H'][alist[i]])) + log(
            (fs2['H'] + fs2['C']) / fs2['E'])

        for j in range(-8, 9):
            t = i + j
            if t > 0 and t < len(alist):
                if j != 0:
                    helix += log(fsrm2['H'][alist[i]][alist[t]][abs(j)] / (
                        fsrm2['E'][alist[i]][alist[t]][abs(j)] + fsrm2['C'][alist[i]][alist[t]][abs(j)])) + log(
                        (fsr2['E'][alist[i]] + fsr2['C'][alist[i]]) / fsr2['H'][alist[i]])

                    coil += log(fsrm2['C'][alist[i]][alist[t]][abs(j)] / (
                        fsrm2['E'][alist[i]][alist[t]][abs(j)] + fsrm2['H'][alist[i]][alist[t]][abs(j)])) + log(
                        (fsr2['E'][alist[i]] + fsr2['H'][alist[i]]) / fsr2['C'][alist[i]])
                    sheet += log(fsrm2['E'][alist[i]][alist[t]][abs(j)] / (
                        fsrm2['H'][alist[i]][alist[t]][abs(j)] + fsrm2['C'][alist[i]][alist[t]][abs(j)])) + log(
                        (fsr2['H'][alist[i]] + fsr2['C'][alist[i]]) / fsr2['E'][alist[i]])

        if coil == max(sheet, coil, helix):
            result.append('C')
        elif sheet == max(sheet, coil, helix):
            result.append('E')

        else:
            result.append('H')

    return ''.join(result)

In [10]:
f = open("./txtfile/stride_predict.txt", 'w')

predict = ""
right = ""

for i in list2:
    # leave one out procedure

    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm2[i[1][j]][i[0][j]][i[0][k + j]][k] -= 1
    for j in range(len(i[0])):
        fsr2[i[1][j]][i[0][j]] -=1
        fs2[i[1][j]] -=1
        fr2[i[0][j]] -=1


    f.write(i[2] + '\n')
    f.write(i[0] + '\n')
    # write the right structure
    f.write(i[1] + '\n')
    right += i[1]

    # write the predict structure
    temp = gor32(i[0])
    predict += temp
    f.write(temp + '\n')

    # restore

    for j in range(len(i[0]) - 8):
        for k in range(1, 9):
            fsrm2[i[1][j]][i[0][j]][i[0][k + j]][k] += 1
    for j in range(len(i[0])):
        fsr2[i[1][j]][i[0][j]] +=1
        fs2[i[1][j]] +=1
        fr2[i[0][j]] +=1


f.close()

enum = 0
hnum = 0
cnum = 0
for i in right:
    if i == 'C':
        cnum += 1
    elif i == 'E':
        enum += 1
    else:
        hnum += 1

ewrong = 0
hwrong = 0
cwrong = 0
num = 0
for i in range(len(predict)):
    if predict[i] == right[i]:
        num += 1
    elif right[i] == 'E':
        ewrong += 1
    elif right[i] == 'C':
        cwrong += 1
    else:
        hwrong += 1

print('the q3 score:',num / len(predict))
print("MCC SCORE FOR 'H' ",mcc(predict,right,'H'))
print("MCC SCORE FOR 'E' ",mcc(predict,right,'E'))
print("MCC SCORE FOR 'C' ",mcc(predict,right,'C'))

the q3 score: 0.5765015405827483
MCC SCORE FOR 'H'  0.391738962391
MCC SCORE FOR 'E'  0.323456770868
MCC SCORE FOR 'C'  0.367702355075


## Comparison between dssp_info.txt and stride_info.txt
- the q3 score of dssp(57.75%) is slightly higher than stride(57.65%)
- but the stride gives better prediction with MCC score for H(39.17%) and E(32.34%) than dssp(38.9%,31.8% respectively)
- dssp gives better prediction for 'E' with MCC score 37.5, while that in stride is 36.7%

# the family prediction
- score = (count('H')-count('E'))/(len(list)-count('C'))
- score< boarder1 family='B'    
  score>boarder2 family='A'    
  boarder1<score < boarder2 family ='AB'
- use the mean value as boarder1 and boarder2
   
     

In [11]:
def read_fasta(filename):
    f = open(filename)
    proteins = []
    d = f.read().split(">")

    # deal with the  "" before the first ""
    d.pop(0)
    for i in d:
        t = i.splitlines()
        # get rid of the line of the protein name
        t.pop(0)
        s = "".join(t)
        proteins.append(s)
    return proteins

In [12]:
from numpy import sqrt

f = open("./txtfile/cath_info.txt")

fw = open("./txtfile/cath.txt", 'w')

dict = {}
dict['Alpha'] = 'A'
dict['Beta'] = 'B'
dict["Alpha/beta"] = 'AB'
dict["None"] = 'N'

for i in f.readlines():
    t = i.strip().split()

    t[2] = dict[t[2]]

    fw.write(t[0] + ' ')
    fw.write(t[1] + ' ')
    fw.write(t[2] + '\n')

fw.close()
f.close()

f = open("./txtfile/cath.txt")
right = []
for i in f.readlines():
    t = []
    t.append(i.strip().split()[0])
    t.append(i.strip().split()[2])
    right.append(t)

f.close()

f = open("./txtfile/dssp_predict.txt")
list = f.readlines()
list2 = []
for i in range(0, len(list), 4):
    t = []
    t.append(list[i].strip().split()[0])
    t.append(list[i].strip().split()[2])
    t.append(list[i + 3].strip())
    t.append(list[i + 2].strip())
    t.append(list[i + 1].strip())
    list2.append(t)


# print(list2)

def family_predict(alist):
    cnum = 1
    enum = 1
    hnum = 1
    for i in alist:
        if i == 'C':
            cnum += 1
        elif i == 'E':
            enum += 1
        else:
            hnum += 1

    return (hnum-enum)/(len(alist)-cnum)


result = []
for i in list2:
    t = []
    t.append(family_predict(i[2]))
    t.append(i[0])
    t.append(i[2])
    t.append(i[3])
    t.append(i[4])
    result.append(t)

#print(result)


def q3(right, predict):
    num = 0
    for i in range(len(right)):
        if right[i] == predict[i]:
            num += 1
    return num / len(right)

# use the mean value as boarder
sumB = 0
sumA = 0
numB = 0
numA = 0
sumAB = 0
numAB = 0
for i in range(len(right)):

    if right[i][1] == 'A':
        sumA += result[i][0]
        numA += 1
    elif right[i][1] =='B':
        sumB += result[i][0]
        numB += 1
    else:
        sumAB += result[i][0]
        numAB += 1

temp = []
accuracy = 0

for i in range(len(right)):
    t = []
    t.append(result[i][0])
    t.append(right[i][1])

    if result[i][0] > sumA/numA:
        t.append('A')
    elif result[i][0] < sumB/numB:
        t.append('B')
    else:
        t.append('AB')

    t.append(right[i][0])

    t.append(result[i][2])
    t.append(result[i][3])
    t.append(result[i][4])
    t.append(q3(result[i][3], result[i][2]))
    t.append(mcc(result[i][3], result[i][2], 'H'))
    t.append(mcc(result[i][3], result[i][2], 'E'))
    t.append(mcc(result[i][3], result[i][2], 'C'))
    temp.append(t)

    if t[1] == t[2]:
        accuracy += 1


print("the family prediction accuracy = ", accuracy / len(right))

print()

#  write the result into a csv file
f = open("./txtfile/stru_family_predict.csv", 'w')
f.write('H-E,')
f.write('actual family,')
f.write('predict family,')
f.write('name,')
f.write('predict structure,')
f.write('actual structure,')
f.write("amino,")
f.write('q3,')
f.write('mccH,')
f.write('mccE,')
f.write('mccC,')
f.write('\n')

print("pdb ","predict structure                       ","family","q3   ",'mccH','mccE','mccC')
print()
for i in temp:
    f.write(str(i[0]) + ',')
    f.write(str(i[1]) + ',')
    f.write(str(i[2]) + ',')
    f.write(str(i[3]) + ',')
    f.write(str(i[4]) + ',')
    f.write(str(i[5]) + ',')
    f.write(str(i[6]) + ',')
    f.write(str(i[7]) + ',')
    f.write(str(i[8]) + ',')
    f.write(str(i[9]) + ',')
    f.write(str(i[10]) + '\n')
    print(str(i[3])+' ',str(i[4])+' ',str(i[2])+' ',str(i[7])[:5]+' ',str(i[8])[:5]+' ',str(i[9])[:5]+' ',str(i[10])[:5]+' ')

#print(temp)

the family prediction accuracy =  0.6847389558232931

pdb  predict structure                        family q3    mccH mccE mccC

1w0n  HCHHEHHECEEEHHECHEECCCCCCEEEEHCHEHHCECEEEECCCEEEEEECCHCCCCCEEEEEEEEEEEEEEEEECCCCCEEECEECHCECCCCHHEEEEEHCCCCCEECEEHHHEEEE  B  0.616  0.001  0.380  0.379 
2gpi  CHCCCHEEEHCHEHCECEHHHHHHHHHCCHHEEHEHEHHHHHHHHHHHHHHCHHHHHHHHHHHCHHHHHHHHHHHHHHCHHCHEEEEEHHE  A  0.538  0.392  0.362  0.322 
1vbw  EEHCCCCCCCCEECCCCHHHHHHHHHCCHCHEEEEEEECCCCCHCEECCEEEEEECEHCEECCCCCCC  B  0.720  0.665  0.580  0.474 
2odk  EEEEHHCCHHHHHHHHHHCHECCCCCEECHHCHHHHEEEEHCCHHHHHHHH  AB  0.607  0.372  0.467  0.228 
2zxy  HCCCHEEHHHHCCCCECECEEEHCCCHHHHHHHHHHHHHHHHHHHCCCCCHHECHCHHHHCHHCHHHHHHHCHHHHHHHHHHHEHH  A  0.593  0.375  0.001  0.360 
2pr7  CHCCEEEEEEEEECCCCHECHHHHHHHHHHHHCCHCEEEECCCCCCCCCCEHHEHECCCEHEEEEEEEEHCEHHHCHHHHHHHHHHHCCCECCCEEECCEEECHECHHEHEEEEEHEHCHCHEHEHEEEEHCCCCEH  AB  0.605  0.493  0.357  0.435 
2pyq  CECCHCHHHHHHHHHHEEEECCCCCHHHHEHHCHCECCEEECECHECEECCCCEEEHHEHHHHHHHHHCCHCCHHE

## Improvement with multiple sequence alignment

1.download the result of multiple sequence alignment
2.the procedure 
    - use the tostr() method to get rid of the '-' and return sequence
    - apply the gor3 algorithm
    - reverse() method is used to insert the gap for the gor3 predict result
    - for every position, adopt the most 

In [13]:
# GHR--G-D -> GHRGD
def tostr(alist):
    result = ''
    for i in alist:
        if i != '-':
            result += i
    return result


# GHRGD -->EEE--C-D
def reverse(alist1, alist):
    result = ''

    # index of alist
    j = 0
    for i in alist1:
        if i != '-':
            result += alist[j]
            j += 1
        else:
            result += '-'
    return result


astru_right="CCCCCCCCCCCCCCHHHHHHHHHHHHHHCCCCEEEEEEEECCCCCEEEEEEECCCCCCCCEEEEEECCCCCCHHHHHHHHHHHHHHHHHCCCCHHHHHHHHHCEEEEECCCCHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCHHHCCCCCCCCCCCECCCCCCCECCCCCCCCHHHHHHHHHHHHHCCEEEEEEEEECCCEEEECCCCCCCCCCCHHHHHHHHHHHHHHHHHHHCCCCEEEEHHHHCCCCCCCHHHHHHHCCCCEEEEEEECCCCCCHHHCCHHHHHHHHHHHHHHHHHHHHHHHHC"
bstru_right ='CCCCECECCCCCECECCCEEEEEECCHHHCCCEEEEEECCEEEEEEEEECCCCCCCCCCEEEEECCCCCCCCECECCCCEEEEECCCCCCCCCCECEECCCCECCECEEECCCCCCCCCCCHHHCEEEEECECCCCCCEEEEEECCCEEECEEECCCCCCCCEEECCCCECCEEEEEECCC'
cstru_right = 'CCCCCCCCCCCCCCCCCCCCHHHHHHHHHCHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHHHHHHCECCCCCCCCCCCHHHHHHHHHHHCCHHHHHHHHHHHHHCCCCCEEEEEEEECCCCEEEEEEEECCCECCCCCCEEEEEEECCHHHCHHHHCCCHHHHHHHHHHHECHHHHHHHHHHHCCC'
dstru_right = 'CCCCCECCCECCCECCCCCCCEEEEEEECCEEEEEEEHHHHHHHHHHHHHHHHHHHCCCCEECCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHEEECCCCEEEECCCCCHHHHHHHHCCCCCCHHHHHHHHHHHCCCCC'
estru_right ='CCCCCCCCCCCHHHCCCCHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHCHHHHCCCC'
fstru_right = 'CCCCCCCCCCCCCCCHHHHHHHHHHHHCCCCHHHHHHHCEECCCCEECCCCCCCEEEECCCCCHHHHCCCCCCCCCCCCCCCCC'
gstru_right = 'CCHHHHHHHCCCCCCCCCCCCCCCCCHHHHHHHCCCCHHHHHHHHCCCCCCCCCCCCHHHHCCCCCCCCCCCHHHHHHHHHHHHCC'

def q3(list1,list2):
    right = 0
    for i in range(len(list1)):
        if list1[i] == list2[i]:
            right+=1
    return right/len(list1)


def align_predict(filename):
    seq = read_fasta(filename)
    for i in range(len(seq[0])):
        if seq[0][i] != '-':
            start = i
            break

    for i in range(len(seq[0]) - 1, 0, -1):
        if seq[0][i] != '-':
            end = i + 1
            break

    result = []
    for i in range(len(seq)):
        t1 = seq[i][start:end]
        t2 = gor3(tostr(t1))
        result.append(reverse(t1, t2))
        
    stru_pred = ''
    for i in range(len(result[0])):
        if result[0][i] != '-':
            temp = [result[j][i] for j in range(len(result))]

            t1 = temp.count('E')
            t2 = temp.count('C')
            t3 = temp.count('H')

            if t1 == max(t1, t2, t3):
                stru_pred += 'E'
            elif t2 == max(t1, t2, t3):
                stru_pred += 'C'
            else:
                stru_pred += 'H'
    return stru_pred

In [14]:
t = align_predict("./fasta/a_align1.fasta")
print("the predict structure for A:")
print(t)
print("the right structure:")
print(astru_right)
print("the q3 score with multi-seq-align:",q3(t,astru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/a_align1.fasta")[0])),astru_right))

the predict structure for A:
CCCCCCEEEECEECHHHHEHHHHHHHHHCCCHEEEEEEECCCCCCEEEEEEEECCCCCCCEEHEECCCECHHHEHCHHHHCHHHHEHHCECCCCCEEHEHHCHHEHEEEECCCCCCEEEECCCCEHEEHCHCCCCCEEEEECCCCCCECCECCCCCCCCCCEEEECCCCCCCEEEEHEEEHEHECCCCHHHHEHHHHHHEEEEECCCECCCCCCCHHHHHHHHHHHHHHHHHEECEEEEECEEEEEEEECCCCCECEEEECCEHEEEEEEEHCHCCEEEEECCCEHHHCCHHHEECHHHHHHHHHHC
the right structure:
CCCCCCCCCCCCCCHHHHHHHHHHHHHHCCCCEEEEEEEECCCCCEEEEEEECCCCCCCCEEEEEECCCCCCHHHHHHHHHHHHHHHHHCCCCHHHHHHHHHCEEEEECCCCHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCHHHCCCCCCCCCCCECCCCCCCECCCCCCCCHHHHHHHHHHHHHCCEEEEEEEEECCCEEEECCCCCCCCCCCHHHHHHHHHHHHHHHHHHHCCCCEEEEHHHHCCCCCCCHHHHHHHCCCCEEEEEEECCCCCCHHHCCHHHHHHHHHHHHHHHHHHHHHHHHC
the q3 score with multi-seq-align: 0.6058631921824105
the q3 score without multi-seq-align: 0.498371335504886


In [15]:
t = align_predict("./fasta/b_align2.fasta")
print("the predict structure for B:")
print(t)
print("the right structure:")
print(bstru_right)
print("the q3 score with multi-seq-align:",q3(t,bstru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/b_align2.fasta")[0])),bstru_right))

the predict structure for B:
CCCCCCECCCCEEEECCEEEEEECCCCCCCCCEECCCCCCCCCCEEEHHHCCHCCCCCCEEECCCCCCCCCEEEEEECCEEEECCHHCEHEECCEEEECCCHCCCCEEEEECCECCCCCCCCCCHEEHEHCCCCEHCCEEEEHHCCCCCEECECECCCCCHHHEECCCCCCEEEEHHHCCC
the right structure:
CCCCECECCCCCECECCCEEEEEECCHHHCCCEEEEEECCEEEEEEEEECCCCCCCCCCEEEEECCCCCCCCECECCCCEEEEECCCCCCCCCCECEECCCCECCECEEECCCCCCCCCCCHHHCEEEEECECCCCCCEEEEEECCCEEECEEECCCCCCCCEEECCCCECCEEEEEECCC
the q3 score with multi-seq-align: 0.6408839779005525
the q3 score without multi-seq-align: 0.569060773480663


In [16]:
t = align_predict("./fasta/c_align2.fasta")
print("the predict structure for C:")
print(t)
print("the right structure:")
print(cstru_right)
print("the q3 score with multi-seq-align:",q3(t,cstru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/c_align2.fasta")[0])),cstru_right))

the predict structure for C:
HEEECCCCCCCCCHECCCHHCHHHHEHHHHHHHHHHHHHCHHHHHHHHHHHHHCHHHHHHHHHCHHEHEECCECHEEEHECECCCCCCCCCCCCHHHHHHHHHHHCHHHHHHHHHHHHHEHHCCCEEEEEHHCHCCHEEHEHHHHHCCCCCCCCCCEHCHCHHHHHHHHHHHCHEHHHEHHHEHHEEEHHHHHHHHHHHHH
the right structure:
CCCCCCCCCCCCCCCCCCCCHHHHHHHHHCHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHHHHHHCECCCCCCCCCCCHHHHHHHHHHHCCHHHHHHHHHHHHHCCCCCEEEEEEEECCCCEEEEEEEECCCECCCCCCEEEEEEECCHHHCHHHHCCCHHHHHHHHHHHECHHHHHHHHHHHCCC
the q3 score with multi-seq-align: 0.681592039800995
the q3 score without multi-seq-align: 0.6119402985074627


In [17]:
t = align_predict("./fasta/d_align.fasta")
print("the predict structure for D:")
print(t)
print("the right structure:")
print(dstru_right)
print("the q3 score with multi-seq-align:",q3(t,dstru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/d_align.fasta")[0])),dstru_right))

the predict structure for D:
HHHHEHEHHHHCCCCHHECEHEEEEECCECCCCHHHHHHHHHHHHHHHCCCHHHHHHHCCHHHHHHEHHHHHHCHEHHHHHHHHHHCEEEEHHCCHHHHEHHHHHHHEEEHECHHHHHHHHHHHHHHHCHHHHCCCHEHEEHEHCHHEEHHECHCCCEEHEEEHHHHCHEEHHEC
the right structure:
CCCCCECCCECCCECCCCCCCEEEEEEECCEEEEEEEHHHHHHHHHHHHHHHHHHHCCCCEECCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHEEECCCCEEEECCCCCHHHHHHHHCCCCCCHHHHHHHHHHHCCCCC
the q3 score with multi-seq-align: 0.5028571428571429
the q3 score without multi-seq-align: 0.4857142857142857


In [18]:
t = align_predict("./fasta/e_align1.fasta")
print("the predict structur for E:")
print(t)
print("the right structure:")
print(estru_right)
print("the q3 score with multi-seq-align:",q3(t,estru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/e_align1.fasta")[0])),estru_right))

the predict structur for E:
CECCCCCCECCCCHEEHEHEHHHHCHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHEHHHHHHHEEEEHCEEEHCCH
the right structure:
CCCCCCCCCCCHHHCCCCHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHCHHHHCCCC
the q3 score with multi-seq-align: 0.7433628318584071
the q3 score without multi-seq-align: 0.6106194690265486


In [19]:
t = align_predict("./fasta/f_align.fasta")
print("the predict structure for F:")
print(t)
print("the right structure:")
print(estru_right)
print("the q3 score with multi-seq-align:",q3(t,fstru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/f_align.fasta")[0])),estru_right))

the predict structure for F:
CCCCCCHECEECHHCEHHHHHHHHHHHHHHHHHHHHHHCECHECECCCCCHHHEHHHECCCCCCEEECCCCCCHCHEHCCCCCC
the right structure:
CCCCCCCCCCCHHHCCCCHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHCHHHHCCCC
the q3 score with multi-seq-align: 0.6190476190476191
the q3 score without multi-seq-align: 0.4880952380952381


# Analyse of the algorithm with multiple sequence align

- all the 6 sequences show around 10% improvements except for sequence D
- the reason for the low improvement of sequence D
![](https://ww4.sinaimg.cn/large/006tNc79gy1feriw1m7hej31ak0k6tdf.jpg)
### from the picture we can see that multiple sequences are have almost **the same amino acids sequence**, so they will give almost the same result in GOR III algorithm

### So it is important to select the sequence with the same structure but not the same amino acids sequences



## Explore the influence of the selection of sequences for multi-seq-alignment
- here I have 3 different multi-seq-alignment data sets for B

the first data set, with more similarities in amino acid sequences
![](https://ww2.sinaimg.cn/large/006tNc79gy1ferjxtddufj31kw0f50va.jpg)

In [23]:
t = align_predict("./fasta/b_align.fasta")

print("the q3 score with multi-seq-align:",q3(t,bstru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/b_align.fasta")[0])),bstru_right))

the q3 score with multi-seq-align: 0.5911602209944752
the q3 score without multi-seq-align: 0.569060773480663


the second data set, with less similarities in amino acid sequences than first one
![](https://ww2.sinaimg.cn/large/006tNc79gy1ferjxwh9h6j31kw0bbwgo.jpg)

In [24]:
t = align_predict("./fasta/b_align1.fasta")
print("the q3 score with multi-seq-align:",q3(t,bstru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/b_align1.fasta")[0])),bstru_right))

the q3 score with multi-seq-align: 0.6077348066298343
the q3 score without multi-seq-align: 0.569060773480663


the third data set, with less similarities in amino acid sequences than second one
![](https://ww3.sinaimg.cn/large/006tNc79gy1ferjy5ka5cj31kw0r643j.jpg)

In [25]:
t = align_predict("./fasta/b_align2.fasta")
print("the q3 score with multi-seq-align:",q3(t,bstru_right))
print("the q3 score without multi-seq-align:",q3(gor3(tostr(read_fasta("./fasta/b_align2.fasta")[0])),bstru_right))

the q3 score with multi-seq-align: 0.6408839779005525
the q3 score without multi-seq-align: 0.569060773480663


 ## Conclusion: so we should carefully choose the sequence with more similar in structure but less similar in amino acid sequence to get better performance
 