In [2]:
import pandas as pd
import numpy as np

In [3]:
import json
with open('final_classification.json', 'r') as f:
    data = json.load(f)

In [4]:
def makequery(query,name="sequence"):
    with open('query.fasta', 'w') as f:
        f.write(">{}\n".format(name))
        f.write("{}".format(query))
def align_score(seq1, seq2):
    
    match = 1
    mismatch = -1
    gap = -1
    
    
    n = len(seq1)
    m = len(seq2)
    score = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        score[i][0] = i * gap
    for j in range(m + 1):
        score[0][j] = j * gap
        
    
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = score[i-1][j-1] + (match if seq1[i-1] == seq2[j-1] else mismatch)
            up = score[i-1][j] + gap
            left = score[i][j-1] + gap
            score[i][j] = max(diag, up, left)
    return score[n][m]

def needleman_wunsch(seq1, seq2, match_score=1, mismatch_score=-1, gap_penalty=-1):
    
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    score = [[0] * cols for _ in range(rows)]

    
    for i in range(rows):
        score[i][0] = i * gap_penalty
    for j in range(cols):
        score[0][j] = j * gap_penalty

    
    for i in range(1, rows):
        for j in range(1, cols):
            match = score[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_score)
            delete = score[i - 1][j] + gap_penalty
            insert = score[i][j - 1] + gap_penalty
            score[i][j] = max(match, delete, insert)
    alignment_score = score[rows - 1][cols - 1]

    return alignment_score


def match(a, b):
    if(a == b):
        return 1

    else:
        return -1
def scoring_matrix(b, a):
    z = [[0 for i in range(len(b))]
         for j in range(len(a))]  # Swap len(a) and len(b)
    for i in range(len(a)):
        for j in range(len(b)):
            if i == 0 and j == 0:
                z[i][j] = 0
            elif i == 0:
                z[i][j] = z[i][j-1] - 2
            elif j == 0:
                z[i][j] = z[i-1][j] - 2
            else:
                # Fix this line)
                z[i][j] = max((z[i-1][j] - 2, z[i][j-1] - 2,
                              z[i-1][j-1] + match(a[i], b[j])))
    return z
        

In [5]:
def printMatrix(mat):
    for i in range(len(mat)):
        for j in range(len(mat[i])):
            print(mat[i][j], end=" ")
        print()

def constructMatrix(str1, str2):
    list = [[0 for i in range(len(str1)+1)] for j in range(len(str2)+1)]
    return list

def printStringMatrix(matrix, str1, str2):
    print("", end = "  ")
    print("_", end = " ")
    for i in range(len(str1)):
        print(str1[i], end = " ")
    print()
    for i in range(len(matrix)):
        if i == 0 :
            print("_", end = " ")
        else:
            print(str2[i-1], end = " ")
        for j in range(len(matrix[i])):
            print(matrix[i][j], end = " ")
        print()

def initMatrix(matrix, gapPenalty):
    x=0
    for i in range(len(matrix)):
        matrix[i][0] = x * gapPenalty
        x=x+1
    x=0
    for i in range(len(matrix[0])):
        matrix[0][i] = x * gapPenalty
        x=x+1
    return matrix

def getMinPenalty(matrix, i, j, str1, str2,matchPenalty, mismatchPenalty, gapPenalty ):
    if str2[i-1] == str1[j-1]:
        currPenalty = matchPenalty
    else:
        currPenalty = mismatchPenalty

    top = matrix[i-1][j] + gapPenalty
    left = matrix[i][j-1] + gapPenalty
    diag = matrix[i-1][j-1] + currPenalty
    maxE = max(top, left, diag)
    currPenalty = maxE

    return currPenalty

def fillMatrix(matrix, str1, str2, matchPenalty, mismatchPenalty, gapPenalty):
    for i in range(1, len(matrix)):
        for j in range(1, len(matrix[0])):
            matrix[i][j] = getMinPenalty(matrix, i, j, str1, str2, matchPenalty, mismatchPenalty, gapPenalty)


def getPrevious(matrix, i, j,matchPenalty, mismatchPenalty, gapPenalty ):

    score = matrix[i][j]
    top = matrix[i-1][j]
    left = matrix[i][j-1]
    diag = matrix[i-1][j-1]

    maxPrevIndexI = 0
    maxPrevIndexJ = 0
    maxPrevDir = 0
    if diag + matchPenalty == score or diag + mismatchPenalty == score:
        maxPrevIndexI = i-1
        maxPrevIndexJ = j-1
        maxPrevDir = 0
    elif top + gapPenalty == score:
        maxPrevIndexI = i-1
        maxPrevIndexJ = j
        maxPrevDir = 1
    elif left + gapPenalty == score:
        maxPrevIndexI = i
        maxPrevIndexJ = j-1
        maxPrevDir = 2

    return maxPrevDir, maxPrevIndexI, maxPrevIndexJ


def backTrack(matrix, matchPenalty, mismatchPenalty, gapPenalty):

    score = 0
    directions = []

    i = len(matrix)-1
    j = len(matrix[0])-1

    while i>=0 and j>=0:
        score += matrix[i][j]
        if i == 0 or j == 0:
            break
        prev = getPrevious(matrix, i, j, matchPenalty, mismatchPenalty, gapPenalty)
        directions.insert(0, prev[0])
        i = prev[1]
        j = prev[2]
    return directions, score

def alignSequences(sequence1, sequence2, matchPenalty = 1, mismatchPenalty = -1, gapPenalty = -2):
    mat = constructMatrix(sequence1, sequence2)
    matrix = initMatrix(mat, gapPenalty)
    fillMatrix(matrix, sequence1, sequence2, matchPenalty, mismatchPenalty, gapPenalty)
    dir = backTrack(matrix, matchPenalty, mismatchPenalty, gapPenalty)
    return dir[1]
def find_score(row,name):
    str1=row['Sequences']
    str2=name
    score=0
    for i in range(len(str1)):
        if(str1[i]!=str2[i]):
            score+=1
    return score

In [6]:
name=input("Please enter the name")
query=input()
# makequery(query,name)

In [7]:
query

'AACCTGCATTGCGATACGGATAG'

In [8]:
df=pd.read_csv("merged_final24.csv")


In [9]:
df["score"]=0

In [10]:
for i in range(len(df["Sequence"])):
    score=align_score(query,df["Sequence"][i])
    df["score"][i]=score

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["score"][i]=score


In [11]:
df

Unnamed: 0,Sequence,Subpopulation,Cultivar ID,Plant Height (cm),score
0,GAAAGCCGTATGTGTACCAACCTT,Temperate Japonica,C152,100.00,0
1,GGGAGCGACAGGTATGTTACCTTT,Indica I,W146,127.40,-5
2,GGGAGCGGTAGTAATGTTACCTTT,Indica II,W150,104.60,-6
3,GGGGGCCACCGTAGTACCACCCTC,VI/Aromatic,W129,151.20,-7
4,GGGGGCGGTATGTATGTTACCTTT,Indica II,W086,103.40,-6
...,...,...,...,...,...
524,GGGAACGGTATTAATGTTACCTCT,Indica I,C159,95.80,-3
525,GGGAGCGGTATGTATGTTACCTTT,Indica Intermediate,W167,135.20,-4
526,GAGAGCCATAGGTATACCAACCTC,Japonica Intermediate,W324,156.00,0
527,GGGGACCACCGTAATGTTTCCTTC,Indica Intermediate,C198,173.77,-7


In [12]:
sorted_df = df.sort_values(by='score')
sorted_df = df.sort_values(by='score', ascending=False)
sorted_df

Unnamed: 0,Sequence,Subpopulation,Cultivar ID,Plant Height (cm),score
66,AAGGGCCATAGGTATACCAATCTC,Japonica Intermediate,W122,169.80,2
106,AAGGGCCATAGGAGTACCACTCTC,Temperate Japonica,C074,143.65,1
246,GAAAGCCATAGGTATACCAATCTC,Japonica Intermediate,W021,142.40,1
175,GAAAGCCATAGGTATACCAATCTC,Japonica Intermediate,C185,145.00,1
267,AAGAGCCATAGGAGTACTACTCTC,Temperate Japonica,C138,190.90,1
...,...,...,...,...,...
8,GGGGGCCACCGTAATGTCTCCCCC,VI/Aromatic,W077,182.40,-10
423,GGGGGCCACCGTAATGTCTCCCCC,VI/Aromatic,W220,172.00,-10
258,GGGGGCCACAGTAATGTCTCCCCC,Aus,W277,142.00,-10
41,GGGGGCCACCGTAATGTCTCCCCC,Aus,W330,149.60,-10


In [13]:
df=df.head(5)
df

Unnamed: 0,Sequence,Subpopulation,Cultivar ID,Plant Height (cm),score
0,GAAAGCCGTATGTGTACCAACCTT,Temperate Japonica,C152,100.0,0
1,GGGAGCGACAGGTATGTTACCTTT,Indica I,W146,127.4,-5
2,GGGAGCGGTAGTAATGTTACCTTT,Indica II,W150,104.6,-6
3,GGGGGCCACCGTAGTACCACCCTC,VI/Aromatic,W129,151.2,-7
4,GGGGGCGGTATGTATGTTACCTTT,Indica II,W086,103.4,-6


In [14]:

with open('sequence_cluster_dict.json', 'r') as f:
    data1 = json.load(f)
data1

{'C051': 0,
 'C119': 0,
 'W158': 0,
 'C014': 1,
 'C152': 1,
 'C029': 1,
 'C010': 2,
 'W062': 3,
 'W041': 3,
 'W294': 4,
 'W118': 5,
 'W225': 5,
 'W128': 6,
 'W119': 7,
 'W193': 8,
 'W249': 9,
 'W250': 9,
 'W060': 9,
 'W063': 10,
 'W010': 10,
 'W061': 10,
 'W226': 10,
 'C035': 11,
 'C057': 12,
 'C177': 12,
 'C017': 12,
 'W045': 13,
 'C023': 13,
 'W323': 14,
 'W003': 14,
 'C186': 15,
 'C185': 16,
 'W021': 16,
 'W127': 17,
 'W326': 17,
 'W324': 17,
 'W328': 18,
 'W013': 18,
 'W162': 19,
 'C150': 20,
 'W117': 20,
 'C146': 20,
 'W016': 20,
 'W283': 20,
 'W130': 20,
 'C018': 20,
 'C028': 20,
 'C116': 20,
 'W088': 20,
 'W305': 20,
 'W012': 20,
 'C026': 20,
 'C154': 20,
 'C052': 20,
 'C084': 21,
 'C085': 21,
 'C056': 22,
 'C016': 22,
 'W019': 23,
 'W018': 24,
 'W139': 24,
 'W064': 24,
 'C070': 25,
 'W011': 25,
 'C063': 26,
 'C172': 27,
 'C149': 27,
 'C120': 27,
 'C144': 28,
 'C134': 28,
 'C083': 29,
 'W319': 30,
 'C079': 31,
 'C133': 31,
 'C103': 32,
 'C196': 32,
 'C101': 33,
 'C106': 33,
 'W2

In [15]:
df["Cluster"]=0
for i in range(len(df["Sequence"])):
    df["Cluster"][i]=data1[df["Cultivar ID"][i]]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Cluster"]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Cluster"][i]=data1[df["Cultivar ID"][i]]


Unnamed: 0,Sequence,Subpopulation,Cultivar ID,Plant Height (cm),score,Cluster
0,GAAAGCCGTATGTGTACCAACCTT,Temperate Japonica,C152,100.0,0,1
1,GGGAGCGACAGGTATGTTACCTTT,Indica I,W146,127.4,-5,163
2,GGGAGCGGTAGTAATGTTACCTTT,Indica II,W150,104.6,-6,111
3,GGGGGCCACCGTAGTACCACCCTC,VI/Aromatic,W129,151.2,-7,127
4,GGGGGCGGTATGTATGTTACCTTT,Indica II,W086,103.4,-6,90


In [16]:
x=dict()
for i in set(df["Subpopulation"]):
    x[i]=list(df["Subpopulation"]).count(i)
x = dict(sorted(x.items(), key=lambda item: item[1], reverse=True))
x=list(x.keys())
x

['Indica II', 'Temperate Japonica', 'Indica I', 'VI/Aromatic']

In [17]:
# y=dict()
# for i in set(df["Cluster"]):
#     y[i]=(list(df["Cluster"]).count(i),df["score"][i])
# y = dict(sorted(x.items(), key=lambda item: (item[1][0], item[1][1])))
# y=list(y.keys())
# y

y = dict()
for i in set(df["Cluster"]):
    if i in df["score"]:
        y[i] = (list(df["Cluster"]).count(i), df["score"][i])
    else:
        y[i] = (list(df["Cluster"]).count(i), 0)
y = dict(sorted(y.items(), key=lambda item: (item[1][0], item[1][1])))
y = list(y.keys())
y=y[0]
y

1

In [27]:
with open('clusters_dict.json', 'r') as f:
    data2 = json.load(f)

In [25]:
with open('final_classification.json', 'r') as f:
    data3= json.load(f)

In [33]:
z=data2[str(y)][0]
pp=list(data3[z]['Subpopulation'].keys())
pp


['Japonica Intermediate', 'Temperate Japonica']

In [37]:
for i in x:
    if(i in pp):
        x=i
x

'Temperate Japonica'

In [38]:
data3[z]["Subpopulation"][x]

'100.0-100.0'

In [30]:
z=data2[str(y)][0]
z
data[z]["Subpopulation"][str(x)]

KeyError: 'Indica II'

In [64]:
maxscore=sorted(set(df["score"]))[-1]
df=df[df["score"]==maxscore]
df

Unnamed: 0,Sequence,Subpopulation,Cultivar ID,Plant Height (cm),score
0,GAAAGCCGTATGTGTACCAACCTT,Temperate Japonica,C152,100.00,-8
28,GAAAGCCATAGGTGTACCAACCTT,Temperate Japonica,W305,74.00,-8
35,GAAAGCCATAGGTGTACCAACCTT,Temperate Japonica,C018,78.27,-8
38,GAAAGCCATAGGTGTGCCAACCTT,Japonica Intermediate,W061,121.80,-8
52,GAAAGCCATAGGTGTACCAACCTT,Temperate Japonica,C150,90.32,-8
...,...,...,...,...,...
502,GAAAGCCATAGGAGTACCAACCTT,Temperate Japonica,W162,98.40,-8
504,GAAAGCCATAGGTGTGTTACTCTT,Temperate Japonica,W207,114.60,-8
509,GAAAGCCATAGGTGTGTTAACCTT,Temperate Japonica,C048,97.33,-8
519,GAAAGCCATAGGTGAACCAACCCT,Temperate Japonica,C149,102.47,-8


In [65]:
matchseq=list(df["Cultivar ID"])[0]
matchseq

'C152'

In [1]:
data["C138"]["height_range"]

NameError: name 'data' is not defined