In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

import torch.nn.init as init

from torch.autograd import Variable
import itertools
import matplotlib.pyplot as plt
import pickle
device = 'cpu' #if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
g = torch.Generator().manual_seed(21456789)

Using device: cpu


In [3]:
df_site =  pd.read_csv('SITE-Seq_offTarget_wholeDataset.csv') # change the path to run on colab
df_klein = pd.read_csv('Kleinstiver_5gRNA_wholeDataset.csv')
df_circle = pd.read_excel('CIRCLE-seq_6gRNA_active_offTargets.xlsx')

df_site = df_site[213966:].reset_index()
df_klein = df_klein[95777:].reset_index()

df = pd.DataFrame()
df['TargetSequence'] = list(df_site['on_seq']) + list(df_klein['sgRNA_seq']) + list(df_circle['TargetSequence'])
df['Off-target Sequence'] = list(df_site['off_seq']) + list(df_klein['off_seq']) + list(df_circle['Off-target Sequence'])
df_test = pd.DataFrame()
df_test = df[df['TargetSequence'].isin(['GGCGGCTGCACAACCAGTGGNGG','GCATACAGTGATTTGATGAANGG '])]
df =df[~df['TargetSequence'].isin(['GGCGGCTGCACAACCAGTGGNGG','GCATACAGTGATTTGATGAANGG'])]

In [4]:
df['TargetSequence'].unique()

array(['GGGGCCACTAGGGACAGGATNGG', 'GGGTGGGGGGAGTTTGCTCCNGG',
       'GCAAAACTCAACCCTACCCCNGG', 'CTCGTCTGATAAGACAACAGNGG',
       'GGAATCCCTTCTGCAGCACCNGG', 'ATAGGAGAAGATGATGTATANGG',
       'GCTGATGTAGTCACTCTTGANGG', 'GGTGGACAAGCGGCAGATAGNGG',
       'GTCACCTCCAATGACTAGGGNGG', 'GCTGCAGAAGGGATTCCATGNGG',
       'GCATTTTCAGGAGGAAGCGANGG', 'GTGCGGCAAGAGCTTCAGCCNGG',
       'GGGAAAGACCCAGCATCCGTNGG', 'GAACACAAAGCATAGACTGCNGG',
       'GGCCCAGACTGAGCACGTGANGG', 'GGCACTGCGGCTGGAGGTGGNGG',
       'GAGTCCGAGCAGAAGAAGAANGG', 'GTTGCCCCACAGGGCAGTAANGG',
       'GTCATCTTAGTCATTACCTGNGG', 'GACCCCCTCCACCCCGCCTCNGG',
       'GGTGAGTGAGTGTGTGCGTGNGG'], dtype=object)

In [5]:
base_to_index = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4, '-': 5}

In [6]:
df = df[df['Off-target Sequence'].apply(len) == 23]
base_to_index = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4, '-': 5}
vocab_size = len(base_to_index)

In [7]:
df['TargetSequence'] = df['TargetSequence'].apply(lambda seq: [base_to_index[base] for base in seq])
df['Off-target Sequence'] = df['Off-target Sequence'].apply(lambda seq: [base_to_index[base] for base in seq])

In [8]:
y = np.array(df['Off-target Sequence'].tolist())
x = np.array(df['TargetSequence'].tolist())

In [9]:
x = torch.tensor(x,device=device)
y = torch.tensor(y,device=device)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.001, random_state=42)

In [None]:
# Pytorch for the same model 
# Model is trained in 3 phases, phase 1 invloves fitting the model on all of the base pairs in the input sequence, once that a windowed appraoch is applied i.e, the input
# sequencce is essentially passed through the same model but loss 

# define embedding dim of 35 
E_M = torch.randn((6,35), generator=g).to(device) 
D5 = torch.randn((23*35,1028), generator=g).to(device) #* (1.414) /((23*35)**0.5)
B7 = torch.randn(1028, generator=g).to(device)* 0.01
D1 = torch.randn((1028,512), generator=g).to(device)#* (1.414) /((1028)**0.5)
B1 = torch.randn(512, generator=g).to(device)* 0.01
D6 = torch.randn((512,512), generator=g).to(device)#* (1.414) /((512)**0.5)
B6 = torch.randn(512, generator=g).to(device)* 0.01
D2 = torch.randn((512,256), generator=g).to(device)#* (1.414) /((512)**0.5)
B2 = torch.randn(256, generator=g).to(device)* 0.01
D3 = torch.randn((256,128), generator=g).to(device)#* (1.414) /((256)**0.5)
B3 = torch.randn(128, generator=g).to(device)* 0.01
D4 = torch.randn((128,64), generator=g).to(device)#* (1.414) /((128)**0.5)
B4 = torch.randn(64, generator=g).to(device)#* 0.01
OUT = torch.rand((64,6*23), generator=g).to(device)#* (1.414) /((64)**0.5)
B5 = torch.randn(6 * 23, generator=g).to(device) * 0.01
params = [E_M,D1,D2,D3,D4,B1,B2,B3,B4,D5,B5,OUT,D6,B6]
weights = [D5,D1,D2,D3,D4,OUT]
init.xavier_uniform_(D1)
init.xavier_uniform_(D2)
init.xavier_uniform_(D3)
init.xavier_uniform_(D4)
init.xavier_uniform_(OUT)

# Initialize biases to zero
B1.data.fill_(0)
B2.data.fill_(0)
B3.data.fill_(0)
B4.data.fill_(0)
B5.data.fill_(0)
multihead_attn1 = torch.nn.MultiheadAttention(embed_dim=35, num_heads=5).to(device)
multihead_attn2 = torch.nn.MultiheadAttention(embed_dim=35, num_heads=5).to(device)
multihead_attn3 = torch.nn.MultiheadAttention(embed_dim=35, num_heads=5).to(device)


print(f"Numner of trainable params  = {sum(p.nelement() for p in params)}")

Numner of trainable params  = 1798704


In [12]:
multihead_attn1 = torch.nn.MultiheadAttention(embed_dim=35, num_heads=5).to(device)
multihead_attn2 = torch.nn.MultiheadAttention(embed_dim=35, num_heads=5).to(device)
multihead_attn3 = torch.nn.MultiheadAttention(embed_dim=35, num_heads=5).to(device)

In [None]:
""" To load a pretrained model"""

with open('fuck_you.pkl', 'rb') as f:
    E_M,D1,D2,D3,D4,B1,B2,B3,B4,D5,B5,OUT,D6,B6 = pickle.load(f)

In [None]:
for p in params:
    p.requires_grad = True

optimizer = torch.optim.Adam(params=params,weight_decay=1e-3, lr=0.001)    

In [None]:
""" Train  to capture the pattern in Off_target"""

for _ in range(300):

    optimizer.zero_grad()
    
    ix = torch.randint(0,len(X_train),(16,)).to(device)
    in_seq = E_M[x[ix]].to(device)
    
    attn_output1= multihead_attn1.forward(in_seq, in_seq, in_seq,need_weights=False)[0]
    attn_output1.retain_grad()
    res_out_normalised1 = torch.nn.functional.normalize((attn_output1 + in_seq),dim=1)

    attn_output2= multihead_attn2.forward(res_out_normalised1, res_out_normalised1, res_out_normalised1,need_weights=False)[0]
    attn_output2.retain_grad()
    res_out_normalised2 = torch.nn.functional.normalize((attn_output2 + res_out_normalised1),dim=1)

    attn_output3= multihead_attn3.forward(res_out_normalised2, res_out_normalised2, res_out_normalised2,need_weights=False)[0]
    attn_output3.retain_grad()
    res_out_normalised3 = torch.nn.functional.normalize((attn_output3 + res_out_normalised2),dim=1)
    
    D0_out = torch.relu(res_out_normalised3.view(len(in_seq),23*35) @ D5)
    
    D1_out = torch.relu(D0_out @ D1 + B1)
    
    D6_out  = torch.relu(D1_out @ D6 + B6)
    # D1_out = torch.relu(D6_out @ D1 + B1)

    D2_out = torch.relu(D6_out @ D2 + B2)

    D3_out = torch.relu(D2_out @ D3 + B3)

    D4_out = torch.relu(D3_out @ D4 + B4)

    OUT_OUT = torch.relu(D4_out @ OUT + B5)

    loss = 0
    for i, ix_i in zip(range(16),ix):
        # loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(y[ix_i]).type(torch.LongTensor).cuda()).to(device)
        loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(y[ix_i]).type(torch.LongTensor)).to(device)
    loss = loss/16
    wt_loss  = 0
    for w  in weights:
        wt_loss+= 0.001*(w**2).mean()
        
    loss += (wt_loss/len(weights))
    print(f"Epoc = {_}, Loss = {loss}")

    loss.backward(retain_graph=True)

    optimizer.step()

  loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(y[ix_i]).type(torch.LongTensor)).to(device)


Epoc = 0, Loss = 21.11644172668457
Epoc = 1, Loss = 11.182271003723145
Epoc = 2, Loss = 7.477406978607178
Epoc = 3, Loss = 6.500844955444336
Epoc = 4, Loss = 5.352558135986328
Epoc = 5, Loss = 4.381619453430176
Epoc = 6, Loss = 3.4195780754089355
Epoc = 7, Loss = 2.61513614654541
Epoc = 8, Loss = 2.5952553749084473
Epoc = 9, Loss = 2.5502679347991943
Epoc = 10, Loss = 2.808638572692871
Epoc = 11, Loss = 2.530743360519409
Epoc = 12, Loss = 2.3258755207061768
Epoc = 13, Loss = 2.15358829498291
Epoc = 14, Loss = 1.979556918144226
Epoc = 15, Loss = 1.9859719276428223
Epoc = 16, Loss = 1.890526294708252
Epoc = 17, Loss = 1.9231268167495728
Epoc = 18, Loss = 1.849387288093567
Epoc = 19, Loss = 1.7665702104568481
Epoc = 20, Loss = 1.7233998775482178
Epoc = 21, Loss = 1.781467318534851
Epoc = 22, Loss = 1.7819347381591797
Epoc = 23, Loss = 1.8414045572280884
Epoc = 24, Loss = 1.7923998832702637
Epoc = 25, Loss = 1.7236906290054321
Epoc = 26, Loss = 1.736307144165039
Epoc = 27, Loss = 1.7515237

In [None]:
""" Windowed Approach on Off-Target"""

for k in range(5,2,-1):
    for j in range(5,19-k):
        for _ in range(200):

            optimizer.zero_grad()
            
            ix = torch.randint(0,len(X_train),(16,)).to(device)
            in_seq = E_M[x[ix]]  
            attn_output1 = multihead_attn1.forward(in_seq, in_seq, in_seq,need_weights=False)[0]
            attn_output1.retain_grad()
            res_out_normalised1 = torch.nn.functional.normalize((attn_output1 + in_seq),dim=1)

            attn_output2 = multihead_attn2.forward(res_out_normalised1, res_out_normalised1, res_out_normalised1,need_weights=False)[0]
            attn_output2.retain_grad()
            res_out_normalised2 = torch.nn.functional.normalize((attn_output2 + res_out_normalised1),dim=1)

            attn_output3 = multihead_attn3.forward(res_out_normalised2, res_out_normalised2, res_out_normalised2,need_weights=False)[0]
            attn_output3.retain_grad()
            res_out_normalised3 = torch.nn.functional.normalize((attn_output3 + res_out_normalised2),dim=1)
            
            D0_out = torch.relu(res_out_normalised3.view(len(in_seq),23*35) @ D5)
            
            D1_out = torch.relu(D0_out @ D1 + B1)
            
            D6_out  = torch.relu(D1_out @ D6 + B6)
        
            D2_out = torch.relu(D6_out @ D2 + B2)

            D3_out = torch.relu(D2_out @ D3 + B3)

            D4_out = torch.relu(D3_out @ D4 + B4)

            OUT_OUT = torch.relu(D4_out @ OUT + B5)

            loss = 0
            for i, ix_i in zip(range(16),ix):
                #loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i][j:j+k],torch.tensor(y[ix_i][j:j+k]).type(torch.LongTensor).cuda()).to(device)
                loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i][j:j+k],torch.tensor(y[ix_i][j:j+k]).type(torch.LongTensor)).to(device)
            loss = loss/16
            wt_loss  = 0
            for w  in weights:
                wt_loss+= 0.001*(w**2).mean()
                
            loss += (wt_loss/len(weights))
            print(f"Epoc = {_}, Loss = {loss}")

            loss.backward(retain_graph=True)


            optimizer.step()       

  loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i][j:j+k],torch.tensor(y[ix_i][j:j+k]).type(torch.LongTensor)).to(device)


Epoc = 0, Loss = 1.284532904624939
Epoc = 1, Loss = 1.2636581659317017
Epoc = 2, Loss = 1.2400529384613037
Epoc = 3, Loss = 1.1459851264953613
Epoc = 4, Loss = 1.0776662826538086
Epoc = 5, Loss = 1.0895296335220337
Epoc = 6, Loss = 1.177463173866272
Epoc = 7, Loss = 0.8122770190238953
Epoc = 8, Loss = 1.0265328884124756
Epoc = 9, Loss = 1.1309641599655151
Epoc = 10, Loss = 0.91237872838974
Epoc = 11, Loss = 1.2550652027130127
Epoc = 12, Loss = 1.027401328086853
Epoc = 13, Loss = 0.9937960505485535
Epoc = 14, Loss = 1.322831153869629
Epoc = 15, Loss = 1.2674014568328857
Epoc = 16, Loss = 1.2631689310073853
Epoc = 17, Loss = 1.191552758216858
Epoc = 18, Loss = 1.133197546005249
Epoc = 19, Loss = 1.272927165031433
Epoc = 20, Loss = 1.0299770832061768
Epoc = 21, Loss = 0.9762030243873596
Epoc = 22, Loss = 1.2669144868850708
Epoc = 23, Loss = 0.9863195419311523
Epoc = 24, Loss = 0.9856951236724854
Epoc = 25, Loss = 1.0571390390396118
Epoc = 26, Loss = 1.0078426599502563
Epoc = 27, Loss = 0.

In [None]:
""" Train  to capture the pattern in Off_target again"""

for _ in range(2):

    optimizer.zero_grad()
    
    ix = torch.randint(0,len(X_train),(16,)).to(device)
    in_seq = E_M[x[ix]].to(device)
    
    attn_output1= multihead_attn1.forward(in_seq, in_seq, in_seq,need_weights=False)[0]
    attn_output1.retain_grad()
    res_out_normalised1 = torch.nn.functional.normalize((attn_output1 + in_seq),dim=1)

    attn_output2= multihead_attn2.forward(res_out_normalised1, res_out_normalised1, res_out_normalised1,need_weights=False)[0]
    attn_output2.retain_grad()
    res_out_normalised2 = torch.nn.functional.normalize((attn_output2 + res_out_normalised1),dim=1)

    attn_output3= multihead_attn3.forward(res_out_normalised2, res_out_normalised2, res_out_normalised2,need_weights=False)[0]
    attn_output3.retain_grad()
    res_out_normalised3 = torch.nn.functional.normalize((attn_output3 + res_out_normalised2),dim=1)
    
    D0_out = torch.relu(res_out_normalised3.view(len(in_seq),23*35) @ D5)
    
    D1_out = torch.relu(D0_out @ D1 + B1)
    
    D6_out  = torch.relu(D1_out @ D6 + B6)

    D2_out = torch.relu(D6_out @ D2 + B2)

    D3_out = torch.relu(D2_out @ D3 + B3)

    D4_out = torch.relu(D3_out @ D4 + B4)

    OUT_OUT = torch.relu(D4_out @ OUT + B5)

    loss = 0
    for i, ix_i in zip(range(16),ix):
        # loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(y[ix_i]).type(torch.LongTensor).cuda()).to(device)
        loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(y[ix_i]).type(torch.LongTensor)).to(device)
    loss = loss/16
    wt_loss  = 0
    for w  in weights:
        wt_loss+= 0.001*(w**2).mean()
        
    loss += (wt_loss/len(weights))
    print(f"Epoc = {_}, Loss = {loss}")

    loss.backward(retain_graph=True)

    optimizer.step()


Epoc = 0, Loss = 0.7068549990653992
Epoc = 1, Loss = 0.7875596284866333


  loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(y[ix_i]).type(torch.LongTensor)).to(device)


In [None]:
""" Train to capture the pattern of On-Target Sequence"""

for _ in range(10):

    optimizer.zero_grad()

    ix = torch.randint(0,len(X_train),(16,)).to(device) # 16 is batch size
    in_seq = E_M[x[ix]].to(device)
   
    attn_output1= multihead_attn1.forward(in_seq, in_seq, in_seq,need_weights=False)[0]
    attn_output1.retain_grad()
    res_out_normalised1 = torch.nn.functional.normalize((attn_output1 + in_seq),dim=1)

    attn_output2 = multihead_attn2.forward(res_out_normalised1, res_out_normalised1, res_out_normalised1,need_weights=False)[0]
    attn_output2.retain_grad()
    res_out_normalised2 = torch.nn.functional.normalize((attn_output2 + res_out_normalised1),dim=1)

    attn_output3 = multihead_attn3.forward(res_out_normalised2, res_out_normalised2, res_out_normalised2,need_weights=False)[0]
    attn_output3.retain_grad()
    res_out_normalised3 = torch.nn.functional.normalize((attn_output3 + res_out_normalised2),dim=1)
    
    D0_out = torch.relu(res_out_normalised3.view(len(in_seq),23*35) @ D5)
    D1_out = torch.relu(D0_out @ D1 + B1)
    
    D6_out  = torch.relu(D1_out @ D6 + B6)
    # D1_out = torch.relu(D6_out @ D1 + B1)
   
    D2_out = torch.relu(D6_out @ D2 + B2)

    D3_out = torch.relu(D2_out @ D3 + B3)

    D4_out = torch.relu(D3_out @ D4 + B4)

    OUT_OUT = torch.relu(D4_out @ OUT + B5)
    loss = 0
    for i, ix_i in zip(range(16),ix):
        # loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(x[ix_i]).to(device).type(torch.LongTensor).cuda()).to(device)
        loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(x[ix_i]).to(device).type(torch.LongTensor)).to(device)
    loss = loss/16
    wt_loss  = 0
    for w  in weights:
        wt_loss+= 0.001*(w**2).mean()
        
    loss += (wt_loss/len(weights))
    print(f"Epoc = {_}, Loss = {loss}")
    loss.backward(retain_graph=True)
    optimizer.step()



  loss += torch.nn.functional.cross_entropy(OUT_OUT.view(16,23,6)[i],torch.tensor(x[ix_i]).to(device).type(torch.LongTensor)).to(device)


Epoc = 0, Loss = 0.42209097743034363
Epoc = 1, Loss = 0.40243422985076904
Epoc = 2, Loss = 0.2730816602706909
Epoc = 3, Loss = 0.3076552450656891
Epoc = 4, Loss = 0.2245490401983261
Epoc = 5, Loss = 0.3670024871826172
Epoc = 6, Loss = 0.34425264596939087
Epoc = 7, Loss = 0.27415573596954346
Epoc = 8, Loss = 0.31881219148635864
Epoc = 9, Loss = 0.25056540966033936


In [None]:
#predict
index_to_base = {0:'A', 1:'T', 2:'G', 3:'C', 4:'N', 5:'-'}
@torch.no_grad()
def predict(to_predict_seq):
    in_seq = E_M[to_predict_seq].view(1,23,35).to(device)
    attn_output1 = multihead_attn1.forward(in_seq, in_seq, in_seq,need_weights=False)[0]
    res_out_normalised1 = torch.nn.functional.normalize((attn_output1 + in_seq),dim=1)

    attn_output2 = multihead_attn2.forward(res_out_normalised1, res_out_normalised1, res_out_normalised1,need_weights=False)[0]
    res_out_normalised2 = torch.nn.functional.normalize((attn_output2 + res_out_normalised1),dim=1)

    attn_output3 = multihead_attn3.forward(res_out_normalised2, res_out_normalised2, res_out_normalised2,need_weights=False)[0]
    res_out_normalised3 = torch.nn.functional.normalize((attn_output3 + res_out_normalised2),dim=1)
    
    D0_out = torch.relu(res_out_normalised3.view(len(in_seq),23*35) @ D5)
    D1_out = torch.relu(D0_out @ D1 + B1)
    D6_out  = torch.relu(D1_out @ D6 + B6)
    D2_out = torch.relu(D6_out @ D2 + B2)
    D3_out = torch.relu(D2_out @ D3 + B3)
    D4_out = torch.relu(D3_out @ D4 + B4)
    OUT_OUT = torch.relu(D4_out @ OUT + B5) + 0.00001
    # print(OUT_OUT.view(23,6))
    return OUT_OUT.view(23,6)

In [None]:
g1 = torch.Generator(device).manual_seed(21456789)

In [None]:
def get_off_targets(on_target_embedded, number_of_predictions):
    to_predict = on_target_embedded
    predicted_3 = predict(to_predict)
    # print(predicted_3[14])
    # for i in range(0,23):
    #     predicted_3[0][i] = predicted_3[0][i]/predicted_3[0][i].sum(axis=0,keepdims=1)
    # predicted_3 = torch.squeeze(torch.tensor(predicted_3))
    probs = predicted_3 / predicted_3.sum(1,keepdim = True)
    predicted_3 = probs
    # predicted_3 = predicted_3.reshape(23,6)
    # print(predicted_3)
    final_indices = []
    for i in range(23):
        final_indices.append(torch.multinomial(predicted_3[i].to(device), num_samples=number_of_predictions, replacement=True, generator=g1))

    sequences_in_indices = []
    index_i = 0
    for _ in range(len(final_indices[0])):
        to_append_sequence = []
        for i in range(23):
            to_append_sequence.append(final_indices[i][index_i].item())
            if i == 22:
                sequences_in_indices.append(to_append_sequence)
                to_append_sequence = []
                index_i += 1    

    return sequences_in_indices 


In [None]:
""" Predicting Outside the Dataset """
to_predict_one = [base_to_index[i]for i in 'GCATACAGTGATTTGATGAANGG']
num_off = 60000
preds = get_off_targets(to_predict_one,num_off)
dist = []
mismatch_at_indices = []

on_target = ''.join([index_to_base[i] for i in to_predict_one])
to_test = torch.tensor(to_predict_one)
for i in preds:
    dist.append(len(to_test[(to_test) != torch.tensor(i)]))

df_test = pd.DataFrame()
df_test['ontarget'] =  list(itertools.chain(*([[on_target]] * num_off))) 
df_test['offtarget'] = [''.join([index_to_base[j]for j in i]) for i in preds]
df_test['Distance']  = dist
df_test['Distance'].value_counts()   

In [None]:
"""Predicting from the Dataset"""

#df =df[~df['TargetSequence'].isin(['GGCGGCTGCACAACCAGTGGNGG','GCATACAGTGATTTGATGAANGG'])]
to_predict_one = [base_to_index[i]for i in 'GGTGGACAAGCGGCAGATAGNGG']
num_off = 60000
preds = get_off_targets(to_predict_one,num_off)
dist = []
mismatch_at_indices = []

on_target = ''.join([index_to_base[i] for i in to_predict_one])
to_test = torch.tensor(to_predict_one)
for i in preds:
    dist.append(len(to_test[(to_test) != torch.tensor(i)]))

df_test = pd.DataFrame()       
df_test['ontarget'] =  list(itertools.chain(*([[on_target]] * num_off))) 
df_test['offtarget'] = [''.join([index_to_base[j]for j in i]) for i in preds]
df_test['Distance']  = dist
df_test['Distance'].value_counts()                                             