In [1]:
import pandas as pd
from polyleven import levenshtein
from multiprocessing import Pool
from functools import partial
import os
import pickle
import sys

In [2]:
def compute_sample_distances(i, seqs, dist_function, max_dist=2):
    """Find every later sequence within ``max_dist`` edits of ``seqs[i]``.

    Only partners ``j > i`` are scanned, so when this is run for every ``i``
    each unordered pair is examined exactly once.

    Parameters
    ----------
    i : int
        Position in ``seqs`` of the reference sequence.
    seqs : sequence
        Sequences to compare; must support ``len()`` and integer indexing.
    dist_function : callable
        Called as ``dist_function(a, b, k)``. It is expected to follow the
        bounded-distance convention of ``polyleven.levenshtein``: the exact
        distance when it is ``<= k``, and ``k + 1`` otherwise.
    max_dist : int, optional
        Largest distance that still counts as a hit (default 2, i.e. single
        and double mutants).

    Returns
    -------
    list of frozenset
        One ``frozenset({i, j})`` per hit, where ``i`` and ``j`` are positions
        in ``seqs``. Pairs at distance 0 (identical sequences) are not reported.
    """
    reference = seqs[i]
    results = []
    for j in range(i + 1, len(seqs)):
        # The bound given to dist_function must equal the largest distance we
        # accept. A smaller bound (previously a hard-coded 1) makes a bounded
        # distance report "bound + 1" for *every* farther pair, so all pairs
        # at distance >= 2 were indistinguishable from true double mutants.
        edit_dist = dist_function(reference, seqs[j], max_dist)
        if 1 <= edit_dist <= max_dist:
            results.append(frozenset([i, j]))
    return results
            
def compute_dists(dist_function, seqs=None, processes=80):
    """Run ``compute_sample_distances`` for every sequence in a process pool.

    Parameters
    ----------
    dist_function : callable
        Distance function forwarded to ``compute_sample_distances``.
    seqs : sequence, optional
        Sequences to compare. When omitted, the notebook-level ``seqs``
        variable is used, which is how the original one-argument call worked.
        Passing it explicitly removes the dependency on cell execution order.
    processes : int, optional
        Number of worker processes (default 80, the previously hard-coded
        value).

    Returns
    -------
    list of list of frozenset
        Entry ``i`` is the result of ``compute_sample_distances`` for
        position ``i`` of ``seqs``.

    Raises
    ------
    NameError
        If ``seqs`` is omitted and no notebook-level ``seqs`` exists.
    """
    if seqs is None:
        # Backward-compatible path: fall back to the module/notebook global,
        # keeping the NameError a bare global lookup would have raised.
        try:
            seqs = globals()['seqs']
        except KeyError:
            raise NameError("name 'seqs' is not defined") from None
    partial_sample_dists = partial(compute_sample_distances, dist_function=dist_function, seqs=seqs)
    with Pool(processes) as p:
        # One task per reference position; map() preserves input order.
        results_all = p.map(partial_sample_dists, range(len(seqs)))
    return results_all

In [3]:
# Notebook parameters.
# CSV file that is loaded into `df`.
input_file = 'dataset.csv'
# Pickle file that receives the computed pairs.
output_file = 'single_and_double_mutants.pkl'
# Position, within the sorted distinct values of the `lk` column, of the group to process.
lk_index = 0

In [4]:
# Load the dataset, using the first CSV column as the row index.
df = pd.read_csv(input_file, index_col=0)

In [5]:
# Display the loaded frame as a quick sanity check (the repr is truncated).
df

Unnamed: 0,count,seq,lk
0,17596,AGACATGTTTTTTTAGTATGTTGT,0
1,16261,AAACATGTTTTTTTAGTATGTTGT,0
2,11069,AAACATGTTTTTTAAGTATGTTGT,5
3,10477,AGACATGTTATTTTAGTATGTTGT,0
4,10192,AAACATGTTTTTATAGTATGTTGT,0
...,...,...,...
7503788,1,AAAAACACGAAGAACAAGTACTTT,2630
7503789,1,AAAAACACGAAGAACAAGCATTTC,3009
7503790,1,AAAAACACGAAGAACAAATGTTTC,7354
7503791,1,AAAAACACGAAGAACAAACGTTTT,2630


In [6]:
# Distinct values of the `lk` column in ascending order, then the single value
# selected by `lk_index`.
# NOTE(review): `lk` presumably abbreviates "local king" (going by the variable
# name); its domain meaning is not visible in this notebook — confirm.
local_king_indexes = sorted(df['lk'].unique().tolist())
lk = local_king_indexes[lk_index]

In [7]:
# Keep only the rows that belong to the selected `lk` group.
# Debugging aid: appending `.head(5000)` to the selection gives a small subsample
# for a quick trial; it must NOT be present in the real run.
df_lk = df[df['lk'] == lk]
# Sequences of the group as an array; positions in this array are what the
# pairwise search reports.
seqs = df_lk['seq'].values
# Index labels of the same rows, in the same order as `seqs`.
ids = df_lk.index

In [None]:
# Pairwise search over the notebook-level `seqs` (compute_dists picks it up
# implicitly, so the cell defining `seqs` must have been run first).
# The result has one entry per sequence: a list of frozenset({i, j}) position pairs.
single_double_mutants = compute_dists(levenshtein)

In [None]:
# Persist the result. The stored pairs are positions into `seqs`, not DataFrame
# index labels; `ids` is not saved alongside them.
with open(output_file, 'wb') as f:
    pickle.dump(single_double_mutants, f)