In [9]:
# Inputs:
# CSV loaded with pd.read_csv further down; its first column is used as the
# index and it must contain a 'seq' column.
dataset_file = 'dataset.csv'

# Outputs:
# CSV that receives the pairwise distance matrix (written with DataFrame.to_csv).
distance_matrix_file = 'distance_matrix.csv'

# Params:
# Maximum number of rows read from dataset_file. The distance matrix has up to
# nrows x nrows entries, so memory and compute grow quadratically with this value.
nrows = 10000

In [10]:
import pandas as pd
import numpy as np
from polyleven import levenshtein
from multiprocessing import Pool
from functools import partial

In [11]:
def compute_sample_distances(i, seqs, dist_function, max_dist=1):
    """Compute distances from sequence ``i`` to every later sequence.

    Only pairs ``(i, j)`` with ``j > i`` are evaluated, so calling this once
    per index covers each unordered pair exactly once.

    Args:
        i: Index into ``seqs`` of the reference sequence.
        seqs: Indexable, sized collection of sequences.
        dist_function: Callable invoked as ``dist_function(a, b, max_dist)``.
        max_dist: Value forwarded as the third positional argument of
            ``dist_function``. Defaults to 1, the value that used to be
            hard-coded here.
            NOTE(review): with polyleven's ``levenshtein`` this is presumably
            the cutoff ``k``, with larger distances reported as ``k + 1`` --
            confirm that capping the results this way is intended.

    Returns:
        A list of ``(i, j, distance)`` tuples for every ``j`` from ``i + 1``
        to ``len(seqs) - 1``; empty when there is no later sequence.
    """
    return [
        (i, j, dist_function(seqs[i], seqs[j], max_dist))
        for j in range(i + 1, len(seqs))
    ]
            
def compute_dists(dist_function, sequences=None, processes=2):
    """Build the symmetric pairwise distance matrix for a set of sequences.

    The work is split by row: each worker process evaluates
    ``compute_sample_distances`` for one reference index, and the resulting
    ``(i, j, distance)`` triples are mirrored into a square matrix.

    Args:
        dist_function: Distance callable forwarded to
            ``compute_sample_distances``; it must be picklable because it is
            sent to worker processes.
        sequences: Sequences to compare. When ``None`` (the default) the
            notebook-level ``seqs`` variable is used, which keeps existing
            ``compute_dists(dist_function)`` calls working.
        processes: Number of worker processes in the pool. Defaults to 2,
            the value that used to be hard-coded.

    Returns:
        A ``(n, n)`` float NumPy array with zeros on the diagonal and
        ``dist_mat[i, j] == dist_mat[j, i]`` for every pair.
    """
    if sequences is None:
        # Backward-compatible fallback: earlier callers relied on the
        # notebook-level ``seqs`` being defined before this function runs.
        sequences = seqs

    n = len(sequences)
    if n < 2:
        # No pairs to compare, so avoid starting worker processes at all.
        return np.zeros((n, n))

    partial_sample_dists = partial(compute_sample_distances,
                                   dist_function=dist_function,
                                   seqs=sequences)
    with Pool(processes) as p:
        results_all = p.map(partial_sample_dists, range(n))

    dist_mat = np.zeros((n, n))
    for row_result in results_all:
        for i, j, dist in row_result:
            # Each pair is computed once (j > i); mirror it to keep the
            # matrix symmetric.
            dist_mat[i, j] = dist
            dist_mat[j, i] = dist
    return dist_mat

In [13]:
# Load at most `nrows` rows, using the first CSV column as the row index.
df = pd.read_csv(dataset_file, index_col=0, nrows=nrows)
# Sequences to compare; compute_dists reads this notebook-level variable.
seqs = df['seq'].values
# Index values reused as row/column labels of the distance matrix.
ids = df.index

In [16]:
# Pairwise distances between all loaded sequences using polyleven's levenshtein.
# NOTE(review): compute_sample_distances calls the distance function with a
# third argument of 1 -- presumably polyleven's cutoff k, which would cap the
# reported distances at k + 1; confirm this is intended.
dist_mat = compute_dists(levenshtein)
# Label rows and columns with the dataset index so entries can be looked up by id.
df_dist_mat = pd.DataFrame(dist_mat, index=ids, columns=ids)

In [18]:
# Write the labelled distance matrix to distance_matrix_file as CSV.
df_dist_mat.to_csv(distance_matrix_file)