In [1]:
# --- Input files ---
# CSV holding the pairwise distance matrix between sequences.
distance_matrix_file = "distance_matrix.csv"
# CSV holding the original dataset.
dataset_file = "dataset.csv"

# --- Output files ---
# Destination CSV for the detected local kings.
local_kings_output_file = "local_kings.csv"

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
def getTP(i, distances=None, data=None):
    '''
    Get the topological prominence of the sequence with index label i.

    Shells of sequences that are m = 1, 2, ... mutations away from sequence i
    are visited in order of increasing m. For every shell the largest count
    (the "max altitude") is recorded, until a shell is reached that contains
    a sequence whose count is at least as large as the count of sequence i.

    Parameters
    ----------
    i : index label
        Label of the sequence; it must be a column of `distances` and a row
        of `data`.
    distances : pandas.DataFrame, optional
        Distance matrix; column i holds the distance from every sequence to
        sequence i. Defaults to the notebook-level `df_distances`.
    data : pandas.DataFrame, optional
        Frame with a 'count' column, indexed by the same labels as
        `distances`. Defaults to the notebook-level `df_sub`.

    Returns
    -------
    tuple (tp, m)
        * (0, 1) if a sequence one mutation away has a count >= the count
          of sequence i.
        * (count_i - min(max altitudes of the closer shells), m) where m is
          the distance of the first shell holding a count >= count_i.
        * (nan, m) if every shell closer than m is empty, so there is no
          intermediate altitude to compare against.
        * (count_i, 0) if no other sequence in the matrix has a count
          >= count_i (sequence i is the global maximum).
    '''
    # Fall back to the notebook-level frames so existing getTP(i) calls keep working.
    if distances is None:
        distances = df_distances
    if data is None:
        data = df_sub

    own_count = data.loc[i, 'count']
    dist_to_i = distances.loc[:, i]  # Loop-invariant: distances of all sequences to i
    max_dist = dist_to_i.max()  # No shell exists beyond this distance

    max_altitudes = []
    m = 0

    # Bounding m by the largest distance guarantees termination even when no
    # sequence with a higher count exists.
    while m < max_dist:
        m += 1
        n_mutants = dist_to_i[dist_to_i == m].index  # Get sequences m mutations away
        if len(n_mutants) == 0:
            # An empty shell has no altitude; its max() would be NaN and make
            # the min() below depend on the order of the recorded values.
            continue
        shell_max = data.loc[n_mutants, 'count'].max()  # Largest count among the m-mutants

        if shell_max >= own_count:
            # The shell contains a sequence at least as abundant as sequence i.
            if m == 1:
                return (0, m)  # A direct neighbour is higher: prominence is 0
            if not max_altitudes:
                return (float('nan'), m)  # Only empty shells in between
            # Prominence is the own count minus the lowest of the max altitudes.
            return (own_count - min(max_altitudes), m)
        max_altitudes.append(shell_max)

    # Every shell was visited without finding a higher count: global maximum.
    return (own_count, 0)

---

In [10]:
# Load the distance matrix and cast its column labels to int so that columns
# can be looked up by integer sequence index.
df_distances = pd.read_csv(distance_matrix_file, index_col=0)
df_distances.columns = df_distances.columns.map(int)

# Load only as many leading rows of the original dataset as the distance matrix has rows.
df_sub = pd.read_csv(dataset_file, index_col=0, nrows=len(df_distances))

In [11]:
# Preview the first rows of the loaded dataset.
df_sub.head()

Unnamed: 0,count,seq,lk
0,17596,AGACATGTTTTTTTAGTATGTTGT,0
1,16261,AAACATGTTTTTTTAGTATGTTGT,0
2,11069,AAACATGTTTTTTAAGTATGTTGT,0
3,10477,AGACATGTTATTTTAGTATGTTGT,0
4,10192,AAACATGTTTTTATAGTATGTTGT,0


In [12]:
# Run getTP for every sequence from position 1 onwards (the range starts at 1,
# so the first row is skipped here) and collect the (tp, m) pairs in a frame.
df_tp_res = pd.DataFrame(
    list(map(getTP, tqdm(range(1, len(df_sub))))),
    columns=['tp', 'm'],
)


  0%|                                                                                          | 0/9999 [00:00<?, ?it/s][A
  0%|                                                                                  | 5/9999 [00:00<06:41, 24.89it/s][A
  0%|                                                                                 | 11/9999 [00:00<05:40, 29.36it/s][A
  0%|                                                                                 | 14/9999 [00:00<06:54, 24.12it/s][A
  0%|▎                                                                                | 40/9999 [00:00<02:28, 66.86it/s][A
  0%|▍                                                                                | 47/9999 [00:00<02:55, 56.78it/s][A
  1%|▍                                                                                | 53/9999 [00:01<03:25, 48.33it/s][A
  1%|▍                                                                                | 58/9999 [00:01<04:01, 41.14it/s][A
  1%|▌ 

In [13]:
# getTP was only evaluated for positions 1..n-1, so df_tp_res is one row short
# and its default index is offset by one relative to df_sub.
# Insert the first sequence under the temporary label -1, using its own count
# as the prominence and m = 0.
df_tp_res.loc[-1] = [df_sub.iloc[0]['count'], 0]
# Shift every label up by one: the inserted row becomes 0, the others 1..n-1.
df_tp_res.index = df_tp_res.index + 1
# Sorting by label moves the inserted row (now label 0) to the top.
df_tp_res = df_tp_res.sort_index()
# Attach the 'tp' and 'm' columns to the dataset; concat with axis=1 aligns on index labels.
# NOTE(review): this cell is not idempotent - re-running it adds another row to
# df_tp_res and duplicate 'tp'/'m' columns to df_sub. Re-run from the data-loading cell instead.
df_sub = pd.concat([df_sub, df_tp_res], axis=1)

In [14]:
# Local kings are the sequences whose topological prominence is strictly positive.
df_lk = df_sub.loc[df_sub['tp'].gt(0)]

In [15]:
# Preview the first rows of the local kings table.
df_lk.head()

Unnamed: 0,count,seq,lk,tp,m
0,17596,AGACATGTTTTTTTAGTATGTTGT,0,17596.0,0
5,10163,AGACATGTTTTGTAAGTATGTTGT,5,1364.0,2
11,8100,AGAAATGTTTAGTATGTATGTTTT,11,811.0,2
12,8091,AGACATGTTTAGTATGTATGTTGT,12,986.0,2
40,5578,AGAAATGTTTTTTTAGTATGTTTT,40,1254.0,2


In [16]:
# Write the local kings to the output path configured at the top of the notebook,
# rather than a hard-coded file name, so that changing the setting takes effect.
df_lk.to_csv(local_kings_output_file)