In [217]:
import networkx as nx
import pandas as pd
from functools import partial 

def to_integer(bitstr):
    """Interpret ``bitstr`` as a base-2 numeral and return its integer value."""
    value = int(bitstr, base=2)
    return value

def Rename_Colors(dict_color_ctr_map, color):
    """Return the replacement id stored for ``color`` in ``dict_color_ctr_map``.

    The mapping comes first so the function can be pre-bound with
    ``functools.partial`` and applied element-wise. A ``color`` that is not a
    key of the mapping raises ``KeyError``.
    """
    renamed = dict_color_ctr_map[color]
    return renamed

def Create_Sublineages_Matrix(df):
    """Build a k-mer x sublineage presence matrix and color each bit pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'K-Mer' column and a 'Sublineages' column. Each
        'Sublineages' value is a comma-separated string of sublineage names.
        Whitespace around a name and empty entries (e.g. from a trailing
        comma) are ignored; a missing value means "no sublineages".

    Returns
    -------
    pandas.DataFrame
        Indexed by 'K_Mer', rows in input order. It holds one column per
        sublineage name (sorted by name) containing the strings "1"/"0",
        followed by:

        * 'Bitstring' - the row's flags concatenated in column order,
        * 'Color'     - 'Bitstring' parsed as a base-2 integer,
        * 'Color_new' - 0-based id of the bit pattern, numbered in order of
          first appearance.
    """
    k_mers = []
    presence_rows = []
    for k_mer, sublineages in zip(df['K-Mer'], df['Sublineages']):
        if pd.isna(sublineages):
            names = []
        else:
            # Strip so that "A, B" and "A,B" address the same column.
            names = [name.strip() for name in str(sublineages).split(',')]
        presence_rows.append({name: 1 for name in names if name})
        k_mers.append(k_mer)

    # The k-mers are supplied as the index instead of as a record key, so an
    # empty input produces an empty frame rather than failing on a missing
    # 'K_Mer' column.
    matrix = pd.DataFrame(presence_rows, index=pd.Index(k_mers, name='K_Mer'))
    matrix = matrix.fillna(0).astype(int).astype(str).sort_index(axis=1)

    matrix['Bitstring'] = [''.join(flags) for flags in matrix.to_numpy()]
    # An empty bitstring can only occur when there are no sublineage columns.
    matrix['Color'] = matrix['Bitstring'].apply(
        lambda bits: int(bits, 2) if bits else 0
    )
    # All bitstrings have the same width, so distinct bitstrings correspond
    # one-to-one to distinct 'Color' values; factorize numbers them 0..n-1 in
    # order of first appearance.
    matrix['Color_new'] = pd.factorize(matrix['Bitstring'])[0]
    return matrix

def Return_Lineage_List(row):
    """Collect the labels of ``row`` whose value is the string "1".

    The bookkeeping labels 'Color', 'Color_new' and 'Bitstring' are never
    reported. ``row`` can be any object offering ``keys()`` and item access
    (e.g. a pandas Series or a dict); labels come back in the order
    ``keys()`` yields them.
    """
    skipped = ('Color', 'Color_new', 'Bitstring')
    return [
        label
        for label in list(row.keys())
        if label not in skipped and row[label] == "1"
    ]

def Prepare_Op_File(df_op):
    """Summarise a presence matrix as one (sublineage list, color) row per k-mer.

    Parameters
    ----------
    df_op : pandas.DataFrame
        Frame whose index holds the k-mers and which has a 'Color_new'
        column; every row is passed to ``Return_Lineage_List`` to obtain the
        sublineages flagged for that k-mer.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'K-mer', with columns 'Sub_Lineages' (list of names) and
        'Color' (the row's 'Color_new' value). An empty ``df_op`` yields an
        empty frame with the same columns and index name.
    """
    lineages = [Return_Lineage_List(row) for _, row in df_op.iterrows()]
    # Building from explicit columns keeps 'K-mer' present even when there
    # are no rows, so set_index cannot fail on an empty input.
    df_ret = pd.DataFrame({
        'Sub_Lineages': lineages,
        'Color': list(df_op['Color_new']),
        'K-mer': list(df_op.index),
    })
    return df_ret.set_index('K-mer')

In [218]:
# Load the k-mer / sublineage table: tab-separated, with the two column
# names supplied explicitly in the call.
df = pd.read_csv(
    'data/HIV_full_Refs_k23_1.kmer_sublineage_info.tsv',
    sep='\t',
    names=['K-Mer', 'Sublineages'],
)

In [219]:
# Expand the k-mer table into the sublineage presence matrix, including the
# 'Bitstring', 'Color' and 'Color_new' columns.
df_op = Create_Sublineages_Matrix(df)

In [220]:
G = nx.read_gexf('data/HIV_full_Refs_k23_1.gexf')
# Store each k-mer's compact color id as the 'Color_new' node attribute.
# A flat {k-mer: color} mapping with name= gives the same result as the
# nested {k-mer: {'Color_new': color}} dict, without first transposing df_op
# into a frame that has one column per k-mer.
# NOTE(review): assumes the graph's node ids are the k-mer strings used as
# df_op's index - confirm against the GEXF file.
nx.set_node_attributes(G, df_op['Color_new'].to_dict(), name='Color_new')

In [221]:
# Reduce the matrix to one row per k-mer (sublineage list + compact color id)
# and save that table as CSV.
df_ret = Prepare_Op_File(df_op)
df_ret.to_csv('data/HIV_full_Refs_k23_1_Color_Table.csv')

In [222]:
# Write the graph, including the 'Color_new' node attribute set above, to a new GEXF file.
nx.write_gexf(G, "data/HIV_full_Refs_k23_1_Color_Annotated.gexf")

In [223]:
# Preview the first rows of the per-k-mer color table.
df_ret.head()

Unnamed: 0_level_0,Color,Sub_Lineages
K-mer,Unnamed: 1_level_1,Unnamed: 2_level_1
AAAAACAGGAAAATATGCCAGAA,0,"[01_AE, 15_01B, 33_01B]"
AAAAAGGACAGCACCAAATGGAG,1,"[01_AE, 15_01B, CPZ]"
AAAAATCTTAGAGCCCTTTAGAA,2,"[01_AE, 02_AG, 04_cpx, 05_DF, 06_cpx, 09_cpx, ..."
AAAACTGGATGACAGAAACCTTG,3,"[01_AE, 02_AG, 05_DF, 15_01B, 25_cpx, 34_01B, F2]"
AAAAGCAGGGTATGTCACTGACA,4,"[01_AE, 02_AG, 11_cpx, 15_01B, 25_cpx, 27_cpx,..."
