In [212]:
import networkx as nx
import pandas as pd
from functools import partial 

def to_integer(bitstr):
    return int(bitstr, 2)

def Rename_Colors(dict_color_ctr_map, color):
    return dict_color_ctr_map[color]

def Create_Sublineages_Matrix(df):
    row_list = []
    for i, row in df.iterrows():
        K_mer = row['K-Mer']
        sub_lin = row['Sublineages'].split(',')
        bit_val = [1]*len(sub_lin)
        d = dict(zip(sub_lin, bit_val))
        d['K_Mer'] = K_mer
        row_list.append(d)
    df_op = pd.DataFrame(row_list)
    df_op.fillna(0, inplace = True)
    df_op = df_op.set_index('K_Mer')
    df_op.sort_index(axis=1, inplace=True)
    df_op = df_op.astype(int).astype(str)
    df_op['Bitstring'] = df_op.apply(lambda row: ''.join(row.values.astype(str)), axis=1)
    df_op['Color'] = df_op['Bitstring'].apply(to_integer)

    color_unique = df_op['Color'].unique()
    counter = [i for i in range(len(color_unique))]
    color_rename_map = dict(zip(color_unique, counter))
    df_op['Color_new'] = df_op['Color'].apply(partial(Rename_Colors, color_rename_map))
    return df_op

def Return_Lineage_List(row):
    op_list = []
    keys = list(row.keys())
    print(keys)
    for k in keys:
        if k!='Color' and k!='Color_new' and k!='Bitstring':
            if row[k] == "1":
                op_list.append(k)
    print(op_list)
    return op_list

def Prepare_Op_File(df_op):
    data = []
    for (index, row) in df_op.iterrows():
        d = {'Sub_Lineages':Return_Lineage_List(row), 'Color':row['Color_new'], 'K-mer':index}
        data.append(d)
    df_ret = pd.DataFrame(data)
    df_ret.set_index('K-mer', inplace = True)
    return df_ret

In [213]:
df = pd.read_csv('data/HIV_full_Refs_k23_1.kmer_sublineage_info.tsv', sep = '\t', names = ['K-Mer','Sublineages'])

In [214]:
df_op = Create_Sublineages_Matrix(df)

In [215]:
G = nx.read_gexf('data/HIV_full_Refs_k23_1.gexf')
nx.set_node_attributes(G, df_op[['Color_new']].T.to_dict())

In [216]:
df_ret = Prepare_Op_File(df_op)
df_ret.to_csv('data/HIV_full_Refs_k23_1_Color_Table.csv')

['01_AE', '02_AG', '03_AB', '04_cpx', '05_DF', '06_cpx', '07_BC', '08_BC', '09_cpx', '10_CD', '11_cpx', '12_BF', '13_cpx', '14_BG', '15_01B', '16_A2D', '17_BF', '18_cpx', '19_cpx', '20_BG', '21_A2D', '22_01A1', '23_BG', '24_BG', '25_cpx', '26_AU', '27_cpx', '28_BF', '29_BF', '31_BC', '32_06A1', '33_01B', '34_01B', '35_AD', '36_cpx', '37_cpx', '38_BF1', '39_BF', '40_BF', '42_BF', '43_02G', '44_BF', '45_cpx', '46_BF', '47_BF', '49_cpx', 'A1', 'A2', 'B', 'C', 'CPZ', 'D', 'F1', 'F2', 'G', 'H', 'J', 'K', 'N', 'O', 'P', 'Bitstring', 'Color', 'Color_new']
['01_AE', '15_01B', '33_01B']
['01_AE', '02_AG', '03_AB', '04_cpx', '05_DF', '06_cpx', '07_BC', '08_BC', '09_cpx', '10_CD', '11_cpx', '12_BF', '13_cpx', '14_BG', '15_01B', '16_A2D', '17_BF', '18_cpx', '19_cpx', '20_BG', '21_A2D', '22_01A1', '23_BG', '24_BG', '25_cpx', '26_AU', '27_cpx', '28_BF', '29_BF', '31_BC', '32_06A1', '33_01B', '34_01B', '35_AD', '36_cpx', '37_cpx', '38_BF1', '39_BF', '40_BF', '42_BF', '43_02G', '44_BF', '45_cpx', '46_

['01_AE', '15_01B', '17_BF', '19_cpx', '33_01B', '34_01B', '37_cpx']
['01_AE', '02_AG', '03_AB', '04_cpx', '05_DF', '06_cpx', '07_BC', '08_BC', '09_cpx', '10_CD', '11_cpx', '12_BF', '13_cpx', '14_BG', '15_01B', '16_A2D', '17_BF', '18_cpx', '19_cpx', '20_BG', '21_A2D', '22_01A1', '23_BG', '24_BG', '25_cpx', '26_AU', '27_cpx', '28_BF', '29_BF', '31_BC', '32_06A1', '33_01B', '34_01B', '35_AD', '36_cpx', '37_cpx', '38_BF1', '39_BF', '40_BF', '42_BF', '43_02G', '44_BF', '45_cpx', '46_BF', '47_BF', '49_cpx', 'A1', 'A2', 'B', 'C', 'CPZ', 'D', 'F1', 'F2', 'G', 'H', 'J', 'K', 'N', 'O', 'P', 'Bitstring', 'Color', 'Color_new']
['01_AE', '15_01B', '22_01A1', '33_01B', '34_01B']
['01_AE', '02_AG', '03_AB', '04_cpx', '05_DF', '06_cpx', '07_BC', '08_BC', '09_cpx', '10_CD', '11_cpx', '12_BF', '13_cpx', '14_BG', '15_01B', '16_A2D', '17_BF', '18_cpx', '19_cpx', '20_BG', '21_A2D', '22_01A1', '23_BG', '24_BG', '25_cpx', '26_AU', '27_cpx', '28_BF', '29_BF', '31_BC', '32_06A1', '33_01B', '34_01B', '35_AD', 

In [189]:
nx.write_gexf(G, "data/HIV_full_Refs_k23_1_Color_Annotated.gexf")

In [190]:
df_ret.head()

Unnamed: 0_level_0,Color,Sub_Lineages
K-mer,Unnamed: 1_level_1,Unnamed: 2_level_1
AAAAACAGGAAAATATGCCAGAA,0,[]
AAAAAGGACAGCACCAAATGGAG,1,[]
AAAAATCTTAGAGCCCTTTAGAA,2,[]
AAAACTGGATGACAGAAACCTTG,3,[]
AAAAGCAGGGTATGTCACTGACA,4,[]
