In [169]:
import networkx as nx
import pandas as pd
from functools import partial 

def to_integer(bitstr):
    return int(bitstr, 2)

def Rename_Colors(dict_color_ctr_map, color):
    return dict_color_ctr_map[color]

def Create_Sublineages_Matrix(df):
    row_list = []
    for i, row in df.iterrows():
        K_mer = row['K-Mer']
        sub_lin = row['Sublineages'].split(',')
        bit_val = [1]*len(sub_lin)
        d = dict(zip(sub_lin, bit_val))
        d['K_Mer'] = K_mer
        row_list.append(d)
    df_op = pd.DataFrame(row_list)
    df_op.fillna(0, inplace = True)
    df_op = df_op.set_index('K_Mer')
    df_op.sort_index(axis=1, inplace=True)
    df_op = df_op.astype(int).astype(str)
    df_op['Bitstring'] = df_op.apply(lambda row: ''.join(row.values.astype(str)), axis=1)
    df_op['Color'] = df_op['Bitstring'].apply(to_integer)

    color_unique = df_op['Color'].unique()
    counter = [i for i in range(len(color_unique))]
    color_rename_map = dict(zip(color_unique, counter))
    df_op['Color_new'] = df_op['Color'].apply(partial(Rename_Colors, color_rename_map))
    return df_op

def Return_Lineage_List(row):
    op_list = []
    keys = row.keys()
    print(row)
    for k in keys:
        if k!='Color' and k!='color_new' and k!='Bitstring':
            if row[k] == 1:
                op_list.append(k)
    return op_list

def Prepare_Op_File(df_op):
    df_op['Sub_Lineages'] = df_op.apply(Return_Lineage_List)
    return df_op[['Sub_Lineages', 'Color_new']]

In [170]:
df = pd.read_csv('data/HIV_full_Refs_k23_1.kmer_sublineage_info.tsv', sep = '\t', names = ['K-Mer','Sublineages'])

In [171]:
df_op = Create_Sublineages_Matrix(df)

In [172]:
G = nx.read_gexf('data/HIV_full_Refs_k23_1.gexf')
nx.set_node_attributes(G, df_op[['Color_new']].T.to_dict())

In [173]:
df_op = Prepare_Op_File(df_op)
df_op.to_csv('data/HIV_full_Refs_k23_1_Color_Table.csv')

K_Mer
AAAAACAGGAAAATATGCCAGAA    1
AAAAAGGACAGCACCAAATGGAG    1
AAAAATCTTAGAGCCCTTTAGAA    1
AAAACTGGATGACAGAAACCTTG    1
AAAAGCAGGGTATGTCACTGACA    1
AAAAGGGGGGACTGGATGGGCTA    1
AAAAGGTGGATAATCCTGGGATT    1
AAAATAGCCACAGAAAGCATAGT    1
AAAATTGGGCCTGAAAATCCATA    1
AAACTCATCTGCACCACTAATGT    0
AAAGACACAGCAGGCAGCAGCTG    1
AAAGACCCCATAGTAGGAGCAGA    1
AAAGAGCAGTTGGACTGGGAGCT    0
AAAGGTAGTACCAAGAAGAAAAG    0
AAAGTAATACACACAGACAACGG    1
AAAGTCATCCTGGTAGCAGTCCA    1
AAATAAGAAAACAGAAGAAAATA    0
AAATAGATCCTGACCTAGCAGAC    1
AAATCACTCTTTGGCAACGACCC    1
AAATTAGTAGATTTCAGAGAGCT    1
AAATTGTAATACCTCAGCCATTA    1
AACAAGTAGATAAATTAGTCAGT    1
AACAATGAATGGACATTAGAACT    1
AACTAGAGATCCCTCAGACCACT    1
AACTCCTCAGGGGAGCTAAAGCA    1
AACTGTGGCAAAGAAGGACACCT    1
AACTTAAATCATTATTTAATACA    1
AAGAACAAATAGGATGGATGACA    1
AAGAACTTAAATCATTATTTAAT    1
AAGAAGCAGAAGGTCCATGCACT    1
                          ..
TGGCCAAACTTGTGGAAATGGGG    1
TGGCCGGGGCGGAGTTGGGGAGT    1
TGGGGATACTTGGGAAGGGGTTG    1
TGGGGCTG

Name: Bitstring, Length: 420, dtype: object
K_Mer
AAAAACAGGAAAATATGCCAGAA    1152991873887895552
AAAAAGGACAGCACCAAATGGAG    1152991873351025664
AAAAATCTTAGAGCCCTTTAGAA    1986343622752089064
AAAACTGGATGACAGAAACCTTG    1801510288680288384
AAAAGCAGGGTATGTCACTGACA    1730578612132274192
AAAAGGGGGGACTGGATGGGCTA    1406797624447606856
AAAAGGTGGATAATCCTGGGATT    1152992458338992128
AAAATAGCCACAGAAAGCATAGT    1225049493968272000
AAAATTGGGCCTGAAAATCCATA    2222380060764010841
AAACTCATCTGCACCACTAATGT     799110147629531640
AAAGACACAGCAGGCAGCAGCTG    1874693748904181760
AAAGACCCCATAGTAGGAGCAGA    1729452626060296192
AAAGAGCAGTTGGACTGGGAGCT     612527999044296768
AAAGGTAGTACCAAGAAGAAAAG     835058299028775640
AAAGTAATACACACAGACAACGG    1152991874156331008
AAAGTCATCCTGGTAGCAGTCCA    1152991874156363776
AAATAAGAAAACAGAAGAAAATA     576601489791778816
AAATAGATCCTGACCTAGCAGAC    1729454962989334592
AAATCACTCTTTGGCAACGACCC    1826901859070080936
AAATTAGTAGATTTCAGAGAGCT    1301648156420751552
AAATTGTAAT

In [157]:
nx.write_gexf(G, "data/HIV_full_Refs_k23_1_Color_Annotated.gexf")

In [158]:
df_op.head()

Unnamed: 0_level_0,Sub_Lineages,Color_new
K_Mer,Unnamed: 1_level_1,Unnamed: 2_level_1
AAAAACAGGAAAATATGCCAGAA,,0
AAAAAGGACAGCACCAAATGGAG,,1
AAAAATCTTAGAGCCCTTTAGAA,,2
AAAACTGGATGACAGAAACCTTG,,3
AAAAGCAGGGTATGTCACTGACA,,4
