In [16]:
#%%writefile ../../src/data/data_utils.py
# %load ../../src/data/data_utils.py
# %%writefile ../../src/data/data_utils.py

"""
Author: Jim Clauwaert
Created in the scope of my PhD
"""


import pandas as pd
import numpy as np
import itertools

def CreatePairwiseRankData(dfDataset):
    
    """Create pairwise ranked data from dataset. All possible combinations are included.
    (see README for default dataset layout)
    
    Every unordered pair of rows (i, j) with i < j is emitted exactly once, in the
    same order as ``itertools.combinations`` over the rows of ``dfDataset``.
    Rows are paired by position, so rows that share an 'ID' still contribute
    their own values instead of the values of the first row carrying that 'ID'.
    
    Parameters
    -----------
    dfDataset : DataFrame
        Dataframe containing at least 'ID', 'sequence', 'mean_score', '35boxstart' and '10boxstart'. 
        'mean_score_sd' is an optional column
    Returns
    --------
    DF : Dataframe 
        Dataframe with n*(n-1)/2 rows containing paired data with the columns found in the
        original dataframe (suffixed with '_1' and '_2') and 'rank'. 
        Rank is defined as 1 for samples in which 'mean_score_1' > 'mean_score_2' and -1 
        in all other cases (ties and missing scores included).
    Raises
    -------
    KeyError
        If one of the required columns is missing from dfDataset.
    """
    
    sampleCount = dfDataset.shape[0]
    
    # Row positions of every pair (i < j). The strict upper triangle, read
    # row by row, has the same ordering as itertools.combinations(range(n), 2).
    firstPos, secondPos = np.triu_indices(sampleCount, k=1)
    
    # Output column order mirrors the dataset layout; the standard deviation
    # columns are only emitted when the dataset provides them.
    pairedColumns = ['ID', 'sequence', 'mean_score']
    if 'mean_score_sd' in dfDataset.columns:
        pairedColumns.append('mean_score_sd')
    pairedColumns.extend(['35boxstart', '10boxstart'])
    
    # Gather each column once with positional indexing instead of scanning
    # the whole dataset for every single pair.
    pairedData = {}
    for column in pairedColumns:
        values = dfDataset[column].to_numpy()
        pairedData[column + '_1'] = values[firstPos]
        pairedData[column + '_2'] = values[secondPos]
    
    DF = pd.DataFrame(pairedData)
    
    # Comparisons involving NaN evaluate to False and therefore yield -1.
    DF['rank'] = np.where(DF['mean_score_1'] > DF['mean_score_2'], 1, -1)
    
    return DF



Writing ../../src/data/data_utils.py


In [14]:
# Read the external CSV dataset (path is relative to this notebook's directory)
# and build the pairwise rank table from all of its rows.
dataset = pd.read_csv("../../data/external/anderson_lib.csv")
test = CreatePairwiseRankData(dataset)

In [15]:
# Display the pairwise rank table returned by CreatePairwiseRankData.
test

Unnamed: 0,ID_1,ID_2,sequence_1,sequence_2,mean_score_1,mean_score_2,35boxstart_1,35boxstart_2,10boxstart_1,10boxstart_2,rank
0,BBa_J23100,BBa_J23101,ttgacggctagctcagtcctaggtacagtgctagc,tttacagctagctcagtcctaggtattatgctagc,1.00,0.70,0,0,23,23,1
1,BBa_J23100,BBa_J23102,ttgacggctagctcagtcctaggtacagtgctagc,ttgacagctagctcagtcctaggtactgtgctagc,1.00,0.86,0,0,23,23,1
2,BBa_J23100,BBa_J23103,ttgacggctagctcagtcctaggtacagtgctagc,ctgatagctagctcagtcctagggattatgctagc,1.00,0.01,0,0,23,23,1
3,BBa_J23100,BBa_J23104,ttgacggctagctcagtcctaggtacagtgctagc,ttgacagctagctcagtcctaggtattgtgctagc,1.00,0.72,0,0,23,23,1
4,BBa_J23100,BBa_J23105,ttgacggctagctcagtcctaggtacagtgctagc,tttacggctagctcagtcctaggtactatgctagc,1.00,0.24,0,0,23,23,1
5,BBa_J23100,BBa_J23106,ttgacggctagctcagtcctaggtacagtgctagc,tttacggctagctcagtcctaggtatagtgctagc,1.00,0.47,0,0,23,23,1
6,BBa_J23100,BBa_J23107,ttgacggctagctcagtcctaggtacagtgctagc,tttacggctagctcagccctaggtattatgctagc,1.00,0.36,0,0,23,23,1
7,BBa_J23100,BBa_J23108,ttgacggctagctcagtcctaggtacagtgctagc,ctgacagctagctcagtcctaggtataatgctagc,1.00,0.51,0,0,23,23,1
8,BBa_J23100,BBa_J23109,ttgacggctagctcagtcctaggtacagtgctagc,tttacagctagctcagtcctagggactgtgctagc,1.00,0.04,0,0,23,23,1
9,BBa_J23100,BBa_J23110,ttgacggctagctcagtcctaggtacagtgctagc,tttacggctagctcagtcctaggtacaatgctagc,1.00,0.33,0,0,23,23,1


In [2]:
# Scratch cell: build a tiny frame whose only column is labelled '0'.
# NOTE(review): this rebinds `test`, the same name used for the pairwise rank
# table elsewhere in this notebook, and re-imports pandas; the execution
# counts are out of order, so the notebook depends on run order.
import pandas as pd
test = pd.DataFrame({'0':[1,2,3,4]})

In [4]:
# Membership on `.columns` checks column labels; the scratch frame's only
# label is '0', so this evaluates to False.
'2' in test.columns

False