In [95]:
import pandas as pd
import random

### Write function to create dummy data

In [96]:
def create_dummy_data(list_of_colnames, row_num, seed=None):
    """Build a DataFrame of random binary (0/1) values with a leading 'userId' column.

    Parameters
    ----------
    list_of_colnames : list
        Header for each random binary column.
    row_num : int
        Number of rows to generate.
    seed : int, optional
        When given, values are drawn from a private ``random.Random(seed)`` so the
        same seed always yields the same frame. When None (default) the module-level
        ``random`` generator is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        ``row_num`` rows; the first column 'userId' holds 0..row_num-1, followed by
        one 0/1 column per entry of ``list_of_colnames``.
    """
    # A private generator keeps seeded calls from disturbing global random state
    rng = random if seed is None else random.Random(seed)
    n_cols = len(list_of_colnames)

    # One sub-list per row, each holding one random 0/1 per column
    rows = [[rng.randint(0, 1) for _ in range(n_cols)] for _ in range(row_num)]
    dataframe = pd.DataFrame(rows, columns=list_of_colnames)

    # Add userId column relevant for Biobank data
    dataframe.insert(0, 'userId', list(range(row_num)))

    return dataframe

### Write function to turn dataframe to tsv

In [97]:
def df_to_tsv(dataframe, filename):
    """Write ``dataframe`` to ``filename`` as tab-separated text, omitting the index.

    Returns a short confirmation message naming the file that was written.
    """
    dataframe.to_csv(path_or_buf=filename, sep='\t', index=False)
    confirmation = f'{filename} created.'
    return confirmation

### Create actual dummy disease data

In [98]:
# Column headers disease_0 ... disease_9
columns = [f'disease_{i}' for i in range(10)]

# Dummy table with 50,000 rows, one per userId
df = create_dummy_data(list_of_colnames=columns, row_num=50000)

### Write dummy data to tsv file

In [99]:
# Uncomment and run once to (re)generate the TSV file that the cells below read back in
# df_to_tsv(df, '~/Desktop/GitHub/ThyCa_Multimorbidity_UKBB/data_exploration/disease_dummy_data.tsv')

---
## Test out exploratory functions on dummy data

### Read in dummy data tsv

In [100]:
def tsv_to_df(tsv, index_col=False):
    """Load a tab-separated file whose first line is the header into a DataFrame.

    ``index_col`` is forwarded unchanged to ``pandas.read_csv``.
    """
    return pd.read_csv(tsv, sep='\t', header=0, index_col=index_col)

In [101]:
# Load the dummy disease table back in from the TSV on disk
data = tsv_to_df(
    tsv='~/Desktop/GitHub/ThyCa_Multimorbidity_UKBB/data_exploration/disease_dummy_data.tsv'
)

### Find the frequency of each column (disease)

In [120]:
def all_col_frequency(tsv_file, target_value, index_col=False):
    """Return, for every column of a TSV file, the fraction of rows equal to ``target_value``.

    Parameters
    ----------
    tsv_file : str or file-like
        Tab-separated file whose first line is the header.
    target_value : scalar
        Value to look for in each column (e.g. 1 for binary disease flags).
    index_col : int, str or False, optional
        Forwarded to ``pandas.read_csv``. A column used as the index is not part of
        ``data.columns`` and therefore gets no row in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'diseases' (the column names of the file) and 'frequency'
        (matches / number of rows). For a file with a header but no data rows the
        frequency is NaN instead of raising ZeroDivisionError.
    """
    data = pd.read_csv(tsv_file, sep='\t', header=0, index_col=index_col)  # Read in tsv
    row_count = data.shape[0]

    # Element-wise comparison, then a per-column sum, counts the matches in one pass
    match_counts = data.eq(target_value).sum()

    if row_count:
        frequencies = (match_counts / row_count).tolist()
    else:
        # No rows: the frequency is undefined rather than a division error
        frequencies = [float('nan')] * data.shape[1]

    # Dataframe with two columns: disease name and frequency
    return pd.DataFrame({'diseases': list(data.columns), 'frequency': frequencies})


In [121]:
# Frequency of value 1 per disease column; userId is read as the index so it is ignored
data_freq = all_col_frequency(
    tsv_file='~/Desktop/GitHub/ThyCa_Multimorbidity_UKBB/data_exploration/disease_dummy_data.tsv',
    target_value=1,
    index_col=0,
)

In [122]:
data_freq

Unnamed: 0,diseases,frequency
0,disease_0,0.49518
1,disease_1,0.4988
2,disease_2,0.501
3,disease_3,0.49838
4,disease_4,0.50184
5,disease_5,0.49554
6,disease_6,0.4974
7,disease_7,0.49822
8,disease_8,0.50094
9,disease_9,0.50022


### Create dummy dictionary dataframe (contains phecodes & disease names)

In [105]:
# Rows deliberately listed in a different order from data_freq for better testing
dictionary = pd.DataFrame(
    [
        ('disease_0', 0),
        ('disease_7', 7),
        ('disease_8', 8),
        ('disease_2', 2),
        ('disease_1', 1),
        ('disease_9', 9),
        ('disease_3', 3),
        ('disease_10', 10),
        ('disease_4', 4),
        ('disease_5', 5),
        ('disease_6', 6),
    ],
    columns=['FieldID', 'Field'],
)

### Convert phecodes to disease names

In [106]:
def phecode_col_conversion(data, dictionary_df, replace_col, key_col, new_col, index_col=False):
    """Replace the phecodes held in one column with their disease names.

    Use this when the phecodes are the *values* of a column (e.g. a frequency table
    with one row per disease).

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``replace_col``. It is not modified.
    dictionary_df : pandas.DataFrame
        Lookup table with the phecodes in ``key_col`` and the names in ``new_col``.
        If a phecode appears more than once, its first row is used.
    replace_col : str
        Column of ``data`` whose phecodes are replaced.
    key_col, new_col : str
        Columns of ``dictionary_df`` holding the phecodes and the replacement names.
    index_col : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``replace_col`` translated. A row whose ``replace_col``
        value is 'userId' (the participant identifier, not a disease) is left out.

    Raises
    ------
    ValueError
        If any phecode in ``replace_col`` has no entry in ``dictionary_df``.
    """
    # phecode -> disease name; setdefault keeps the first occurrence of a repeated key
    lookup = {}
    for key, value in zip(dictionary_df[key_col], dictionary_df[new_col]):
        lookup.setdefault(key, value)

    # Remove the userId entry by value. Dropping index label 0 would discard a real
    # disease row whenever the table holds no userId entry.
    dataframe = data.loc[data[replace_col] != 'userId'].copy()

    codes = list(dataframe[replace_col])
    unmapped = [code for code in codes if code not in lookup]
    if unmapped:
        raise ValueError(f'No dictionary entry for phecode(s): {unmapped}')

    # One name per row, so the new column always lines up with the frame
    dataframe[replace_col] = [lookup[code] for code in codes]

    return dataframe  # Return new dataframe


In [107]:
def phecode_header_conversion(data, dictionary_df, key_col, new_col, index_col=False):
    """Rename phecode column headers to their disease names.

    Use this when the phecodes are the *column headers* of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose headers are phecodes, optionally with a 'userId' column.
        It is not modified.
    dictionary_df : pandas.DataFrame
        Lookup table with the phecodes in ``key_col`` and the names in ``new_col``.
        If a phecode appears more than once, its first row is used.
    key_col, new_col : str
        Columns of ``dictionary_df`` holding the phecodes and the replacement names.
    index_col : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        New frame without the 'userId' column and with every remaining header
        replaced by its disease name.

    Raises
    ------
    ValueError
        If any remaining header has no entry in ``dictionary_df``.
    """
    # phecode -> disease name; setdefault keeps the first occurrence of a repeated key
    lookup = {}
    for key, value in zip(dictionary_df[key_col], dictionary_df[new_col]):
        lookup.setdefault(key, value)

    # Drop userId by name; errors='ignore' covers frames where it is already the index
    dataframe = data.drop(columns='userId', errors='ignore')

    headers = list(dataframe.columns)
    unmapped = [header for header in headers if header not in lookup]
    if unmapped:
        raise ValueError(f'No dictionary entry for phecode(s): {unmapped}')

    # drop() returned a new frame, so relabelling it leaves the caller's data untouched
    dataframe.columns = [lookup[header] for header in headers]

    return dataframe  # Return new dataframe

In [108]:
 disease_name_freq = phecode_col_conversion(data_freq, dictionary, 'diseases', 'FieldID', 'Field')

In [109]:
disease_name_freq

Unnamed: 0,diseases,frequency
0,0,0.49518
1,1,0.4988
2,2,0.501
3,3,0.49838
4,4,0.50184
5,5,0.49554
6,6,0.4974
7,7,0.49822
8,8,0.50094
9,9,0.50022


### Find the count and frequency for a specific disease

In [110]:
def find_col_frequency(tsv_file, colname, target_value):
    """Count how often ``target_value`` occurs in one column of a TSV file.

    Parameters
    ----------
    tsv_file : str or file-like
        Tab-separated file whose first line is the header; its first column is
        read as the row index.
    colname : str
        Column to inspect.
    target_value : scalar
        Value to count (e.g. 1).

    Returns
    -------
    (int, float)
        Number of rows equal to ``target_value`` and that number divided by the
        total row count. For a file with no data rows the frequency is NaN instead
        of raising ZeroDivisionError.
    """
    data = pd.read_csv(tsv_file, sep='\t', header=0, index_col=0)  # Read in tsv
    column = data[colname]
    participant_num = len(column)  # Number of rows/participants

    # int() keeps the returned count a plain Python int, as before
    count = int(column.eq(target_value).sum())
    frequency = count / participant_num if participant_num else float('nan')

    return count, frequency  # Return value count and frequency

In [111]:
d0_count, d0_freq = find_col_frequency(
    tsv_file='~/Desktop/GitHub/ThyCa_Multimorbidity_UKBB/data_exploration/disease_dummy_data.tsv',
    colname='disease_0',
    target_value=1,
)

In [112]:
d0_count

24759

In [113]:
d0_freq

0.49518

### Subset dataframe - e.g. select only participants with ThyCa

In [114]:
def subset_df(dataframe, col_of_interest, col_value):
    """Return the rows of ``dataframe`` whose ``col_of_interest`` equals ``col_value``."""
    matches = dataframe[col_of_interest] == col_value  # Boolean mask, one entry per row
    return dataframe.loc[matches]

In [115]:
# Participants flagged with disease_1
disease_1_df = subset_df(dataframe=data, col_of_interest='disease_1', col_value=1)

In [116]:
disease_1_df

Unnamed: 0,userId,disease_0,disease_1,disease_2,disease_3,disease_4,disease_5,disease_6,disease_7,disease_8,disease_9
0,0,0,1,0,1,1,1,0,1,1,0
1,1,0,1,1,1,0,0,1,0,0,1
2,2,0,1,1,0,1,0,1,1,0,1
5,5,1,1,0,0,0,0,1,1,1,1
6,6,1,1,1,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
49988,49988,1,1,0,0,1,0,1,0,0,0
49989,49989,0,1,1,1,0,0,1,0,0,0
49992,49992,1,1,1,1,0,1,0,1,1,0
49995,49995,0,1,1,1,0,1,0,0,0,1


### Subset dataframe based on items in a list

In [125]:
def subset_df_list(dataframe, col_of_interest, col_value_list):
    """Return the rows of ``dataframe`` whose ``col_of_interest`` value is in ``col_value_list``."""
    wanted = dataframe[col_of_interest].isin(col_value_list)  # Boolean mask, one entry per row
    return dataframe.loc[wanted]

In [126]:
# Re-select the disease_1 participants, this time by their userIds
disease_1_user_ids = list(disease_1_df['userId'])
disease_1_df_v2 = subset_df_list(dataframe=data, col_of_interest='userId', col_value_list=disease_1_user_ids)

In [127]:
disease_1_df_v2

Unnamed: 0,userId,disease_0,disease_1,disease_2,disease_3,disease_4,disease_5,disease_6,disease_7,disease_8,disease_9
0,0,0,1,0,1,1,1,0,1,1,0
1,1,0,1,1,1,0,0,1,0,0,1
2,2,0,1,1,0,1,0,1,1,0,1
5,5,1,1,0,0,0,0,1,1,1,1
6,6,1,1,1,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
49988,49988,1,1,0,0,1,0,1,0,0,0
49989,49989,0,1,1,1,0,0,1,0,0,0
49992,49992,1,1,1,1,0,1,0,1,1,0
49995,49995,0,1,1,1,0,1,0,0,0,1
