In [1]:
import pandas as pd
import random

### Write function to create dummy data

In [3]:
def create_dummy_data(list_of_colnames, row_num, seed=None):
    """Build a DataFrame of random 0/1 flags with a leading ``userId`` column.

    Parameters
    ----------
    list_of_colnames : list
        Labels of the binary columns to generate.
    row_num : int
        Number of rows to generate.
    seed : optional
        When given, values are drawn from a private ``random.Random(seed)``,
        so the same seed always produces the same frame. When ``None``
        (the default) the module-level ``random`` generator is used.

    Returns
    -------
    pandas.DataFrame
        ``row_num`` rows. The first column, ``userId``, holds
        ``0 .. row_num - 1``; it is followed by one column per entry of
        ``list_of_colnames`` containing only 0s and 1s.
    """
    # Private generator when seeded; otherwise fall back to the global one
    rng = random if seed is None else random.Random(seed)
    n_cols = len(list_of_colnames)

    # Create list of lists where each sublist is a row containing 1s & 0s
    rows = [[rng.randint(0, 1) for _ in range(n_cols)] for _ in range(row_num)]
    # Using nested list of rows and list of column names create dataframe
    dataframe = pd.DataFrame(rows, columns=list_of_colnames)

    # Add userId column relevant for Biobank data
    dataframe.insert(0, 'userId', list(range(row_num)))

    return dataframe

### Write function to turn dataframe to tsv

In [4]:
def df_to_tsv(dataframe, filename):
    """Write ``dataframe`` to ``filename`` as tab-separated text, without the index.

    Returns a short confirmation string naming the file that was written.
    """
    confirmation = f'{filename} created.'
    dataframe.to_csv(path_or_buf=filename, index=False, sep='\t')
    return confirmation

### Create actual dummy disease data

In [5]:
# Seed the global RNG so the dummy dataset is identical after every kernel restart
SEED = 42
random.seed(SEED)

# Dataset dimensions
N_DISEASES = 10
N_ROWS = 50000

# Create list of column headers: disease_0 ... disease_9
columns = [f'disease_{i}' for i in range(N_DISEASES)]

# Create dataframe with 50,000 rows
df = create_dummy_data(columns, N_ROWS)

### Write dummy data to tsv file

In [6]:
# NOTE(review): the write step is left disabled. The read cells further down load this
# same path, so the file must already exist on disk; uncomment to regenerate it.
# df_to_tsv(df, '~/Desktop/GitHub/ThyCa_Multimorbidity_UKBB/data_exploration/disease_dummy_data.tsv')

---
## Test out exploratory functions on dummy data

### Read in dummy data tsv

In [20]:
# Function to read in dataframe from tsv
def tsv_to_df(tsv, index_col=False):
    """Load a tab-separated file into a DataFrame.

    The first line is used as the header. ``index_col`` is forwarded to
    ``pandas.read_csv`` unchanged.
    """
    return pd.read_csv(
        tsv,
        index_col=index_col,
        header=0,
        sep='\t',
    )

In [21]:
# Load the dummy disease table from its tsv file
data = tsv_to_df(
    tsv='~/Desktop/GitHub/ThyCa_Multimorbidity_UKBB/data_exploration/disease_dummy_data.tsv',
)

### Find the frequency of each column (disease)

In [9]:
def all_col_frequency(tsv_file, target_value, index_col=False):
    """Compute, for every column of a TSV, the fraction of rows equal to ``target_value``.

    Parameters
    ----------
    tsv_file : path or file-like
        Tab-separated input whose first line is the header.
    target_value : scalar
        Value to look for in each cell.
    index_col : optional
        Forwarded to ``pandas.read_csv``. A column used as the index is not
        included in the result; with the default ``False`` every column in
        the file is counted.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with two columns: ``diseases`` (the input
        column name) and ``Frequency`` (matches divided by row count).
        ``Frequency`` is NaN when the file has a header but no data rows.
    """
    data = pd.read_csv(tsv_file, sep='\t', header=0, index_col=index_col)  # Read in tsv
    row_count = data.shape[0]  # Row count
    column_names = list(data.columns)

    if row_count:
        # Vectorised per-column count of matching cells, divided by the row count
        frequencies = (data.eq(target_value).sum() / row_count).tolist()
    else:
        # No rows: the frequency is undefined, so report NaN rather than divide by zero
        frequencies = [float('nan')] * len(column_names)

    # Column names go in the 'diseases' column (not the index), frequencies beside them
    return pd.DataFrame({'diseases': column_names, 'Frequency': frequencies})


In [10]:
# Share of rows equal to 1 in each disease column; the first column of the file becomes the index
data_freq = all_col_frequency(
    tsv_file='~/Desktop/GitHub/ThyCa_Multimorbidity_UKBB/data_exploration/disease_dummy_data.tsv',
    target_value=1,
    index_col=0,
)

In [11]:
# Display the per-column frequency table
data_freq

Unnamed: 0,diseases,Frequency
0,disease_0,0.49518
1,disease_1,0.4988
2,disease_2,0.501
3,disease_3,0.49838
4,disease_4,0.50184
5,disease_5,0.49554
6,disease_6,0.4974
7,disease_7,0.49822
8,disease_8,0.50094
9,disease_9,0.50022


In [12]:
# Lookup table pairing each column label (FieldID) with its replacement value (Field).
# 'disease_10' has no counterpart among the ten generated disease columns.
dictionary = pd.DataFrame(
    [
        ('disease_0', 0),
        ('disease_7', 7),
        ('disease_8', 8),
        ('disease_2', 2),
        ('disease_1', 1),
        ('disease_9', 9),
        ('disease_3', 3),
        ('disease_10', 10),
        ('disease_4', 4),
        ('disease_5', 5),
        ('disease_6', 6),
    ],
    columns=['FieldID', 'Field'],
)

In [13]:
def phecode_conversion(data, dictionary_df, replace_col, key_col, new_col, index_col=False):
    """Return a copy of ``data`` with ``replace_col`` translated through a lookup table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to translate. It is not modified.
    dictionary_df : pandas.DataFrame
        Lookup table with the columns ``key_col`` and ``new_col``.
    replace_col : str
        Column of ``data`` whose values are replaced.
    key_col : str
        Column of ``dictionary_df`` matched against ``data[replace_col]``.
    new_col : str
        Column of ``dictionary_df`` supplying the replacement values.
    index_col : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which every value of ``replace_col`` has been
        swapped for its ``new_col`` counterpart.

    Raises
    ------
    ValueError
        If any value of ``data[replace_col]`` has no entry in
        ``dictionary_df[key_col]``.
    """
    # Build key -> replacement once; if a key is repeated, its first row wins
    lookup = {}
    for key, value in zip(dictionary_df[key_col], dictionary_df[new_col]):
        lookup.setdefault(key, value)

    originals = list(data[replace_col])

    # Fail loudly rather than produce a shorter or misaligned column
    unmapped = list(dict.fromkeys(item for item in originals if item not in lookup))
    if unmapped:
        raise ValueError(
            f'No {key_col!r} entry for {replace_col!r} value(s): {unmapped}'
        )

    # Work on a copy so the caller's frame is untouched and the call can be repeated
    dataframe = data.copy()
    dataframe[replace_col] = [lookup[item] for item in originals]

    return dataframe



In [14]:
# Replace the 'diseases' labels with their 'Field' values from the lookup table
d = phecode_conversion(
    data_freq,
    dictionary,
    replace_col='diseases',
    key_col='FieldID',
    new_col='Field',
)

In [15]:
# Display the frequency table returned by phecode_conversion
d

Unnamed: 0,diseases,Frequency
0,0,0.49518
1,1,0.4988
2,2,0.501
3,3,0.49838
4,4,0.50184
5,5,0.49554
6,6,0.4974
7,7,0.49822
8,8,0.50094
9,9,0.50022
