# UKB -- ALL CODES

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Download the input file with the `dx` CLI so it can be read by bare filename below.
# This file was created in 02_UKB_add_short_name_to_med_data.ipynb
!dx download -r data/drugs_with_short_name_OCT_09_24.csv

# Add drug codes

In [None]:
# Read the cleaned prescriptions table and keep only the columns used downstream
keep_cols = ['ID', 'issue_date', 'drug_name', 'cleaned_med', 'short_name']
drugs = pd.read_csv('drugs_with_short_name_OCT_09_24.csv')[keep_cols]
drugs

In [None]:
# Order rows by the issue_date column (still stored as text at this point)
drugs = drugs.sort_values('issue_date')
drugs

In [None]:
# Drop every row whose short name is exactly the lowercase marker 'omit'
is_omitted = drugs['short_name'].eq('omit')
drugs = drugs[~is_omitted]
drugs

In [None]:
# Remove rows without a usable prescription date: missing values and the
# placeholder messages stored in issue_date in place of a real date.
invalid_date_messages = [
    "Code has event date matching participant's date of birth",
    "Code has event date in the future and is presumed to be a place-holder or other system default",
    "Code has event date before participant's date of birth",
]
has_date = drugs['issue_date'].notna()
is_placeholder = drugs['issue_date'].isin(invalid_date_messages)
drugs = drugs[has_date & ~is_placeholder]
drugs

In [None]:
# Look at the years drugs data is available.
# `drugs` was produced by boolean filtering above; take an explicit copy so that
# adding columns modifies an independent frame instead of triggering pandas'
# SettingWithCopyWarning.
drugs = drugs.copy()
# Split the text date on '-' into three text columns
# (assumes issue_date is formatted YYYY-MM-DD -- TODO confirm).
drugs[['year', 'month', 'day']] = drugs['issue_date'].str.split('-', expand = True)
drugs.year.value_counts()

In [None]:
# Project decision: keep only medications prescribed in 1999 or later.
# The year column holds text after the split, so convert it to integers first.
drugs = drugs.assign(year = drugs['year'].astype(int))
drugs = drugs[drugs['year'] >= 1999]
drugs

In [None]:
# Replace '/' in short names with 'and'. The short name is later used to build
# per-drug file names, where a slash would be read as a path separator.
# regex = False makes the literal replacement explicit (the default changed between
# pandas versions); assign() returns a new frame, so no column is set on a
# filtered slice of the original.
drugs = drugs.assign(short_name = drugs['short_name'].str.replace('/', 'and', regex = False))

In [None]:
# Create the list of distinct drug short names.
# Sorted so the per-drug loop runs in the same order on every run (set() order is
# arbitrary), and missing short names are excluded because the loop builds column
# names by string concatenation, which would fail on NaN.
drugs_list = sorted(drugs['short_name'].dropna().unique())
print(len(drugs_list))

In [None]:
# Keep one row per distinct short name, as a single-column frame, for future use
df_drugs = drugs[['short_name']].drop_duplicates()
df_drugs

In [None]:
# Write the de-duplicated short names to a one-column CSV (header row, no index column)
df_drugs.to_csv('list_of_cleaned_drugs_n547.csv', header = True, index = False)

In [None]:
# Upload the saved list with the `dx` CLI to the remote path data/list_of_cleaned_drugs_n547.csv
! dx upload list_of_cleaned_drugs_n547.csv --path data/list_of_cleaned_drugs_n547.csv

In [None]:
# Number of distinct short names the per-drug loop below will iterate over
print(len(drugs_list))

In [None]:
# Display the filtered prescriptions table before it is split into per-drug files
drugs

# In this step, we save the drugs in individual csv files by their short name
## We only save files that have at least 10 rows after keeping one row per participant ID
## These smaller csv files are much easier to use than one huge file

In [None]:
has_data = []
less_than_10_samples = []


for drug in drugs_list:
    test = drugs[drugs['short_name'] == drug]
    test = test.rename(columns = {'issue_date': drug + '_DATE'})
    test = test[['ID', drug + '_DATE', 'drug_name', 'cleaned_med', 'short_name']]
    test2 = pd.DataFrame(test.ID.value_counts()).reset_index()
    test2 = test2.rename(columns = {'ID':drug + '_N', 'index':'ID'})
    test = test.sort_values(by = drug + '_DATE')
    test = test.drop_duplicates(subset = 'ID', keep = 'first')
    test = test.merge(test2, left_on = 'ID', right_on = 'ID', how = 'left')
    test = test[~test[drug + '_DATE'].isna()]
    if len(test) > 9:
        test.to_csv(f'{drug}_with_date.csv', header = True, index = False)
        ! dx upload {drug}_with_date.csv --path data/new_drug_csv/{drug}_with_date.csv
        has_data.append(drug)
        print(drug)
    else:
        less_than_10_samples.append(drug)

In [None]:
# How many drugs got their own file, and how many were skipped for having fewer than 10 rows
print(len(has_data))
print(len(less_than_10_samples))

In [None]:
# Short names of the drugs for which a per-drug CSV was written
print(has_data)

In [None]:
# These should be saved OR you would immediately proceed to step 04 where we use these files to prep the next dataframe.

# Corrections: paracetamol and codeine
## These two drugs often occur in combination with other drugs -- so we did some additional cleaning to remove obvious combination drugs

In [None]:
import pandas as pd

# Drug whose per-drug file is being corrected
drug = 'codeine' # we also ran this code for paracetamol
per_drug_path = f'{drug}_with_date.csv'
test = pd.read_csv(per_drug_path)

# Flag products whose name contains a literal '+' (matched as plain text, not as a regex)
has_plus = test['drug_name'].str.contains('+', case=False, na=False, regex = False)
test['contains_test'] = has_plus
test

In [None]:
# Keep only products whose name has no '+'.
# The mask is negated directly instead of comparing to False, and the result is
# copied so that adding another flag column afterwards does not trigger pandas'
# SettingWithCopyWarning.
test = test[~test['contains_test']].copy()
test

In [None]:
# Flag products whose name also mentions the partner analgesic.
# For codeine the partner is paracetamol (unchanged behaviour). When this section is
# re-run with drug = 'paracetamol', searching for 'paracetamol' itself would flag
# nearly every row, so the partner becomes codeine instead.
# NOTE(review): confirm this matches how the paracetamol correction was actually run.
other_drug = 'codeine' if drug == 'paracetamol' else 'paracetamol'
# assign() returns a new frame, so no column is set on a filtered slice
test = test.assign(
    contains_test2 = test['drug_name'].str.contains(other_drug, case=False, na=False, regex = False)
)
test

In [None]:
# Discard the rows flagged by contains_test2
flagged = test['contains_test2']
test = test.loc[~flagged]
test

In [None]:
# Overwrite the per-drug file with the corrected rows.
# index = False (the documented boolean, as used for the other CSVs in this notebook)
# replaces index = None, which only worked because None is falsy.
# NOTE(review): the helper columns contains_test / contains_test2 are written to the
# file as well -- confirm that downstream steps tolerate them.
test.to_csv(f'{drug}_with_date.csv', header = True, index = False)

In [None]:
# Upload the corrected per-drug file with the `dx` CLI to the same remote path the loop above used
! dx upload {drug}_with_date.csv --path data/new_drug_csv/{drug}_with_date.csv

In [None]:
import pandas as pd
# Re-read the corrected codeine file from local disk to inspect the result
test = pd.read_csv('codeine_with_date.csv')
test

In [None]:
# Tabulate the remaining product names to check for combination products that are still present
test.drug_name.value_counts()