Skip to content

Commit

Permalink
Merge pull request #17 from YeChen-IDM/plotting_update_2
Browse files Browse the repository at this point in the history
Translate R plotting scripts into Python and add get_version.py
  • Loading branch information
YeChen-IDM committed Sep 20, 2022
2 parents 0eee8f6 + 6a56055 commit cbb7305
Show file tree
Hide file tree
Showing 82 changed files with 2,903 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ jobs:
run: pip install black flake8

- name: Run linters
run: flake8 --ignore=E114,E115,E116,E126,E127,E128,E201,E202,E203,E222,E225,E226,E231,E241,E251,E261,E262,E265,E301,E302,E303,E306,E501,E502,W291,W292,W293,W391,W504 --exclude=./tests
run: flake8 --ignore=E114,E115,E116,E126,E127,E128,E201,E202,E203,E222,E225,E226,E231,E241,E251,E261,E262,E265,E301,E302,E303,E306,E501,E502,W291,W292,W293,W391,W503 --exclude=./tests
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from plotnine import ggplot, aes, geom_line, geom_point, theme_bw, xlab, ylab, scale_color_manual, facet_wrap
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype


def get_substr(site_name_str, index):
return site_name_str.split("_")[1][index]


def get_mean_from_upper_age(cur_age, upper_ages):
if not upper_ages or cur_age not in upper_ages:
return None
else:
mean_ages = [upper_ages[0] / 2]
for i in range(len(upper_ages) - 1):
mean_ages.append((upper_ages[i] + upper_ages[i + 1]) / 2)
return mean_ages[upper_ages.index(cur_age)]


def plot_inc_ref_sim_comparison(sim_df, ref_df):
"""
Plot age-incidence comparisons with reference
Args:
sim_df ():
ref_df ():
Returns:
"""
sim_df['Incidence'] = sim_df['Incidence'] * sim_df['p_detect_case']

# set up reference and simulation dataset columns
ref_df['Incidence'] = ref_df['INC'] / 1000
ref_df['mean_age'] = (ref_df['INC_LAR'] + ref_df['INC_UAR']) / 2
ref_df = pd.DataFrame(data={'Incidence': ref_df['Incidence'],
'mean_age': ref_df['mean_age'],
'Site': ref_df['Site'],
'Pop_size': ref_df['POP'],
'year': ref_df['START_YEAR']})
sim_df = pd.DataFrame(data={'Incidence': sim_df['Incidence'],
'mean_age': sim_df['mean_age'],
'Site': sim_df['Site'],
'Pop_size': np.nan,
'year': np.nan})

ref_df['source'] = 'reference'
sim_df['source'] = 'simulation'

df_combined = pd.concat(sim_df, ref_df)

source_cat = CategoricalDtype(categories=['reference', 'simulation'], ordered=True)
df_combined['source'] = df_combined['source'].astype(source_cat)

gg = (ggplot(df_combined, aes(x='mean_age', y='Incidence', color='source', group='year'))
+ geom_line()
+ geom_point()
+ scale_color_manual(values={"reference": 'red',
"simulation": 'blue'})
+ xlab('age (midpoint of age bin)')
+ ylab('incidence')
+ facet_wrap('Site', ncol=4)
+ theme_bw())
return gg
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#####################################################################################
# compare_ref_sim_infection_duration_Navrongo
# contact: mambrose
# Jan 2022
#
# Goal: simulate transmission in EMOD and sample to match reference dataset
# Compare:
# - probability a person goes from negative to positive between sample dates
# - probability a person goes from positive to negative between sample dates
# - fraction of samples positive
# - among all individuals starting positive, how long until negative?
# - among all individuals turning positive during study, how long until negative?
#####################################################################################

import numpy as np
import pandas as pd
import os.path as path
import datetime
import math
import random


def get_sim_survey(sim_dir, ref_df, seeds=np.nan):
"""
Get subset of simulation dataset to match reference dataset for dates and ages of individuals of sampled individuals
Args:
sim_dir ():
ref_df ():
seeds ():
Returns:
"""
sampled_sim_filename = 'sim_duration_survey_sampling.csv'
file_path = path.join(sim_dir, sampled_sim_filename)
if path.isfile(file_path):
sim_subset_full = pd.read_csv(file_path)
else:
# Get first year of sampling in reference dataset. the simulation will be referenced from the first day of
# that year
first_ref_date = datetime.date(datetime.datetime.strptime(ref_df['date'].dropna().min(), "%Y-%m-%d").year, 1, 1)
indIDs = ref_df['SID'].unique()

patient_report_path = path.join(sim_dir, 'patient_reports.csv')

sim_full = pd.read_csv(patient_report_path)
sim_full['date'] = first_ref_date + sim_full['simday']
sim_full['age'] = sim_full['age'] / 365

if math.isnan(seeds):
seeds = sim_full['Run_Number'].unique()

for seed in sorted(seeds):
print('Currently on seed ' + seed)
sim = sim_full[sim_full['Run_Number'] == seed] # subset to desired run
# track which individuals have already been included from the simulation (
# to avoid double-sampling simulation individuals)
included_ids = list()
sim_subset = pd.DataFrame()
for ii in range(len(indIDs)):
if ii % 50 == 0:
print('Currently on individual ' + ii + ' out of ', len(indIDs))
ref_df_cur = ref_df[ref_df['SID'] == indIDs[ii]]
ref_df_cur = ref_df_cur.sort_values(by='date')
# find a matching individual
age_cur = ref_df_cur['age'].iloc[0]
day_cur = ref_df_cur['date'].iloc[0]

# use age-specific matches
id_candidates = sim[(sim['date'] == day_cur) & (round(sim['age'] == round(age_cur)))]['id']
id_candidates = [idx not in included_ids for idx in id_candidates]
# if no perfect age-match remain, expand year-range until finding a match
if len(id_candidates) == 0:
year_range = 0
while len(id_candidates) == 0 & year_range < 100:
year_range = year_range + 5
id_candidates = sim[(sim['date'] == day_cur)
& (sim['age'].round().isin(range((round(age_cur)-year_range),
round(age_cur)+year_range)))]['id']
id_candidates = [idx not in included_ids for idx in id_candidates]

if len(id_candidates) == 0:
print('Problem: no age-matched simulation individual found for reference id: ' + indIDs[ii])
else:
print('No exact age match remaining for reference id: ' + indIDs[ii]
+ '. Used simulation individual within ', year_range, ' years.')

id_sim_cur = random.sample(id_candidates, 1) # todo: should we remove this id after drawing?
included_ids.extend(id_sim_cur)

# keep the same simulation dates as the reference samples for this individual
sim_subset_cur = sim[(sim['id'] == id_sim_cur) & (sim['date'].isin(ref_df_cur['date']))]
sim_subset = pd.concat(sim_subset, sim_subset_cur)

sim_subset['seed'] = seed
if seed == sorted(seeds)[1]:
sim_subset_full = sim_subset
else:
sim_subset_full = pd.concat(sim_subset_full, sim_subset)

# rename simulation columns to match reference data
sim_subset_full.rename(columns={'id': 'SID', 'true_asexual_parasites': 'DENSITY'}, inplace=True)
sim_subset_full.to_csv(file_path, index=False)

return sim_subset_full


# def get_frac_state_swaps(data):
# """
# Calculate probability of going from negative to positive or from positive to negative between sample dates
# Args:
# data ():
#
# Returns:
#
# """
# # brute force approach iterating through people and dates
# indIDs = data['SID'].unique()
# sum_denom_pos = 0
# sum_turn_neg = 0
# sum_denom_neg = 0
# sum_turn_pos = 0
#
# for ii in range(len(indIDs)):
# data_cur = data[data['SID'] == indIDs[ii]]
# data_cur = data_cur.sort_values(by=['date'])
# # get indices of positive tests and of negative tests
# ind_pos = which(data_cur$DENSITY > pos_thresh_dens)
# ind_neg = which(data_cur$DENSITY <= pos_thresh_dens)
#
# # denominators for each (number of each type that had an observation after then (i.e., they could have been observed to change))
# last_obs_pos = (data_cur$DENSITY[nrow(data_cur)] > pos_thresh_dens)
# sum_denom_pos = sum_denom_pos + ifelse(last_obs_pos, length(ind_pos)-1, length(ind_pos))
# sum_denom_neg = sum_denom_neg + ifelse(last_obs_pos, length(ind_neg), length(ind_neg)-1)
#
# # find how many tests change from neg to pos or pos to neg across timesteps
# sum_turn_neg = sum_turn_neg + sum((ind_pos + 1) % in % ind_neg)
# sum_turn_pos = sum_turn_pos + sum((ind_neg + 1) % in % ind_pos)
#
#
# frac_pos_turn_neg_next_time = sum_turn_neg / sum_denom_pos
# frac_neg_turn_pos_next_time = sum_turn_pos / sum_denom_neg
#
# return (c(frac_pos_turn_neg_next_time, frac_neg_turn_pos_next_time))
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,11 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
# colors = brewer.pal(n=num_colors, name='BrBG')
# names(colors) = sorted(combined_df['densitybin'].unique())
# plot
gg1 = (ggplot(combined_df, aes(x='agebin', y='asexual_par_dens_freq', fill='densitybin')) +
geom_bar(position="stack", stat="identity") +
scale_fill_brewer(palette="BrBG") +
gg1 = (ggplot(combined_df, aes(x='agebin', y='asexual_par_dens_freq', fill='densitybin'))
+ geom_bar(position="stack", stat="identity")
+ scale_fill_brewer(palette="BrBG")
+ facet_grid('month~source')
# scale_fill_manual(values=colors, limits=names(colors)) +
facet_grid('month~source')
)

# = = = = = = = = = #
Expand All @@ -108,8 +108,8 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
combined_df0['max_gamet'] = np.nan
for rr in range(len(combined_df0.index)):
if combined_df0['source'].iloc[rr] == 'reference':
if ((combined_df0['count_asex'].iloc[rr] > 0) &
(combined_df0['count_asex'].iloc[rr] < combined_df0['bin_total_asex'].iloc[rr])):
if ((combined_df0['count_asex'].iloc[rr] > 0)
& (combined_df0['count_asex'].iloc[rr] < combined_df0['bin_total_asex'].iloc[rr])):
combined_df0['min_asex'].ilo[rr] = beta.ppf(
p=alpha / 2,
a=combined_df0['count_asex'].iloc[rr] + 0.5,
Expand All @@ -120,8 +120,8 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
a=combined_df0['count_asex'].iloc[rr] + 0.5,
b=combined_df0['bin_total_asex'].iloc[rr] - combined_df0['count_asex'].iloc[rr] + 0.5)

if ((combined_df0['count_gamet'].iloc[rr] > 0) &
(combined_df0['count_gamet'].iloc[rr] < combined_df0['bin_total_gamet'].iloc[rr])):
if ((combined_df0['count_gamet'].iloc[rr] > 0)
& (combined_df0['count_gamet'].iloc[rr] < combined_df0['bin_total_gamet'].iloc[rr])):
combined_df0['min_gamet'].iloc[rr] = beta.ppf(
p=alpha / 2,
a=combined_df0['count_gamet'].iloc[rr] + 0.5,
Expand All @@ -147,33 +147,33 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
combined_df0['agebin'] = combined_df0['agebin'].astype(age_bin_labels_cat)

# plot asexual densities
gg2 = (ggplot(combined_df0, aes(x="densitybin", y='asexual_par_dens_freq', color='source'), alpha=0.8) +
geom_line(size=2) +
geom_point() +
scale_x_continuous(trans='log10') +
geom_errorbar(aes(ymin='min_asex', ymax='max_asex'), width=0.2) +
theme_bw() +
ylab('fraction of population') +
xlab('asexual parasite density bin') +
scale_color_manual(values={"reference": 'red',
"simulation": 'blue'}) +
facet_grid('agebin~month')
gg2 = (ggplot(combined_df0, aes(x="densitybin", y='asexual_par_dens_freq', color='source'), alpha=0.8)
+ geom_line(size=2)
+ geom_point()
+ scale_x_continuous(trans='log10')
+ geom_errorbar(aes(ymin='min_asex', ymax='max_asex'), width=0.2)
+ theme_bw()
+ ylab('fraction of population')
+ xlab('asexual parasite density bin')
+ scale_color_manual(values={"reference": 'red',
"simulation": 'blue'})
+ facet_grid('agebin~month')
# scale_fill_brewer(palette = "BrBG") +
# scale_fill_manual(values=colors, limits=names(colors)) +
)

# plot gametocyte densities
gg3 = (ggplot(combined_df0, aes(x='densitybin', y='gametocyte_dens_freq', color='source')) +
geom_line(size=2) +
geom_point() +
scale_x_continuous(trans='log10') +
geom_errorbar(aes(ymin='min_gamet', ymax='max_gamet'), width=0.2) +
theme_bw() +
ylab('fraction of population') +
xlab('gametocyte density bin') +
scale_color_manual(values={"reference": 'red',
"simulation": 'blue'}) +
facet_grid('agebin~month')
gg3 = (ggplot(combined_df0, aes(x='densitybin', y='gametocyte_dens_freq', color='source'))
+ geom_line(size=2)
+ geom_point()
+ scale_x_continuous(trans='log10')
+ geom_errorbar(aes(ymin='min_gamet', ymax='max_gamet'), width=0.2)
+ theme_bw()
+ ylab('fraction of population')
+ xlab('gametocyte density bin')
+ scale_color_manual(values={"reference": 'red',
"simulation": 'blue'})
+ facet_grid('agebin~month')
)

return list(gg1, gg2, gg3)
return gg1, gg2, gg3
1 change: 0 additions & 1 deletion create_plots/helpers_coordinate_each_relationship.R
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,6 @@ generate_age_infection_duration_outputs = function(coord_csv, simulation_output_
sim_dir = paste0(simulation_output_filepath, '/', cur_site)
sim_data = get_sim_survey(sim_dir=sim_dir, ref_df=ref_df)


# create and save comparison plots
gg1 = plot_infection_duration_dist(ref_df=ref_df, sim_data=sim_data, pos_thresh_dens=pos_thresh_dens, duration_bins=duration_bins)
gg2 = plot_infection_duration_dist_by_age(ref_df=ref_df, sim_data=sim_data, pos_thresh_dens=pos_thresh_dens, duration_bins=duration_bins)
Expand Down
Loading

0 comments on commit cbb7305

Please sign in to comment.