Merge pull request #17 from YeChen-IDM/plotting_update_2

Translate R plotting scripts into Python and add get_version.py
InstituteforDiseaseModeling · Sep 20, 2022 · cbb7305 · cbb7305
2 parents 0eee8f6 + 6a56055
commit cbb7305
Show file tree

Hide file tree

Showing 82 changed files with 2,903 additions and 47 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -28,4 +28,4 @@ jobs:
         run: pip install black flake8
 
       - name: Run linters
-        run: flake8 --ignore=E114,E115,E116,E126,E127,E128,E201,E202,E203,E222,E225,E226,E231,E241,E251,E261,E262,E265,E301,E302,E303,E306,E501,E502,W291,W292,W293,W391,W504 --exclude=./tests
+        run: flake8 --ignore=E114,E115,E116,E126,E127,E128,E201,E202,E203,E222,E225,E226,E231,E241,E251,E261,E262,E265,E301,E302,E303,E306,E501,E502,W291,W292,W293,W391,W503 --exclude=./tests
diff --git a/create_plots/Rscripts_archive/.trickgit b/create_plots/Rscripts_archive/.trickgit
diff --git a/create_plots/archive_organization/_20220603/helper_functions_age_inc_prev.py b/create_plots/archive_organization/_20220603/helper_functions_age_inc_prev.py
@@ -0,0 +1,64 @@
+from plotnine import ggplot, aes, geom_line, geom_point, theme_bw, xlab, ylab, scale_color_manual, facet_wrap
+import numpy as np
+import pandas as pd
+from pandas.api.types import CategoricalDtype
+
+
+def get_substr(site_name_str, index):
+    return site_name_str.split("_")[1][index]
+
+
+def get_mean_from_upper_age(cur_age, upper_ages):
+    if not upper_ages or cur_age not in upper_ages:
+        return None
+    else:
+        mean_ages = [upper_ages[0] / 2]
+        for i in range(len(upper_ages) - 1):
+            mean_ages.append((upper_ages[i] + upper_ages[i + 1]) / 2)
+        return mean_ages[upper_ages.index(cur_age)]
+
+
+def plot_inc_ref_sim_comparison(sim_df, ref_df):
+    """
+    Plot age-incidence comparisons with reference
+    Args:
+        sim_df ():
+        ref_df ():
+
+    Returns:
+
+    """
+    sim_df['Incidence'] = sim_df['Incidence'] * sim_df['p_detect_case']
+
+    # set up reference and simulation dataset columns
+    ref_df['Incidence'] = ref_df['INC'] / 1000
+    ref_df['mean_age'] = (ref_df['INC_LAR'] + ref_df['INC_UAR']) / 2
+    ref_df = pd.DataFrame(data={'Incidence': ref_df['Incidence'],
+                                'mean_age': ref_df['mean_age'],
+                                'Site': ref_df['Site'],
+                                'Pop_size': ref_df['POP'],
+                                'year': ref_df['START_YEAR']})
+    sim_df = pd.DataFrame(data={'Incidence': sim_df['Incidence'],
+                                'mean_age': sim_df['mean_age'],
+                                'Site': sim_df['Site'],
+                                'Pop_size': np.nan,
+                                'year': np.nan})
+
+    ref_df['source'] = 'reference'
+    sim_df['source'] = 'simulation'
+
+    df_combined = pd.concat(sim_df, ref_df)
+
+    source_cat = CategoricalDtype(categories=['reference', 'simulation'], ordered=True)
+    df_combined['source'] = df_combined['source'].astype(source_cat)
+
+    gg = (ggplot(df_combined, aes(x='mean_age', y='Incidence', color='source', group='year'))
+          + geom_line()
+          + geom_point()
+          + scale_color_manual(values={"reference": 'red',
+                                       "simulation": 'blue'})
+          + xlab('age (midpoint of age bin)')
+          + ylab('incidence')
+          + facet_wrap('Site', ncol=4)
+          + theme_bw())
+    return gg
diff --git a/create_plots/archive_organization/_20220603/helper_functions_infection_duration.py b/create_plots/archive_organization/_20220603/helper_functions_infection_duration.py
@@ -0,0 +1,144 @@
+#####################################################################################
+# compare_ref_sim_infection_duration_Navrongo
+# contact: mambrose
+# Jan 2022
+#
+# Goal: simulate transmission in EMOD and sample to match reference dataset
+#   Compare:
+#   - probability a person goes from negative to positive between sample dates
+#   - probability a person goes from positive to negative between sample dates
+#   - fraction of samples positive
+#   - among all individuals starting positive, how long until negative?
+#   - among all individuals turning positive during study, how long until negative?
+#####################################################################################
+
+import numpy as np
+import pandas as pd
+import os.path as path
+import datetime
+import math
+import random
+
+
+def get_sim_survey(sim_dir, ref_df, seeds=np.nan):
+    """
+    Get subset of simulation dataset to match reference dataset for dates and ages of individuals of sampled individuals
+    Args:
+        sim_dir ():
+        ref_df ():
+        seeds ():
+
+    Returns:
+
+    """
+    sampled_sim_filename = 'sim_duration_survey_sampling.csv'
+    file_path = path.join(sim_dir, sampled_sim_filename)
+    if path.isfile(file_path):
+        sim_subset_full = pd.read_csv(file_path)
+    else:
+        # Get first year of sampling in reference dataset. the simulation will be referenced from the first day of
+        # that year
+        first_ref_date = datetime.date(datetime.datetime.strptime(ref_df['date'].dropna().min(), "%Y-%m-%d").year, 1, 1)
+        indIDs = ref_df['SID'].unique()
+
+        patient_report_path = path.join(sim_dir, 'patient_reports.csv')
+
+        sim_full = pd.read_csv(patient_report_path)
+        sim_full['date'] = first_ref_date + sim_full['simday']
+        sim_full['age'] = sim_full['age'] / 365
+
+        if math.isnan(seeds):
+            seeds = sim_full['Run_Number'].unique()
+
+        for seed in sorted(seeds):
+            print('Currently on seed ' + seed)
+            sim = sim_full[sim_full['Run_Number'] == seed]  # subset to desired run
+            # track which individuals have already been included from the simulation (
+            # to avoid double-sampling simulation individuals)
+            included_ids = list()
+            sim_subset = pd.DataFrame()
+            for ii in range(len(indIDs)):
+                if ii % 50 == 0:
+                    print('Currently on individual ' + ii + ' out of ', len(indIDs))
+                ref_df_cur = ref_df[ref_df['SID'] == indIDs[ii]]
+                ref_df_cur = ref_df_cur.sort_values(by='date')
+                # find a matching individual
+                age_cur = ref_df_cur['age'].iloc[0]
+                day_cur = ref_df_cur['date'].iloc[0]
+
+                # use age-specific matches
+                id_candidates = sim[(sim['date'] == day_cur) & (round(sim['age'] == round(age_cur)))]['id']
+                id_candidates = [idx not in included_ids for idx in id_candidates]
+                # if no perfect age-match remain, expand year-range until finding a match
+                if len(id_candidates) == 0:
+                    year_range = 0
+                    while len(id_candidates) == 0 & year_range < 100:
+                        year_range = year_range + 5
+                        id_candidates = sim[(sim['date'] == day_cur)
+                                            & (sim['age'].round().isin(range((round(age_cur)-year_range),
+                                                                             round(age_cur)+year_range)))]['id']
+                        id_candidates = [idx not in included_ids for idx in id_candidates]
+
+                    if len(id_candidates) == 0:
+                        print('Problem: no age-matched simulation individual found for reference id: ' + indIDs[ii])
+                    else:
+                        print('No exact age match remaining for reference id: ' + indIDs[ii]
+                              + '. Used simulation individual within ', year_range, ' years.')
+
+                id_sim_cur = random.sample(id_candidates, 1) # todo: should we remove this id after drawing?
+                included_ids.extend(id_sim_cur)
+
+                # keep the same simulation dates as the reference samples for this individual
+                sim_subset_cur = sim[(sim['id'] == id_sim_cur) & (sim['date'].isin(ref_df_cur['date']))]
+                sim_subset = pd.concat(sim_subset, sim_subset_cur)
+
+            sim_subset['seed'] = seed
+            if seed == sorted(seeds)[1]:
+                sim_subset_full = sim_subset
+            else:
+                sim_subset_full = pd.concat(sim_subset_full, sim_subset)
+
+        # rename simulation columns to match reference data
+        sim_subset_full.rename(columns={'id': 'SID', 'true_asexual_parasites': 'DENSITY'}, inplace=True)
+        sim_subset_full.to_csv(file_path, index=False)
+
+    return sim_subset_full
+
+
+# def get_frac_state_swaps(data):
+#     """
+#     Calculate probability of going from negative to positive or from positive to negative between sample dates
+#     Args:
+#         data ():
+#
+#     Returns:
+#
+#     """
+#     # brute force approach iterating through people and dates
+#     indIDs = data['SID'].unique()
+#     sum_denom_pos = 0
+#     sum_turn_neg = 0
+#     sum_denom_neg = 0
+#     sum_turn_pos = 0
+#
+#     for ii in range(len(indIDs)):
+#         data_cur = data[data['SID'] == indIDs[ii]]
+#         data_cur = data_cur.sort_values(by=['date'])
+#         # get indices of positive tests and of negative tests
+#         ind_pos = which(data_cur$DENSITY > pos_thresh_dens)
+#         ind_neg = which(data_cur$DENSITY <= pos_thresh_dens)
+#
+#         # denominators for each (number of each type that had an observation after then (i.e., they could have been observed to change))
+#         last_obs_pos = (data_cur$DENSITY[nrow(data_cur)] > pos_thresh_dens)
+#         sum_denom_pos = sum_denom_pos + ifelse(last_obs_pos, length(ind_pos)-1, length(ind_pos))
+#         sum_denom_neg = sum_denom_neg + ifelse(last_obs_pos, length(ind_neg), length(ind_neg)-1)
+#
+#         # find how many tests change from neg to pos or pos to neg across timesteps
+#         sum_turn_neg = sum_turn_neg + sum((ind_pos + 1) % in % ind_neg)
+#         sum_turn_pos = sum_turn_pos + sum((ind_neg + 1) % in % ind_pos)
+#
+#
+#     frac_pos_turn_neg_next_time = sum_turn_neg / sum_denom_pos
+#     frac_neg_turn_pos_next_time = sum_turn_pos / sum_denom_neg
+#
+#     return (c(frac_pos_turn_neg_next_time, frac_neg_turn_pos_next_time))
diff --git a/create_plots/helper_functions_par_dens.py → ...on/_20220603/helper_functions_par_dens.py b/create_plots/helper_functions_par_dens.py → ...on/_20220603/helper_functions_par_dens.py
@@ -88,11 +88,11 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
     # colors = brewer.pal(n=num_colors, name='BrBG')
     # names(colors) = sorted(combined_df['densitybin'].unique())
     # plot
-    gg1 = (ggplot(combined_df, aes(x='agebin', y='asexual_par_dens_freq', fill='densitybin')) +
-           geom_bar(position="stack", stat="identity") +
-           scale_fill_brewer(palette="BrBG") +
+    gg1 = (ggplot(combined_df, aes(x='agebin', y='asexual_par_dens_freq', fill='densitybin'))
+           + geom_bar(position="stack", stat="identity")
+           + scale_fill_brewer(palette="BrBG")
+           + facet_grid('month~source')
            # scale_fill_manual(values=colors, limits=names(colors)) +
-           facet_grid('month~source')
            )
 
     # = = = = = = = = = #
@@ -108,8 +108,8 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
     combined_df0['max_gamet'] = np.nan
     for rr in range(len(combined_df0.index)):
         if combined_df0['source'].iloc[rr] == 'reference':
-            if ((combined_df0['count_asex'].iloc[rr] > 0) &
-                    (combined_df0['count_asex'].iloc[rr] < combined_df0['bin_total_asex'].iloc[rr])):
+            if ((combined_df0['count_asex'].iloc[rr] > 0)
+                    & (combined_df0['count_asex'].iloc[rr] < combined_df0['bin_total_asex'].iloc[rr])):
                 combined_df0['min_asex'].ilo[rr] = beta.ppf(
                     p=alpha / 2,
                     a=combined_df0['count_asex'].iloc[rr] + 0.5,
@@ -120,8 +120,8 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
                     a=combined_df0['count_asex'].iloc[rr] + 0.5,
                     b=combined_df0['bin_total_asex'].iloc[rr] - combined_df0['count_asex'].iloc[rr] + 0.5)
 
-            if ((combined_df0['count_gamet'].iloc[rr] > 0) &
-                    (combined_df0['count_gamet'].iloc[rr] < combined_df0['bin_total_gamet'].iloc[rr])):
+            if ((combined_df0['count_gamet'].iloc[rr] > 0)
+                    & (combined_df0['count_gamet'].iloc[rr] < combined_df0['bin_total_gamet'].iloc[rr])):
                 combined_df0['min_gamet'].iloc[rr] = beta.ppf(
                     p=alpha / 2,
                     a=combined_df0['count_gamet'].iloc[rr] + 0.5,
@@ -147,33 +147,33 @@ def plot_par_dens_ref_sim_comparison(age_agg_sim_df, ref_df):
     combined_df0['agebin'] = combined_df0['agebin'].astype(age_bin_labels_cat)
 
     # plot asexual densities
-    gg2 = (ggplot(combined_df0, aes(x="densitybin", y='asexual_par_dens_freq', color='source'), alpha=0.8) +
-           geom_line(size=2) +
-           geom_point() +
-           scale_x_continuous(trans='log10') +
-           geom_errorbar(aes(ymin='min_asex', ymax='max_asex'), width=0.2) +
-           theme_bw() +
-           ylab('fraction of population') +
-           xlab('asexual parasite density bin') +
-           scale_color_manual(values={"reference": 'red',
-                                        "simulation": 'blue'}) +
-           facet_grid('agebin~month')
+    gg2 = (ggplot(combined_df0, aes(x="densitybin", y='asexual_par_dens_freq', color='source'), alpha=0.8)
+           + geom_line(size=2)
+           + geom_point()
+           + scale_x_continuous(trans='log10')
+           + geom_errorbar(aes(ymin='min_asex', ymax='max_asex'), width=0.2)
+           + theme_bw()
+           + ylab('fraction of population')
+           + xlab('asexual parasite density bin')
+           + scale_color_manual(values={"reference": 'red',
+                                        "simulation": 'blue'})
+           + facet_grid('agebin~month')
            # scale_fill_brewer(palette = "BrBG") +
            # scale_fill_manual(values=colors, limits=names(colors)) +
            )
 
     # plot gametocyte densities
-    gg3 = (ggplot(combined_df0, aes(x='densitybin', y='gametocyte_dens_freq', color='source')) +
-           geom_line(size=2) +
-           geom_point() +
-           scale_x_continuous(trans='log10') +
-           geom_errorbar(aes(ymin='min_gamet', ymax='max_gamet'), width=0.2) +
-           theme_bw() +
-           ylab('fraction of population') +
-           xlab('gametocyte density bin') +
-           scale_color_manual(values={"reference": 'red',
-                                        "simulation": 'blue'}) +
-           facet_grid('agebin~month')
+    gg3 = (ggplot(combined_df0, aes(x='densitybin', y='gametocyte_dens_freq', color='source'))
+           + geom_line(size=2)
+           + geom_point()
+           + scale_x_continuous(trans='log10')
+           + geom_errorbar(aes(ymin='min_gamet', ymax='max_gamet'), width=0.2)
+           + theme_bw()
+           + ylab('fraction of population')
+           + xlab('gametocyte density bin')
+           + scale_color_manual(values={"reference": 'red',
+                                        "simulation": 'blue'})
+           + facet_grid('agebin~month')
            )
 
-    return list(gg1, gg2, gg3)
+    return gg1, gg2, gg3
diff --git a/create_plots/helpers_coordinate_each_relationship.R b/create_plots/helpers_coordinate_each_relationship.R
@@ -283,7 +283,6 @@ generate_age_infection_duration_outputs = function(coord_csv, simulation_output_
     sim_dir = paste0(simulation_output_filepath, '/', cur_site)
     sim_data = get_sim_survey(sim_dir=sim_dir, ref_df=ref_df)
 
-
     # create and save comparison plots
     gg1 = plot_infection_duration_dist(ref_df=ref_df, sim_data=sim_data, pos_thresh_dens=pos_thresh_dens, duration_bins=duration_bins)
     gg2 = plot_infection_duration_dist_by_age(ref_df=ref_df, sim_data=sim_data, pos_thresh_dens=pos_thresh_dens, duration_bins=duration_bins)