### Piecewise linear models with pystan

Based on the following tutorial from Jan Vanhove: \
https://janhove.github.io/analysis/2018/07/04/bayesian-breakpoint-model

Useful reference when going between rstan and pystan: \
https://pystan.readthedocs.io/en/latest/differences_pystan_rstan.html

In [None]:
import numpy as np
import pandas as pd
import pystan as ps
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import arviz
%matplotlib inline
import tqdm
import matplotlib
import plotly.express as px

# Let pandas show every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Font settings for all matplotlib figures created in this notebook.
# NOTE(review): with font.family set to 'monospace', the 'font.sans-serif'
# list is presumably never consulted -- confirm whether 'font.monospace'
# was the intended key.
matplotlib.rcParams.update({
    'font.family': 'monospace',
    'font.sans-serif': ['Ubuntu'],
})

In [None]:
# Display the installed pandas version as the cell output.
pd.__version__

In [None]:
import os


def dataset_name_from_path(path):
    """Return the dataset label encoded in a final-summary CSV path.

    The label is the file name with everything from the first
    '-final_summary.csv' onward removed. Directory components are ignored,
    so the result does not depend on how deep the file sits below the
    working directory or on the path separator the OS uses.
    """
    return os.path.basename(path).split('-final_summary.csv')[0]


# Load the 10xv3 results with 30x sampling for each cell/depth combination.
# Each table is ordered by number of sampled cells, then by total UMIs.
# The paths are sorted so the datasets are always loaded (and later
# processed) in the same order, regardless of filesystem listing order.
dfs = {}
for csv_path in sorted(glob.glob('./10xv3_final_summaries/*')):
    dfs[dataset_name_from_path(csv_path)] = (
        pd.read_csv(csv_path)
        .sort_values(["sampled_cells", "total_UMIs"], ascending=True)
    )

In [None]:
# Build the StanModel from the Stan program file in the working directory;
# the model name passed here matches the file's base name.
stan_model = ps.StanModel(
    model_name="seqdepth_2predictors_piecewise_v3",
    file="seqdepth_2predictors_piecewise_v3.stan",
)


In [None]:
# Display the model object's repr as the cell output.
stan_model

In [None]:
# Print the model's `model_code` attribute (presumably the Stan program
# source that was read from the .stan file -- confirm against PyStan docs).
print(stan_model.model_code)

In [None]:
# Widen the DataFrame display: no truncation of cell contents, and no
# wrapping of wide frames across multiple printed blocks.
pd.set_option(
    'display.max_colwidth', None,
    'display.expand_frame_repr', False,
)


In [None]:
# Display the keys of `dfs` (one per loaded summary file) as the cell output.
dfs.keys()

In [None]:
# Display the table stored under the '10x_genomics_data-neuron_1k_v3' key.
dfs['10x_genomics_data-neuron_1k_v3']

In [None]:
import os

results = {}
results_folder = './results3/'

# Create the output directory before any sampling starts. Every fit is
# expensive, and DataFrame.to_csv raises if the target folder is missing,
# which would otherwise only surface after the first dataset has been fitted.
os.makedirs(results_folder, exist_ok=True)

# Fit the piecewise model to every loaded dataset, show diagnostics inline,
# and write the posterior draws and the summary table to CSV.
for dataset in dfs:
    print(dataset)
    df = dfs[dataset]
    # Placeholder entry; nothing is stored in it within this cell.
    results[dataset] = {}

    # All three variables are passed to Stan on a log2 scale; N is the
    # number of rows in the dataset's table.
    data_dict = {
        "ncells": np.log2(df["sampled_cells"]),
        "umis_per_cell": np.log2(df["UMIs_per_cell"]),
        "validation_error": np.log2(df["validation_error"]),
        "N": len(df),
    }

    # NOTE(review): adapt_delta=1 and max_treedepth=40 are extreme sampler
    # settings, presumably chosen to avoid divergences -- confirm they are
    # intended, since they are left exactly as the original run used them.
    stan_fit = stan_model.sampling(
        data=data_dict,
        iter=20000,
        control={'adapt_delta': 1, 'max_treedepth': 40},
    )

    # Turn the fit summary into a labelled DataFrame.
    s = stan_fit.summary()
    summary = pd.DataFrame(
        s['summary'],
        columns=s['summary_colnames'],
        index=s['summary_rownames'],
    )
    # Condensed view: the first 10 rows plus the 9 rows preceding the last
    # one (the `-1` stop leaves the final row out).
    summary_head = pd.concat([summary.head(10), summary.iloc[-10:-1]]).copy()
    display(summary_head)

    # Trace plots for the main model parameters and derived quantities.
    arviz.plot_trace(
        stan_fit,
        [
            'intercept',
            'bp',
            'bp_umis',
            'before_variance',
            'after_variance',
            'cell_slope_difference',
            'cell_after_over_before',
            'cell_before_over_after',
            'umi_slope_difference',
            'umi_after_over_before',
            'umi_before_over_after',
        ],
    )

    full_stan_results = stan_fit.to_dataframe()
    plt.show()

    # Not used further in this cell; kept because later cells may rely on
    # the values left over from the last dataset.
    summary_text = str(summary_head.round(3))
    extracted = stan_fit.extract()

    # Persist the full set of draws and the complete summary table.
    full_stan_results.to_csv(results_folder + 'full_stan_2predictors_' + dataset + '.csv')
    summary.to_csv(results_folder + 'summary_stan_2predictors_' + dataset + '.csv')