In [None]:
# Run parameters for this report: API token, data-restriction filter and API host.
# NOTE(review): the API token is committed in plain text — consider loading it
# from an environment variable or a secrets store instead.
token='70994ae1-ecea-4d90-a125-1fb491ed4742'
# JSON filter later passed to Peta.set_data_restriction_from_json_string and filter_description.
# NOTE(review): the key "exadEnd" looks like a typo next to "exacStart" — confirm against the API schema.
json_str='{"studyIds": ["mixed_tianhua_DX0503_2020",  "tianhua_system_DX0503"], "pageSize": 999999, "pageIndex": 1, "attributesRangeFilters": [], "attributesEqualFilters": [], "mutationFilter": {"hugoGeneSymbols": [], "exacStart": 0, "exadEnd": 1, "vabundStart": 0, "vabundEnd": 1, "variantSource": [], "variantType": [], "variantClass": [], "sequencer": [], "sequencerSource": [], "searchStr": ""}}'
# Host URL handed to the Peta client.
host='https://peta.bgi.com/api'

In [None]:
import pandas as pd
from pypeta import Peta
import plotly.express as px
import json
import numpy as np

In [None]:
# Fetch the clinical and mutation tables for the configured studies.
# Best-effort: on failure only a message is printed, so downstream cells
# will find `cli` / `mut` undefined and print their own fallback message.
try:
    peta = Peta(token=token, host=host)

    peta.set_data_restriction_from_json_string(json_str)

    cli = peta.fetch_clinical_data()

    mut = peta.fetch_mutation_data()
except Exception:
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still propagate.
    print('Failed to fetch data.')

辅助函数，后续将加入到 pypeta 包中。

In [None]:
def is_float(string:str=''):
    '''Return True if ``string`` can be converted with ``float()``, else False.

    Non-string inputs are accepted as well: anything ``float()`` rejects
    (e.g. ``None``) yields False instead of raising.
    '''
    try:
        float(string)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; interrupts and other
        # unrelated errors are no longer swallowed.
        return False
    return True
        
        

In [None]:
# This function will be moved into pypeta later.
def restrict_series_value_counts_to_designated_records(ser:pd.Series,limit:int = 20):
    '''Collapse a value-counts Series to at most ``limit`` rows.

    When ``ser`` has more than ``limit`` entries, the first ``limit - 1``
    entries (by position) are kept and the remainder is summed into a single
    row labelled ``'Others'``. Otherwise ``ser`` is returned unchanged.

    Parameters:
        ser: counts indexed by category; rows are taken in their given order.
        limit: maximum number of rows in the result, must be >= 1.

    Returns:
        DataFrame with a fresh integer index and the columns '类别' (category)
        and '数量' (count).

    Raises:
        ValueError: if ``limit`` is smaller than 1.

    TODO: the label of the combined row and the column labels still need a
    decision on Chinese vs. English naming.
    '''
    if limit < 1:
        raise ValueError('limit must be a positive integer')

    if len(ser) > limit:
        keep = limit - 1
        # .iloc keeps the split positional even when the index holds numbers
        # (plain [] slicing is label-based on a float index).
        others = pd.Series(ser.iloc[keep:].sum(), index=['Others'])
        ser = pd.concat([ser.iloc[:keep], others])

    # Build the frame explicitly so the result does not depend on the Series /
    # index names (reset_index fails when both carry the same name).
    return pd.DataFrame({'类别': ser.index, '数量': ser.to_numpy()})
        
        

In [None]:
# Positive-rate helper for an array of values; to be added to pypeta.
def positive_rate(values:list,positive_tags:list):
    '''Compute the share of positive results among the non-missing values.

    Parameters:
        values: iterable of test results; NaN / None / NaT entries count as missing.
        positive_tags: values that are considered a positive result.

    Returns:
        Tuple ``(total, effective, rate)`` where ``total`` is the number of
        values, ``effective`` the number of non-missing values and ``rate``
        the fraction of effective values found in ``positive_tags``
        (0 when there are no effective values).
    '''
    def _is_missing(value):
        # pd.isna returns an array for list-like input; only scalar results
        # can mark a single value as missing.
        flag = pd.isna(value)
        return bool(flag) if isinstance(flag, (bool, np.bool_)) else False

    values = list(values)
    total_value_num = len(values)

    # list.count(np.nan) only matches the very same NaN object, so NaNs coming
    # from float Series were not recognised as missing; test each value instead.
    effective_values = [value for value in values if not _is_missing(value)]
    effective_value_num = len(effective_values)

    tags = list(positive_tags)
    # Membership test: each value is counted at most once, even if a tag is repeated.
    positive_event_num = sum(1 for value in effective_values if value in tags)

    rate = 0 if effective_value_num == 0 else positive_event_num / effective_value_num

    return (total_value_num, effective_value_num, rate)
    
    
        

In [None]:
# To be moved into pypeta.
def mut_freq_per_gene(maf_df: pd.DataFrame,
                      cnv_df: pd.DataFrame = False,
                      sv_df: pd.DataFrame = False):
    '''Compute, for every gene, the fraction of samples carrying a mutation in it.

    Parameters:
        maf_df: mutation table with the columns 'Tumor_Sample_Barcode' and
            'Hugo_Symbol'.
        cnv_df: optional CNV table; accepted but not used yet.
        sv_df: optional SV table; accepted but not used yet.

    Returns:
        Series indexed by gene symbol holding (number of distinct samples with
        the gene) / (number of distinct samples in ``maf_df``), as ordered by
        ``value_counts``.

    Raises:
        ValueError: if ``maf_df`` contains no samples.
    '''
    # rename() returns a new frame, so the caller's table (and the slice taken
    # from it) is never modified.
    mut_df = maf_df[['Tumor_Sample_Barcode', 'Hugo_Symbol']].rename(
        columns={'Tumor_Sample_Barcode': 'Sample_ID'})

    # `if cnv_df:` raises on a real DataFrame (ambiguous truth value); test the
    # type instead. Both branches are placeholders for future support.
    if isinstance(cnv_df, pd.DataFrame):
        pass

    if isinstance(sv_df, pd.DataFrame):
        pass

    # The denominator is taken before dropna(), so samples whose rows lack a
    # gene symbol still count as samples.
    samples_num = len(mut_df['Sample_ID'].drop_duplicates())
    if samples_num == 0:
        raise ValueError('maf_df contains no samples')

    gene_counts = mut_df.dropna().drop_duplicates()['Hugo_Symbol'].value_counts()
    return gene_counts / samples_num

In [None]:
# Intended to become a method of the Peta class in pypeta.
def filter_description(json_str:str):
    '''Print a human-readable summary of the data-restriction filter.

    Parameters:
        json_str: JSON text with the keys 'studyIds', 'attributesRangeFilters'
            and 'attributesEqualFilters'.

    Prints the comma-separated study ids and, when either filter list is
    non-empty, a second line listing the filter entries.
    '''
    filters = json.loads(json_str)

    print('选取的研究数据集包括', end='')
    print(','.join(str(study_id) for study_id in filters['studyIds']) + '。')

    range_filters = filters['attributesRangeFilters']
    equal_filters = filters['attributesEqualFilters']
    if not (range_filters or equal_filters):
        return

    pieces = ['样本过滤条件为', *range_filters, *equal_filters]
    print(','.join(str(piece) for piece in pieces))

# 肿瘤个体化诊疗基因检测统计

## 样本选取条件

In [None]:
# Describe the sample-selection filter; best-effort, falls back to a message.
try:
    filter_description(json_str)
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

## 送检样本量

In [None]:
# Report the total number of rows in the clinical table.
try:
    print(f'样本总量为{len(cli)}例。')
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Pie chart of sample counts per cancer type; also defines
# `reserved_cancer_type`, the cancer types shown individually, which later
# cells use to restrict their per-cancer-type statistics.
try:
    cancer_type_counts = cli.CANCER_TYPE.value_counts()
    vc = restrict_series_value_counts_to_designated_records(cancer_type_counts)

    # The last row is the aggregated 'Others' bucket only when the helper had
    # to truncate; otherwise it is a real cancer type and must be kept.
    if len(cancer_type_counts) > len(vc):
        reserved_cancer_type = list(vc.iloc[:-1, 0])
    else:
        reserved_cancer_type = list(vc.iloc[:, 0])

    print('分癌种的样本量为：')
    fig = px.pie(vc, values='数量', names='类别')
    fig.update_traces(texttemplate="%{label}: %{value} <br>%{percent}")
    fig.update_layout(showlegend=False)
    fig.show()

except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

## 药物检测阳性率

In [None]:
# Determine whether the data set supports drug positive-rate statistics,
# i.e. whether the clinical table has a GENETIC_TEST_RESULT column.
try:
    # Set the flag first so it is defined (False) even if the check below fails.
    support_for_drug_sensitivity = False
    support_for_drug_sensitivity = 'GENETIC_TEST_RESULT' in cli.columns
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Overall drug-test positive rate across all samples.
try:
    if support_for_drug_sensitivity:
        total_num, effective_num, rate = positive_rate(cli.GENETIC_TEST_RESULT, ['阳性'])

        print(f'总例数为{total_num}，其中有效{effective_num}例，阳性率为{rate:8.2%}。')
    else:
        print("数据不支持该选项。")
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Bar chart of the drug-test positive rate for each reserved cancer type.
try:
    positive_rate_by_cancer = (
        cli['GENETIC_TEST_RESULT']
        .groupby(cli['CANCER_TYPE'])
        .apply(lambda results: positive_rate(results, ['阳性'])[2])
        .loc[reserved_cancer_type]
        .sort_values()
    )

    xdf = pd.DataFrame(positive_rate_by_cancer).reset_index()

    print('各个癌种的药物阳性率为：')
    fig = px.bar(xdf, x='CANCER_TYPE', y='GENETIC_TEST_RESULT', text='GENETIC_TEST_RESULT', labels={
                         'CANCER_TYPE': '癌种',
                         'GENETIC_TEST_RESULT': '药物检出阳性率',
                     })
    # Plotly text templates use d3-format: '.2%' renders a fraction as a
    # percentage with two decimals (printf-style '%.2f%%' is not a valid spec).
    fig.update_traces(texttemplate='%{text:.2%}', textposition='outside')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

## 基因检出率 

In [None]:
# Bar chart of the mutation frequency of the first 20 genes returned by
# mut_freq_per_gene.
try:
    mut_freq_per_gene_df = mut_freq_per_gene(mut).reset_index()

    mut_freq_per_gene_df.columns = pd.Index(['基因', '频率'])
    print('各基因的检出率为：')
    fig = px.bar(mut_freq_per_gene_df.head(20), x='基因', y='频率', text='频率')
    # Plotly text templates use d3-format: '.2%' renders a fraction as a
    # percentage with two decimals (printf-style '%.2f%%' is not a valid spec).
    fig.update_traces(texttemplate='%{text:.2%}', textposition='outside')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

## 基因突变类型检出率

In [None]:
# Placeholder: this section has no computation yet and only prints a notice.
print('暂未支持。')

## TMB分布 

In [None]:
# Build the TMB working table (`tmb_tmp_df`) and the per-cancer-type summary
# statistics (`tmb_describe_df`) used by the following cells.
try:
    tmb_tmp_df = cli[['CANCER_TYPE', 'TMB']]

    tmb_tmp_df = tmb_tmp_df.dropna(subset=['TMB'])

    # Keep only rows whose TMB value can be parsed as a number.
    tmb_tmp_df = tmb_tmp_df[tmb_tmp_df.TMB.map(is_float)]

    # assign() replaces the column, so it really becomes float; writing through
    # `.loc[:, 'TMB'] = ...` can keep the old object dtype, which makes
    # describe() return count/unique/top/freq instead of numeric statistics.
    tmb_tmp_df = tmb_tmp_df.assign(TMB=tmb_tmp_df.TMB.astype('float'))

    tmb_describe_df = tmb_tmp_df.TMB.groupby(tmb_tmp_df.CANCER_TYPE).describe().T

    # A reserved cancer type may have no usable TMB record at all; select only
    # the ones present instead of failing with a KeyError.
    present_cancer_types = [cancer_type for cancer_type in reserved_cancer_type
                            if cancer_type in tmb_describe_df.columns]
    tmb_describe_df = tmb_describe_df[present_cancer_types].copy()

    tmb_describe_df['全部'] = tmb_tmp_df.TMB.describe()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Report how many samples have a usable TMB record.
try:
    print(f'存在TMB记录的样本共{len(tmb_tmp_df)}例。')
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Print the TMB summary statistics, one row per cancer type plus the overall row.
try:
    print('全部样本及样本量前20的分癌种样本TMB统计值如下：')
    # Widen the text display so the statistics columns are not wrapped.
    pd.set_option('display.width', 1000)
    print(tmb_describe_df.T)
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')



In [None]:
# Histogram of TMB over all samples with a parseable TMB value.
try:
    # Convert to float after filtering: values that are still strings would be
    # binned as categories instead of along a numeric axis.
    total_tmp_tdf = cli.TMB[cli.TMB.map(is_float)].astype('float').reset_index()

    print('全部样本的TMB分布如下:')
    fig = px.histogram(total_tmp_tdf, x='TMB', labels={'x': 'TMB', 'y': '百分比'}, histnorm='probability density', marginal="rug")
    fig.show()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Violin plot of TMB for each reserved cancer type.
try:
    # Work on a separate, renamed copy: overwriting `tmb_tmp_df` and its column
    # names made this cell fail when executed a second time.
    tmb_violin_df = (
        tmb_tmp_df[tmb_tmp_df.CANCER_TYPE.isin(reserved_cancer_type)]
        .rename(columns={'CANCER_TYPE': '癌症类型'})
    )

    print('分癌种的TMB分布如下：')
    fig = px.violin(tmb_violin_df, x='癌症类型', y="TMB", box=True,  # draw box plot inside the violin
                    points='all',  # can be 'outliers', or False
                    )
    fig.show()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

## MSI分布

In [None]:
# Build the MSI working table (`msi_tmp_df`) and report how many samples
# have an MSI status recorded.
try:
    msi_tmp_df = cli[['CANCER_TYPE', 'MSI_STATUS']]

    msi_tmp_df = msi_tmp_df.dropna(subset=['MSI_STATUS'])

    print(f'存在MSI记录的样本共{len(msi_tmp_df)}例。')
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Pie chart of the MSI status distribution over all samples with a record.
try:
    print('其分布如下：')

    msi_counts = msi_tmp_df.MSI_STATUS.value_counts()
    # Name the columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas versions.
    msi_counts_df = pd.DataFrame({'MSI_STATUS': msi_counts.index,
                                  'COUNT': msi_counts.to_numpy()})

    fig = px.pie(msi_counts_df, values='COUNT', names='MSI_STATUS')
    fig.update_traces(texttemplate="%{label}: %{value} <br>%{percent}")
    fig.update_layout(showlegend=False)
    fig.show()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Sunburst chart of MSI status within each reserved cancer type.
try:
    # `msi_tmp_df` is narrowed to the reserved cancer types here; the
    # parallel-categories cell that follows plots this filtered table.
    msi_tmp_df = msi_tmp_df[msi_tmp_df.CANCER_TYPE.isin(reserved_cancer_type)]

    # Name the count column explicitly instead of relying on the name pandas
    # gives the value_counts() result, which differs between pandas versions.
    msi_by_cancer_df = (
        msi_tmp_df.MSI_STATUS
        .groupby(msi_tmp_df.CANCER_TYPE)
        .value_counts()
        .rename('COUNT')
        .reset_index()
    )

    print('分癌种的MSI分布如下：')
    fig = px.sunburst(msi_by_cancer_df, path=['CANCER_TYPE', 'MSI_STATUS'], values='COUNT')
    fig.update_traces(texttemplate="%{label}: %{value} ")
    fig.update_layout(showlegend=False)
    fig.show()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')

In [None]:
# Parallel-categories plot of cancer type against MSI status.
try:
    # Rename on a copy: relabelling `msi_tmp_df` in place broke the earlier MSI
    # cells (which read `.CANCER_TYPE` / `.MSI_STATUS`) when they were re-run.
    msi_parallel_df = msi_tmp_df.rename(columns={'CANCER_TYPE': '癌症类型',
                                                 'MSI_STATUS': 'MSI状态'})

    fig = px.parallel_categories(msi_parallel_df)
    fig.show()
except Exception:
    # `Exception` rather than a bare except, so kernel interrupts are not swallowed.
    print('Data selected don`t support this calculation.')