In [2]:
import numpy as np
import pandas as pd
import os
import sys

from matplotlib import pyplot as plt
import seaborn as sns
import venn

In [16]:
# create dir for figures, and load data
fig_dir="./Figures"
os.makedirs(fig_dir,exist_ok=True)
search_data = pd.read_csv("../data/search_data.csv") # load search data
registry_data = pd.read_csv("../data/registry_data.csv") # load registry data (not public avaiable due to data protection protocol)

In [17]:
# Merge the search and registry data
year_data = search_data.merge(registry_data,on=['disease','year'],how='outer')

In [32]:
# statistics

# overall statistics
display(year_data.groupby('year').agg({'search_estimated_patients':'sum','registry_patients':'sum'}))

# Calculate and display the diseases with search patients > registry patients for all 4 years and vice versa
Search_more = year_data.loc[year_data.registry_patients<year_data.search_estimated_patients].disease.value_counts()
Search_more_disease = Search_more.loc[Search_more==4].index
Search_less = year_data.loc[year_data.registry_patients>year_data.search_estimated_patients].disease.value_counts()
Search_less_disease = Search_less.loc[Search_less==4].index
print("Search more than registry: %d, Registry more than search: %d"%(len(Search_more_disease),
                                                                      len(Search_less_disease)))

Unnamed: 0_level_0,search_estimated_patients,registry_patients
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,103438,28610
2017,128521,34360
2018,117431,41993
2019,138002,48264


Search more than registry: 46, Registry more than search: 37


In [39]:
# Top 10 ranking diseases for search and registry data, respectively
display(year_data.loc[year_data.search_rank<=10].sort_values(by=['year','search_rank'])[['disease','search_rank']])
display(year_data.loc[year_data.registry_rank<=10].sort_values(by=['year','registry_rank'])[['disease','registry_rank']])

Unnamed: 0,disease,search_rank
0,白化病,1
1,成骨不全症（脆骨病）,2
2,血友病,3
3,全身型重症肌无力,4
4,肌萎缩侧索硬化,5
5,系统性硬化症,6
6,苯丙酮尿症,7
7,多发性硬化,8
8,肝豆状核变性,9
9,视网膜色素变性,10


Unnamed: 0,disease,registry_rank
2,血友病,1
21,视神经脊髓炎,2
7,多发性硬化,3
5,系统性硬化症,4
25,多系统萎缩,5
3,全身型重症肌无力,6
8,肝豆状核变性,7
17,特发性肺动脉高压,8
19,自身免疫性脑炎,9
4,肌萎缩侧索硬化,10


In [47]:
# group registry and search rank into groups (20 diseases a group)
year_data["registry_rank_group"] = (year_data["registry_rank"]/20).astype(int)
year_data.loc[year_data.registry_rank_group==6,"registry_rank_group"]=5
year_data["search_rank_group"] = (year_data["search_rank"]/20).astype(int)
year_data.loc[year_data.search_rank_group==6,"search_rank_group"]=5

In [48]:
from scipy import stats
for i in range(2016,2020):
    a = year_data.loc[year_data.year==i].groupby("search_rank_group").registry_rank.mean().reset_index().sort_values(
        by="search_rank_group").registry_rank.tolist()
    print(stats.pearsonr(range(len(a)),a))

(0.9569475230416531, 0.002740374435835146)
(0.9538992244805171, 0.003138933692492399)
(0.9818834172185176, 0.0004893428305216745)
(0.9807297113574754, 0.0005534380832817096)


In [None]:
plt.rcParams['font.sans-serif'] = 'times new roman' 
plt.rcParams['axes.unicode_minus'] = False 
for i in range(2016,2020):
    plt.figure(figsize=(10,10))
    ax = plt.subplot()
    a = year_data.loc[year_data.year==i].groupby("search_20level").confirm_rank.mean().reset_index().sort_values(
        by="search_20level").confirm_rank.tolist()
    ax.plot(a,marker='*',color='black')
    ax.set_title("Year %d"%(i),fontsize=26,y=0.95)
    x_labels = []
    for j in range(6):
        x_labels.append("%d-%d"%(j*20+1,j*20+20 if j<5 else 120))
    ax.set_xticks(range(6))
    ax.set_xticklabels(x_labels,fontsize=18)
    ax.set_xlabel("Search Ranking",fontsize=24)
    ax.set_ylabel("Average Registry Ranking",fontsize=24)
    ax.set_ylim(0,125)
    ax.set_yticks(np.arange(0,130,20))
    ax.set_yticklabels(np.arange(0,130,20).round(0),fontsize=16)
    plt.savefig(os.path.join(fig_dir,"Ranking_20level_year%d_searchx.png"%(i)))