# GRPM MeSH Screening 

This notebook is engineered to screen the previously retrieved genetic polymorphism data using selected MeSH terms. It works with MeSH sets that are used as hooks to retrieve subsets of genes and polymorphisms from the "GRPM ds" dataset.

In [None]:
#Only for Google Colab
import os
import sys

# @markdown Run in Colab virtual machine by default

# @markdown to run in google drive set:
import_mydrive = False #@param {type:"boolean"}

if 'google.colab' in sys.modules:
    if import_mydrive:
        from google.colab import drive
        drive.mount('/content/drive')
        if os.path.exists('/content/drive/MyDrive/grpm_system/'):
            %cd /content/drive/MyDrive/grpm_system/
        else:
            %mkdir /content/drive/MyDrive/grpm_system/
            %cd /content/drive/MyDrive/grpm_system/
    else:
        if os.path.exists('/content/grpm_system/'):
            %cd /content/grpm_system/
        else:
            %mkdir /content/grpm_system/
            %cd /content/grpm_system/

current_directory = os.getcwd()
print("Current working directory:", current_directory)

# Import Packages

In [113]:
import os
import io
import glob
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import requests
import zipfile

def simple_bool(message):
    choose = input(message+" (y/n): ").lower()
    your_bool = choose in ["y", "yes","yea","sure"]
    return your_bool

def get_and_extract(file, dir = os.getcwd()):
    url = "https://zenodo.org/record/8205724/files/"+file+".zip?download=1"
    zip_file_name = file+".zip"
    extracted_folder_name = dir

    # Download the ZIP file
    response = requests.get(url)

    if response.status_code == 200:
        # Extract the ZIP contents
        with io.BytesIO(response.content) as zip_buffer:
            with zipfile.ZipFile(zip_buffer, 'r') as zip_ref:
                zip_ref.extractall(extracted_folder_name)
        print(f"ZIP file '{zip_file_name}' extracted to '{extracted_folder_name}' successfully.")
    else:
        print("Failed to download the ZIP file.")

# Get requirements

In [None]:
# Get GRPM Dataset from Zenodo Repository
#https://zenodo.org/record/8205724  DOI: 10.5281/zenodo.8205724

if simple_bool('Download pre-made GRPM-Dataset from Zenodo? (6.5 minutes in Colab)'):
    timea = datetime.now()
    get_and_extract('grpm_dataset')
    print('Download and extraction time ',datetime.now()-timea)

if simple_bool('Download pre-made ref-mesh-archive from Zenodo?'):
    timea = datetime.now()
    get_and_extract('ref-mesh-archive')
    print('Download and extraction time ',datetime.now()-timea)

## Import GRPM dataset (required)

In [114]:
#Load GRPM db Report-----------------------------------------

# choose database:
db_tag      = 'pcg'
# 'pcg'    = protein coding genes = grpm_db
# 'rna'    = rna genes            = grpm_db_rna
# 'pseudo' = pseudogenes          = grpm_db_pseudo

db_name = 'grpm_db_'+ db_tag
db_path = 'grpm_dataset/'+db_name

print('importing GRPM Dataset...')
#get gene list from grpm report
GRPM_report = pd.read_csv(db_path+'/GRPM_report.csv',index_col=0).transpose().reset_index().rename(columns={'index':'gene'})
grpm_genes_list = GRPM_report.gene.to_list()

#Import grpm data back-------------------------------------------
time_load_1 = datetime.now()

columns = ['gene', 'rsid', 'pmids', 'mesh']
dummy = pd.read_csv(db_path+'/grpm_table_output.csv', usecols=columns)

dummy['pmids'] = dummy['pmids'].astype(str) #convert pmid type in str
time_load_2 = datetime.now()
print('time load:',time_load_2-time_load_1)

importing GRPM Dataset...
time load: 0:00:17.575024


## Subset GRPM Dataset (optional)

In [124]:
subset_grpm = simple_bool('Do you want to use a custom gene list to subset GRPM Dataset?')
if subset_grpm:
    # import your custom gene list (.csv)
    file_csv = []
    for file in os.listdir():
        if file.endswith(".csv") or file.endswith(".tsv"):
            file_csv.append(file)

    filenum = input('import your custom gene list (.csv)\nselect file index: \n'+str(pd.Series(file_csv)))

    time1 = datetime.now()
    subset_genes = pd.read_csv(file_csv[int(filenum)])
    subset_genes = subset_genes[subset_genes.columns[int(input('select column index:\n'+ str(pd.Series(subset_genes.columns))))]].drop_duplicates().str.replace(' ','')
    subset_genes.to_list()

    # subsetting GRPM_report and dummy
    GRPM_report_subset = GRPM_report[GRPM_report['gene'].isin(subset_genes)]
    dummy_subset = dummy[dummy['gene'].isin(subset_genes)]
    print("You're using a GRPM Dataset partition\ntime subsetting:",time_load_2-time_load_1)
    display(GRPM_report_subset)

You're using a GRPM Dataset partition
time subsetting: 0:00:17.575024


Unnamed: 0,gene,ncbi_dbsnp,lit2_variant,lit2_variant_norsid,lit2_rsid,lit2_rsid_plus1,lit1_rsid,lit1_rsid_pmid_plus1,lit1_pmid,lit1_pmid_pmid_plus1,...,pubmed_mesh_qualifier_major,pubmed_mesh,rsid_pmid10,rsid_pmid50,rsid_pmid100,top10mesh_all,top10rsid_all,pubmed_runtime,total_runtime,time_stamp
614,PIGW,3129,110,16,90,29,6,3,5,3,...,62,48,0,0,0,"['Humans', 'Abnormalities, Multiple', 'Membran...","['rs1256773607', 'rs200024253', 'rs587777733',...",0:00:01,0:00:03,2023-03-14 12:21:23.181294
732,B3GALNT2,26349,239,12,216,109,13,6,10,8,...,100,76,0,0,0,"['Humans', 'Female', 'Polymorphism, Single Nuc...","['rs367543069', 'rs367543072', 'rs367543076', ...",0:00:01,0:00:03,2023-03-14 12:31:56.596504
968,FUT8,144817,527,36,485,262,17,3,16,8,...,184,120,0,0,0,"['Humans', 'Female', 'Fucosyltransferases', 'M...","['rs10483776', 'rs1297536872', 'rs1334593208',...",0:00:01,0:00:05,2023-03-14 11:18:23.197187
1139,ALG12,27161,703,6,687,315,19,8,20,13,...,212,146,0,0,0,"['Humans', 'Female', 'Male', 'Mannosyltransfer...","['rs1321', 'rs755892540', 'rs121907932', 'rs12...",0:00:01,0:00:05,2023-03-14 11:35:02.311363
1152,PIGN,62526,715,90,599,253,33,10,31,19,...,230,162,0,0,0,"['Humans', 'Female', 'Male', 'Phosphotransfera...","['rs587777186', 'rs587777187', 'rs397514475', ...",0:00:01,0:00:06,2023-03-14 11:36:09.997035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14670,POMGNT2,10366,173,13,155,60,6,1,7,4,...,77,56,0,0,0,"['Humans', 'Mutation', 'Glycosyltransferases',...","['rs387907300', 'rs387907299', 'rs488069', 'rs...",0:00:02,0:00:05,2023-03-22 10:15:08.009351
14741,PIGS,7166,60,7,52,18,3,1,4,2,...,62,49,0,0,0,"['Animals', 'Humans', 'Abnormalities, Multiple...","['rs114331597', 'rs1426262136', 'rs34669811']",0:00:01,0:00:04,2023-03-22 10:25:28.607256
15009,ALG6,26871,256,4,238,91,18,7,35,30,...,274,173,1,0,0,"['Humans', 'Congenital Disorders of Glycosylat...","['rs121908443', 'rs35383149', 'rs4630153', 'rs...",0:00:02,0:00:05,2023-03-22 11:35:48.006586
15050,B3GALT6,2796,107,6,85,43,20,11,11,8,...,111,75,0,0,0,"['Humans', 'Mutation', 'Child', 'Joint Instabi...","['rs397514717', 'rs397514720', 'rs397514721', ...",0:00:02,0:00:04,2023-03-22 11:40:18.574321


# Define context
    - gene list
    - survey directory
    - ref-mesh list

## Check avalable ref-MeSH lists

In [None]:
#Check avalable refs:
ref_path = "ref-mesh-archive/"  # Replace with the actual ref mesh path

#---------------------------------
#use random mesh list?
random_mesh = False
if random_mesh:
    ref_path = "ref-mesh-archive/random_lists/"
#---------------------------------

# Create a file path pattern to match CSV files
file_pattern = os.path.join(ref_path, "*.csv")

# Use glob to get a list of file paths matching the pattern
csv_files = glob.glob(file_pattern)

csv_files_name = []
# Print the list of CSV files
for file in csv_files:
    file_name = os.path.basename(file)
    csv_files_name.append(file_name)

pd.set_option('display.max_rows', 100)
print('Available reference mesh lists:')
csv_files_df = pd.Series(csv_files_name)

csv_file_tag = pd.DataFrame()
if not random_mesh:
    csv_file_tag = csv_files_df.str.extract(r'ref_mesh_(.*)\.csv', expand=False).dropna().reset_index(drop=True)
else:
    csv_file_tag = csv_files_df.str.extract(r'(.*)\.csv', expand=False).dropna().reset_index(drop=True)

csv_file_tag

## Set directory/import data

In [112]:
#------------------------------------------------------
# define directory folder path:
survey_path = 'grpm_surveys/' # keep default to use root path

# choose ref_mesh.csv tab:
topic_tag   = csv_file_tag[int(input('\Select index from available ref-mesh list:\n'+str(csv_file_tag)))]
add         = ''    # additional survey directory tag

#------------------------------------------------------

# (1) Create survey directory:
survey_path = survey_path+'grpm_random/' if random_mesh else survey_path
directory = survey_path + 'grpm_survey_' + db_tag + '_' + topic_tag + add
if not os.path.exists(directory):
    os.makedirs(directory)

# (2) Import Mesh-reference list:
ref_filename = "ref_mesh_" + topic_tag + ".csv" if not random_mesh else topic_tag + ".csv"
ref = pd.read_csv(ref_path + ref_filename, index_col=0)

if 'mesh' not in ref.columns:
    ref = ref.rename(columns={'Preferred Label': 'mesh'})

ref_mesh_n = ref.mesh.nunique()
ref_mesh_list = ref['mesh'].drop_duplicates()

# (3) Load saved checkpoint data or initialize dataframes:
if os.path.isfile(directory+'/grpmx_filtered_output.csv'):
    complete_df = pd.read_csv(directory+'/grpmx_filtered_output.csv',index_col=0)
else:
    complete_df = pd.DataFrame()

if os.path.isfile(directory+'/GRPMX_report.csv'):
    df_report_complete = pd.read_csv(directory+'/GRPMX_report.csv',index_col=0)
    restart = True
else:
    df_report_complete = pd.DataFrame()
    restart = False
#----------------------------------------------------------
print('\n', ref_mesh_list)


 0          Fatigue Syndrome, Chronic
1                 Migraine Disorders
2              Tension-Type Headache
3                 Headache Disorders
4                   Cluster Headache
                   ...              
396                      Anaphylaxis
400    Passive Cutaneous Anaphylaxis
401         Irritable Bowel Syndrome
402               Nutritional Status
403                     Diet Therapy
Name: mesh, Length: 145, dtype: object


In [None]:
# check checkpoint report:
df_report_complete.T

# Run Survey

In [None]:
#---------------------------------------------
# Edit saving options:
save_plot = False
checkpoint = 200 #save data each x genes

run_sample = False # set True just to run a test
num_sample = 10

partial_job = False # set True to set job end point
partial_job_finish = 10

exclude_top10 = False # for a faster job
#---------------------------------------------

time_start = datetime.now()

if restart:
    restart_from = len(df_report_complete.T)
    gene_start = restart_from
    print('search restarted from '+str(restart_from))
else:
    gene_start = 0

# Ddefine grpm subset
if subset_grpm:
    GRPM_report = GRPM_report_subset
    dummy = dummy_subset

# define gene list
import random
if run_sample:
    genes = random.sample(grpm_genes_list[:], num_sample)
else:
    if partial_job:
        genes = grpm_genes_list[gene_start: gene_start + partial_job_finish]
    else:
        genes = grpm_genes_list[gene_start:len(grpm_genes_list)]

for gene in genes:

    time_alpha = datetime.now()
    timestamp = time_alpha.strftime('%Y%m%d%H%M%S')

    if gene in dummy.gene.drop_duplicates().to_list():

        dummy_gene = dummy.loc[dummy['gene'] == gene]
        rsidpmid = dummy_gene[['rsid','pmids']].drop_duplicates().reset_index(drop=True)
        pmidmesh = dummy_gene[['pmids','mesh']].drop_duplicates()
        #dfmesh = dummy_gene[['pmids', 'mesh', 'qualifier', 'major']].drop_duplicates().reset_index(drop=True)

        #Filter pmid for rsid with pmid>1 [Deprecated]
        #   rsidpmid_count = rsidpmid.groupby('rsid').describe().reset_index()
        #   rsidpmid_count.columns = rsidpmid_count.columns.to_flat_index()
        #   new_column_names = ['rsid', 'pmid_count', 'pmid_unique','pmid_top','pmid_freq']
        #   rsidpmid_count.columns = new_column_names
        #   outless = rsidpmid_count[rsidpmid_count.pmid_unique>1]
        #   mask = rsidpmid['rsid'].isin(outless.rsid)
        #   rsidpmid_less = rsidpmid[mask]


        # Correlation on "pmidmesh.mesh"------------------------
        mask = pmidmesh['mesh'].isin(ref_mesh_list)
        dfmatch = pmidmesh[mask]
        mask_full = dummy_gene['mesh'].isin(ref_mesh_list)
        dfmatch_full = dummy_gene[mask_full]
        #report statistics:
        pmidmesh_before  =  pmidmesh.nunique()
        pmidmesh_after   =  dfmatch.nunique()
        interesting_pmid =  dfmatch.nunique()

        #pmidmask = rsidpmid['pmids'].isin(dfmatch.pmids) #mymask
        #rsidlast = rsidpmid[pmidmask]  # mask on rsidpmid
        #rsidlastlist = rsidlast.rsid.drop_duplicates()

        #report statistics
        lit1_rsid         = dummy_gene.rsid.nunique()
        #lit1_pmid_f      = rsidpmid_less.pmids.nunique()
        matching_rsid     = dfmatch_full['rsid'].nunique()
        dropped_rsid      = lit1_rsid - dfmatch_full['rsid'].nunique()
        starting_pmid     = pmidmesh['pmids'].nunique()
        starting_mesh     = pmidmesh['mesh'].nunique()
        starting_pmidmesh = len(pmidmesh)
        matching_pmids    = dfmatch.pmids.nunique()
        matching_mesh     = dfmatch.mesh.nunique()
        matching_pmidmesh = len(dfmatch)


        dfmatch_less_ = dfmatch_full[['pmids', 'rsid', 'mesh']].drop_duplicates()
        #interesting_rsid = dfmatch_less_.rsid.nunique()

        #------------------------
        if exclude_top10:
            matching_rsid_pmid10  = 'missing'
            matching_rsid_pmid100 = 'missing'
            top10rsid             = 'missing'
            top10mesh             = 'missing'
        else:
            #Analyze enrichment with "groupby.describe" method

            ## 1. groupby.describe analysis by [rsid]
            dfmatch_less_rsid = dfmatch_less_.groupby('rsid').describe().reset_index()
            dfmatch_less_rsid.columns = dfmatch_less_rsid.columns.to_flat_index()
            new_column_names = ['rsid', 'pmid-count', 'pmid-unique','pmid-top','pmid-freq','mesh-count', 'mesh-unique','mesh-top','mesh-freq']
            dfmatch_less_rsid.columns = new_column_names

            ### statistics:
            matching_rsid_pmid10 = len(dfmatch_less_rsid[dfmatch_less_rsid['pmid-unique']>10])
            matching_rsid_pmid100 = len(dfmatch_less_rsid[dfmatch_less_rsid['pmid-unique']>100])

            ### sorting, top10
            dfmatch_less_rsidless = dfmatch_less_rsid[['rsid','pmid-unique','mesh-unique']]
            dfmatch_less_rsidlesssort = dfmatch_less_rsidless.sort_values(by='pmid-unique', ascending= False).reset_index(drop=True)
            top10rsid = dfmatch_less_rsidlesssort['rsid'][:10].tolist()
            #------------------

            ## 2. groupby.describe analysis by [mesh]
            dfmatch_less_mesh = dfmatch_less_.groupby('mesh').describe().reset_index()
            dfmatch_less_mesh.columns = dfmatch_less_mesh.columns.to_flat_index()
            #to handle generate df.groupby.describe, convert Multicolumn to single column
            #https://datascientyst.com/flatten-multiindex-in-pandas/
            new_column_names = ['mesh', 'pmid-count', 'pmid-unique','pmid-top','pmid-freq','rsid-count', 'rsid-unique','rsid-top','rsid-freq']
            dfmatch_less_mesh.columns = new_column_names

            dfmatch_less_mesh_less = dfmatch_less_mesh[['mesh','pmid-unique','rsid-unique']]
            #dfmatch_less_mesh_lesssort = dfmatch_less_mesh_less.sort_values(by='pmid-unique',ascending=False).reset_index(drop=True)

            ### add frequency, top10
            samplepmid_count = len(dfmatch.pmids.drop_duplicates())
            dfmatch_less_mesh_less_frq = dfmatch_less_mesh_less.copy()
            mesh_frq = dfmatch_less_mesh_less_frq.loc[:,'pmid-unique'].astype(float)/samplepmid_count
            dfmatch_less_mesh_less_frq.loc[:,'mesh frequency'] = round(mesh_frq,3)#*100
            dfmatch_less_mesh_less_frqsort = dfmatch_less_mesh_less_frq.sort_values(by='pmid-unique',ascending=False).reset_index(drop=True)
            top10mesh = dfmatch_less_mesh_less_frqsort['mesh'][:10].tolist()
        #------------------

        if save_plot:
            # create a scatter plot
            x = dfmatch_less_mesh_less_frqsort['mesh'].head(30)
            y = dfmatch_less_mesh_less_frqsort['pmid-unique'].head(30)
            plt.figure(figsize=(5, 8))
            plt.title('Scatter Plot: '+gene+' pmid-mesh (filtered)', loc='center',pad=10)
            plt.scatter(y, x)
            plt.gca().invert_yaxis()
            #plt.subplots_adjust(left=0.3, right=0.9, bottom=0.3, top=0.9)
            #plt.xticks(rotation=90)
            plt.tick_params(axis='x', which='both', top=True, bottom=False, labeltop=True, labelbottom=False)
            plt.xlabel('pmid count', position=(0.5, 1.08))
            ax = plt.gca()
            ax.xaxis.set_label_position('top')
            #plt.show()
            plt.savefig(directory+'/'+gene+'_mesh_plot_'+timestamp+'_filtered.png',dpi=120, bbox_inches = "tight")
            plt.close()
        else:
            pass

        #STORE DATA----------------------------------------------------------------------
        #timestamp = time2.strftime('%Y%m%d%H%M%S')

        #screening results:
        dfmatch_less_['gene'] = gene
        complete_df = pd.concat([complete_df, dfmatch_less_])


        #REPORT------------------------------------------------------------------

        report = { 'reference_mesh': ref_mesh_n,
                   #'filtered_pmidmesh': pmidmesh_after,
                   #'interesting_pmid': interesting_pmid,
                   #'interesting_rsid': interesting_rsid,
                   'starting_pmidmesh': starting_pmidmesh,
                   'starting_pmid' : starting_pmid,
                   'starting_mesh': starting_mesh,
                   'starting_rsid': lit1_rsid,
                   'matching_pmidmesh': matching_pmidmesh,
                   'matching_pmids': matching_pmids,
                   'matching_mesh': matching_mesh,
                   'matching_rsid': matching_rsid,
                   'dropped_rsid': dropped_rsid,
                   'matching_mesh_ratio': round((matching_mesh/starting_mesh),3),
                   'matching_pmids_ratio': round((matching_pmids/starting_pmid),3),
                   'matching_pmidmesh_ratio': round((matching_pmidmesh/starting_pmidmesh),3),
                   'matching_rsid_ratio': round((matching_rsid/lit1_rsid),3),
                   'matching_rsid_pmid10': matching_rsid_pmid10,
                   'matching_rsid_pmid100': matching_rsid_pmid100,
                   'matching_top10mesh':str(top10mesh),
                   'matching_top10rsid':str(top10rsid),
                   }

        df_report = pd.DataFrame(report, index=[gene]).transpose()

        # SLOW STEP!----------------------
        # generate fist report.csv

        #if os.path.isfile(directory+'/GRPMX_report.csv'):
        #    df_report_complete = pd.read_csv(directory+'/GRPMX_report.csv', index_col=0)#).set_index('Unnamed: 0')
        #    df_report_complete = pd.concat([df_report_complete, df_report], axis=1)
        #    df_report_complete.to_csv(directory+'/GRPMX_report.csv')
        #else:
        #    df_report.to_csv(directory+'/GRPMX_report.csv') # solo la prima volta

        # FASTER ALT------------
        df_report_complete = pd.concat([df_report_complete, df_report], axis=1)

        time_omega = datetime.now()
        full_runtime = time_omega - time_alpha
        print((gene+'_runtime:').ljust(18)+ str(full_runtime).ljust(15), ' Genes processed:', genes.index(gene), 'on', len(genes))
        total_seconds = full_runtime.total_seconds()

        # save checkpoint----------------------
        if genes.index(gene) > 1 and genes.index(gene) % checkpoint == 0:
            complete_df = complete_df.reindex(columns=['gene','rsid', 'pmids', 'mesh'])
            complete_df.to_csv(directory+'/grpmx_filtered_output.csv')
            df_report_complete.to_csv(directory+'/GRPMX_report.csv')
            print("saved checkpoint")
        else:
            pass

    else:
        print(gene+' not present in DataBase')
        pass

# Save complete csv
complete_df = complete_df.reindex(columns=['gene','rsid', 'pmids', 'mesh'])
complete_df.to_csv(directory+'/grpmx_filtered_output.csv')

df_report_complete.to_csv(directory+'/GRPMX_report.csv')

# #Update gene values (remove previous gene entry)
GRPMX_report = pd.read_csv(directory+'/GRPMX_report.csv', index_col=0)
time_load_1 = datetime.now()
for gene in grpm_genes_list:
    if gene+'.1' in GRPMX_report.columns:
        GRPMX_report = GRPMX_report.drop(columns = gene)
        GRPMX_report = GRPMX_report.rename(columns={gene+'.1': gene})
        #print(genes.index(gene))
    else:
        pass
time_load_2 = datetime.now()
print(time_load_2 - time_load_1)
GRPMX_report.to_csv(directory+'/GRPMX_report.csv')

time_finish = datetime.now()
time_batch = time_finish - time_start

if os.path.isfile('run_time.txt'):
    with open('run_time.txt', 'a') as file:
        file.write(topic_tag+':\n\ttime batch: '+str(time_batch)+'\n\truntime/gene: '+str(time_batch/len(genes))+'\n\n')
else:
    with open('run_time.txt', 'w') as file:
        file.write(topic_tag+':\n\ttime batch: '+str(time_batch)+'\n\truntime/gene: '+str(time_batch/len(genes))+'\n\n')

print('time batch:',time_batch)
print('runtime/gene:', time_batch/len(genes))

In [None]:
# save checkpoint
complete_df = complete_df.reindex(columns=['gene','rsid', 'pmids', 'mesh'])
complete_df.to_csv(directory+'/grpmx_filtered_output.csv')

df_report_complete.to_csv(directory+'/GRPMX_report.csv')

restart_from = len(df_report_complete.T)
print('partial job, genes in survey '+topic_tag+' report:', restart_from)

In [None]:
df_report_complete.T

In [None]:
df_read = pd.read_csv(directory+'/grpmx_filtered_output.csv', index_col=0)#.gene.drop_duplicates()
#df_read.to_clipboard()
print('genes matching:', df_read.gene.nunique())
print('mesh matching:', df_read.mesh.nunique())
print('apply threshold in Analizer Module')
df_read

# Check results

In [None]:
# Visualize GRPMX_report.csv
GRPMX_report = pd.read_csv(directory+'/GRPMX_report.csv', index_col=0).transpose().reset_index().rename(columns={'index':'gene'})
GRPMX_report.gene.drop_duplicates().to_clipboard()
print('Genes matching:',len(GRPMX_report.gene.drop_duplicates()))

GRPMX_report[['reference_mesh', 'starting_pmidmesh', 'starting_pmid','starting_mesh','starting_rsid', 'matching_pmidmesh', 'matching_pmids', 'matching_mesh','matching_rsid', 'dropped_rsid']] = GRPMX_report[['reference_mesh', 'starting_pmidmesh', 'starting_pmid','starting_mesh','starting_rsid', 'matching_pmidmesh', 'matching_pmids', 'matching_mesh','matching_rsid', 'dropped_rsid']].astype(int)

GRPMX_report[['matching_mesh_ratio', 'matching_pmids_ratio','matching_pmidmesh_ratio', 'matching_rsid_ratio']] = GRPMX_report[['matching_mesh_ratio', 'matching_pmids_ratio','matching_pmidmesh_ratio','matching_rsid_ratio']].astype(float)

columns_to_keep = ['matching_pmids','matching_pmids_ratio','matching_mesh','matching_rsid']
GRPMX_report_less = GRPMX_report[columns_to_keep]

sorting_column = 'matching_pmids'
GRPMX_report_sort = GRPMX_report.sort_values(by=sorting_column, ascending=False)

columns_to_display = ['gene', 'matching_pmidmesh', 'matching_pmids',
                      'matching_mesh', 'matching_rsid', 'dropped_rsid', 'matching_mesh_ratio',
                      'matching_pmids_ratio', 'matching_pmidmesh_ratio',
                      'matching_rsid_ratio']
GRPMX_report_display = GRPMX_report[columns_to_display]
GRPMX_report_display

In [None]:
# Matching PMIDs in Database
GRPMX_report_sort = GRPMX_report.sort_values(by= 'matching_pmids',ascending=False)

x = GRPMX_report_sort.gene.iloc[:40]
y = GRPMX_report_sort['matching_pmids'].iloc[:40]
plt.figure(figsize=(5, len(x)*0.2))
plt.title('Matching PMIDs in Database', loc='center',pad=10)

plt.barh(x,y)
plt.gca().invert_yaxis()
plt.tick_params(axis='x', which='both', top=True, bottom=False, labeltop=True, labelbottom=False)
#plt.xlabel('pmid count', position=(0.5, 1.08))
plt.ylabel('genes')
plt.xlabel('matching pmid', position=(0.5, 1.08))
ax = plt.gca()
ax.xaxis.set_label_position('top')

plt.show()

In [None]:
# Add "interest value" to report:----------------------------------------------------------
max_match_pmids = int(GRPMX_report['matching_pmids'].max())
GRPMX_report_int = GRPMX_report
GRPMX_report_int['matching_pmids_score'] = round((GRPMX_report_int['matching_pmids']/max_match_pmids),3)

GRPMX_report_int['interest_value'] = round(GRPMX_report_int['matching_pmids_score'] * GRPMX_report_int['matching_pmids_ratio'],3)

GRPMX_report_int.set_index('gene').sort_values(by='interest_value')#.T

In [None]:
# Matching PMIDs in Database
GRPMX_report_sort = GRPMX_report.sort_values(by= 'matching_pmids_index',ascending=False)

x = GRPMX_report_sort.gene.iloc[:100]
y = GRPMX_report_sort['matching_pmids_index'].iloc[:100]
plt.figure(figsize=(5, len(x)*0.2))
plt.title('Matching PMIDs in Database', loc='center',pad=10)

plt.barh(x,y)
plt.gca().invert_yaxis()
plt.tick_params(axis='x', which='both', top=True, bottom=False, labeltop=True, labelbottom=False)
#plt.xlabel('pmid count', position=(0.5, 1.08))
plt.ylabel('genes')
plt.xlabel('matching pmid', position=(0.5, 1.08))
ax = plt.gca()
ax.xaxis.set_label_position('top')

plt.show()