Setup environment

In [1]:
!kgtk

usage: kgtk [options] command [ / command]*


In [2]:
# Parameters

# Folder on local machine where to create the output and temporary folders
output_path = "/Users/grantxie/Downloads/NIH"

# The names of the output and temporary folders
output_folder = "useful_wikidata_files"
temp_folder = "temp.useful_wikidata_files"

# The location of input files
wiki_root_folder = "/Users/grantxie/Downloads/NIH/"
claims_file = "claims.tsv.gz"
label_file = "labels.en.tsv.gz"
alias_file = "aliases.en.tsv.gz"
description_file = "descriptions.en.tsv.gz"
item_file = "claims.wikibase-item.tsv.gz"

label_all = "labels.tsv.gz"
alias_all = "aliases.tsv.gz"
description_all = "descriptions.tsv.gz"

# Location of the cache database for kypher
cache_path = "/Users/grantxie/Downloads/NIH/temp.useful_wikidata_files"

# Whether to delete the cache database
delete_database = False

# Whether to compute pagerank as it may not run on the laptop
compute_pagerank = True
languages = 'ru,es,zh-cn,de,it,nl,pl,fr,pt,sv'

In [3]:
languages = languages.split(',')

In [4]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

import altair as alt

## Set up environment and folders to store the files

- `OUT` folder where the output files go
- `TEMP` folder to keep temporary files , including the database
- `kgtk` shortcut to invoke the kgtk software
- `kypher` shortcut to invoke `kgtk query with the cache database
- `CLAIMS` the `all.tsv` file of wikidata that contains all edges except label/alias/description
- `LABELS` the file with the English labels
- `ITEMS` the wikibase-item file (currently does not include node1 that are properties so for now we need the net file
- `STORE` location of the cache file

In [5]:
if cache_path:
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_path)
else:
    os.environ['STORE'] = "{}/{}/wikidata.sqlite3.db".format(output_path, temp_folder)
os.environ['OUT'] = "{}/{}".format(output_path, output_folder)
os.environ['TEMP'] = "{}/{}".format(output_path, temp_folder)
os.environ['kgtk'] = "kgtk"
os.environ['kgtk'] = "kgtk --debug"
os.environ['kypher'] = "kgtk --debug query --graph-cache " + os.environ['STORE']
os.environ['CLAIMS'] = wiki_root_folder + claims_file
os.environ['LABELS'] = wiki_root_folder + label_file
os.environ['ALIASES'] = wiki_root_folder + alias_file
os.environ['DESCRIPTIONS'] = wiki_root_folder + description_file
os.environ['ITEMS'] = wiki_root_folder + item_file

Echo the variables to see if they are all set correctly

In [6]:
!echo $OUT
!echo $TEMP
!echo $kgtk
!echo $kypher
!echo $CLAIMS
!echo $LABELS
!echo $ALIASES
!echo $LABELS
!echo $DESCRIPTIONSa
!echo $STORE
!alias col="column -t -s $'\t' "

/Users/grantxie/Downloads/NIH/useful_wikidata_files
/Users/grantxie/Downloads/NIH/temp.useful_wikidata_files
kgtk --debug
kgtk --debug query --graph-cache /Users/grantxie/Downloads/NIH/temp.useful_wikidata_files/wikidata.sqlite3.db
/Users/grantxie/Downloads/NIH/claims.tsv.gz
/Users/grantxie/Downloads/NIH/labels.en.tsv.gz
/Users/grantxie/Downloads/NIH/aliases.en.tsv.gz
/Users/grantxie/Downloads/NIH/labels.en.tsv.gz

/Users/grantxie/Downloads/NIH/temp.useful_wikidata_files/wikidata.sqlite3.db


Go to the output directory and create the subfolders for the output files and the temporary files

In [7]:
cd $output_path

/Users/grantxie/Downloads/NIH


In [8]:
!mkdir -p $OUT
!mkdir -p $TEMP

Clean up the output and temp folders before we start

In [9]:
# !rm $OUT/*.tsv $OUT/*.tsv.gz
# !rm $TEMP/*.tsv $TEMP/*.tsv.gz

In [10]:
if delete_database:
    print("Deleteddatabase") 
    !rm $STORE

In [11]:
!ls -l $OUT
!ls $TEMP
!ls -l "$CLAIMS"
!ls -l "$LABELS"
!ls -l "$ALIASES"
!ls -l "$LABELS"
!ls -l "$DESCRIPTIONS"
!ls $STORE

total 7418824
-rw-r--r--  1 grantxie  staff        9025 Jul 10 14:52 Untitled.ipynb
-rw-r--r--@ 1 grantxie  staff     4125423 Jul  9 11:27 coinvestigators 2.tsv
-rw-r--r--@ 1 grantxie  staff     4125423 Jul  9 11:56 coinvestigators 3.tsv
-rw-r--r--@ 1 grantxie  staff     4125423 Jul  9 11:58 coinvestigators 4.tsv
-rw-r--r--@ 1 grantxie  staff     2511815 Jul  9 12:26 coinvestigators.compact 2.tsv
-rw-r--r--@ 1 grantxie  staff     2511815 Jul  9 11:23 coinvestigators.compact.tsv
-rw-r--r--@ 1 grantxie  staff      780726 Dec 14 01:16 coinvestigators.compact.tsv.gz
-rw-r--r--@ 1 grantxie  staff     4125423 Jul  9 11:24 coinvestigators.tsv
-rw-r--r--@ 1 grantxie  staff      891571 Dec 14 01:15 coinvestigators.tsv.gz
-rw-r--r--  1 grantxie  staff        4378 Jul  8 20:46 coinvestigators11.tsv.gz
-rw-r--r--@ 1 grantxie  staff       92690 Jul  8 21:51 nih_investigators_for_tl 2.tsv
-rw-r--r--@ 1 grantxie  staff       92690 Jul  8 21:51 nih_investigators_for_tl 3.tsv
-rw-r--r--@ 1 grantxie  st

In [18]:
!$kypher \
-i "$CLAIMS" --as claim \
--limit 10


[2021-12-16 02:33:02 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_17 AS graph_17_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	node2;wikidatatype
P10-P1628-32b85d-7927ece6-0	P10	P1628	"http://www.w3.org/2006/vcard/ns#Video"	
P10-P1628-acf60d-b8950832-0	P10	P1628	"https://schema.org/video"	
P10-P1629-Q34508-bcc39400-0	P10	P1629	Q34508	
P10-P1659-P1651-c4068028-0	P10	P1659	P1651	
P10-P1659-P18-5e4b9c4f-0	P10	P1659	P18	
P10-P1659-P4238-d21d1ac0-0	P10	P1659	P4238	
P10-P1659-P51-86aca4c5-0	P10	P1659	P51	
P10-P1855-Q15075950-7eff6d65-0	P10	P1855	Q15075950	wikibase-item
P10-P1855-Q69063653-c8cdb04c-0	P10	P1855	Q69063653	wikibase-item
P10-P1855-Q7378-555592a4-0	P10	P1855	Q7378	wikibase-item


In [19]:
!$kypher \
-i claims.time.tsv.gz --as time \
--limit 10


[2021-12-16 02:33:12 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_35 AS graph_35_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	rank	node2;wikidatatype
P1841-P580-cd3f49-ffd020d2-0	P1841	P580	^2016-01-01T00:00:00Z/9	normal	time
P2847-P2669-f68f44-b96afbe1-0	P2847	P2669	^2019-04-02T00:00:00Z/11	normal	time
P3284-P576-ca3d11-2cb041ab-0	P3284	P576	^2016-12-13T00:00:00Z/11	normal	time
P370-P571-b4f929-4d28c153-0	P370	P571	^2013-03-29T00:00:00Z/11	normal	time
P6107-P580-4ba06f-3f696898-0	P6107	P580	^2006-03-01T00:00:00Z/10	normal	time
Q100-P571-0cfff8-9f2ee581-0	Q100	P571	^1630-09-07T00:00:00Z/11	normal	time
Q1000-P571-3520e1-745d2068-0	Q1000	P571	^1960-01-01T00:00:00Z/9	normal	time
Q10000-P571-f72b16-4786d163-0	Q10000	P571	^2001-06-19T00:00:00Z/11	normal	time
Q1000000-P580-52965a-0db5897e-0	Q1000000	P580	^2010-01-25T00:00:00Z/11	normal	time
Q100000001-P571-9cf0d1-a58775fe-0	Q100000001	P57

In [12]:
!$kypher \
-i "$LABELS" --as label \
--limit 10

[2021-08-16 20:14:13 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_18 AS graph_18_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2
P10-label-en	P10	label	'video'@en
P1000-label-en	P1000	label	'record held'@en
P1001-label-en	P1001	label	'applies to jurisdiction'@en
P1002-label-en	P1002	label	'engine configuration'@en
P1003-label-en	P1003	label	'National Library of Romania ID'@en
P1004-label-en	P1004	label	'MusicBrainz place ID'@en
P1005-label-en	P1005	label	'Portuguese National Library ID'@en
P1006-label-en	P1006	label	'Nationale Thesaurus voor Auteurs ID'@en
P1007-label-en	P1007	label	'Lattes Platform number'@en
P101-label-en	P101	label	'field of work'@en


In [13]:
!$kypher \
-i "$ITEMS" --as item \
--limit 10

[2021-08-16 20:14:16 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_21 AS graph_21_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	rank	node2;wikidatatype
P10-P1629-Q34508-bcc39400-0	P10	P1629	Q34508	normal	wikibase-item
P10-P1855-Q15075950-7eff6d65-0	P10	P1855	Q15075950	normal	wikibase-item
P10-P1855-Q4504-a69d2c73-0	P10	P1855	Q4504	normal	wikibase-item
P10-P1855-Q69063653-c8cdb04c-0	P10	P1855	Q69063653	normal	wikibase-item
P10-P1855-Q7378-555592a4-0	P10	P1855	Q7378	normal	wikibase-item
P10-P2302-Q21502404-d012aef4-0	P10	P2302	Q21502404	normal	wikibase-item
P10-P2302-Q21510851-5224fe0b-0	P10	P2302	Q21510851	normal	wikibase-item
P10-P2302-Q21510852-dde2f0ce-0	P10	P2302	Q21510852	normal	wikibase-item
P10-P2302-Q52004125-d0288d06-0	P10	P2302	Q52004125	normal	wikibase-item
P10-P2302-Q53869507-974ce3b1-0	P10	P2302	Q53869507	normal	wikibase-item


In [28]:
!$kypher \
-i "/Users/grantxie/Downloads/claims.external-id.tsv.gz" --as exid\
--limit 10

[2021-09-07 01:21:31 sqlstore]: IMPORT graph directly into table graph_52 from /Users/grantxie/Downloads/claims.external-id.tsv.gz ...
[2021-09-07 01:38:39 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_52 AS graph_52_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	rank	node2;wikidatatype
P1005-P2264-155572-b048c795-0	P1005	P2264	"4031"	normal	external-id
P1005-P2264-3ae570-ebd28455-0	P1005	P2264	"4032"	normal	external-id
P1014-P2264-a23718-8fffd32b-0	P1014	P2264	"48"	normal	external-id
P1015-P2264-134fad-a7e86696-0	P1015	P2264	"1617"	normal	external-id
P1015-P2264-7e9c1a-b2f829b5-0	P1015	P2264	"564"	normal	external-id
P1021-P2264-340fcd-59428d77-0	P1021	P2264	"88"	normal	external-id
P1043-P2264-797922-58a7ad5f-0	P1043	P2264	"248"	normal	external-id
P1045-P2264-d509ce-e51b9649-0	P1045	P2264	"46"	normal	external-id
P1047-P2264-63c077-2c42ccf1-0	P1047	P2264	"962"	normal	external-id
P1047-P

## Output files for Tableau

In [12]:
!$kypher -i item -i author \
--match 'item: (c)-[:P31]->(:Q16917)' \
-o hospital.tsv

[2021-12-14 01:30:55 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_21 AS graph_21_c1
     WHERE graph_21_c1."label" = ?
        AND graph_21_c1."node2" = ?
  PARAS: ['P31', 'Q16917']
---------------------------------------------


In [14]:
!$kypher -i item -i coor -i label\
--match 'item: (c)-[]->(:Q16917), coor:(c)-[:P625]->(d), label:(c)-[]->(e)'\
--return 'c, d, e' \
-o name1.tsv

[2021-12-14 01:31:04 query]: SQL Translation:
---------------------------------------------
  SELECT graph_21_c1."node1", graph_22_c2."node2", graph_18_c3."node2"
     FROM graph_18 AS graph_18_c3
     INNER JOIN graph_21 AS graph_21_c1, graph_22 AS graph_22_c2
     ON graph_21_c1."node1" = graph_18_c3."node1"
        AND graph_21_c1."node1" = graph_22_c2."node1"
        AND graph_21_c1."node2" = ?
        AND graph_22_c2."label" = ?
  PARAS: ['Q16917', 'P625']
---------------------------------------------


In [15]:
ls = []
for i in range(0,50):
    ls.append('/Users/grantxie/Downloads/with_nils_author/split_' +str(i) +".csv")

In [16]:
df = pd.read_csv('/Users/grantxie/Downloads/with_nils_author/split_0.csv')
df.head()

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,lof_class_count_tf_idf_score,top5_class_count,lof_property_count_tf_idf_score,top5_property_count,context_property,context_similarity,context_property_similarity_q_node,context_score,siamese_prediction,rank
0,0,0,LINDLEY BARBEE,UNIVERSITY OF WASHINGTON|CHRISTINE MITRA KHOSR...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0,LINDLEY BARBEE,NIL,,,...,,,,,,,,0.0,0.0,
1,0,1,CHRISTINE MITRA KHOSROPOUR,UNIVERSITY OF WASHINGTON|LINDLEY BARBEE,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0,CHRISTINE MITRA KHOSROPOUR,NIL,,,...,,,,,,,,0.0,0.0,
2,0,2,MICHAEL J GALE,UNIVERSITY OF WASHINGTON|CAROLYN B COYNE;HUGO ...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0,MICHAEL J GALE,Q6831539,Michael Gale|Michael J Gale,"M. J Gale|Michael J. Gale|M. J. Gale|Gale, M. J.",...,0.51241,Q36180:0.031|Q482980:0.030|Q2500638:0.030|Q702...,0.111416,P106:0.034|P735:0.028|P21:0.023|P27:0.009|P646...,P108|Pcoauthor,0.4506$$|0.45$$,P108/0.4506$$|Pcoauthor/0.45$$,0.3632,1.0,1.0
3,0,3,DANIEL M. RATNER,UNIVERSITY OF WASHINGTON|PATRICK S. STAYTON;SH...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0,DANIEL M. RATNER,NIL,,,...,,,,,,,,0.0,0.0,
4,0,4,SHAWN J. SKERRETT,UNIVERSITY OF WASHINGTON|COURTNEY CRANE;DANIEL...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0,SHAWN J. SKERRETT,NIL,,,...,,,,,,,,0.0,0.0,


In [17]:
original = pd.read_csv('/Users/grantxie/Downloads/tl_person_coinvestigator_with_qnihid.tsv', sep='\t')

In [18]:
original.head()

Unnamed: 0,person_name,organization_name,coinvestigator_names,person_qnihid
0,LINDLEY BARBEE,UNIVERSITY OF WASHINGTON,CHRISTINE MITRA KHOSROPOUR,QNIHPER11081386
1,CHRISTINE MITRA KHOSROPOUR,UNIVERSITY OF WASHINGTON,LINDLEY BARBEE,QNIHPER12074317
2,MICHAEL J GALE,UNIVERSITY OF WASHINGTON,CAROLYN B COYNE;HUGO RAMON ROSEN;JOAN M GOVERM...,QNIHPER1940486
3,DANIEL M. RATNER,UNIVERSITY OF WASHINGTON,PATRICK S. STAYTON;SHAWN J. SKERRETT;TIMOTHY E...,QNIHPER6623689
4,SHAWN J. SKERRETT,UNIVERSITY OF WASHINGTON,COURTNEY CRANE;DANIEL M. RATNER;KATHIE ANNE WA...,QNIHPER1898012


In [19]:
arr = []
id_t = []
qid = []
label = []
kg = []
mem = []
graph = []
for file in ls:
    print(file)
    df = pd.read_csv(file)
    for i in range(0, len(df)):
        if df['row'][i] in arr:
            continue
        arr.append(df['row'][i])
        label.append(df.label[i])
        kg.append(df['kg_id'][i])
        id_t.append('')
        mem.append('label')
        graph.append('author')
        qid.append(original['person_qnihid'][int(df['row'][i])])

/Users/grantxie/Downloads/with_nils_author/split_0.csv
/Users/grantxie/Downloads/with_nils_author/split_1.csv
/Users/grantxie/Downloads/with_nils_author/split_2.csv
/Users/grantxie/Downloads/with_nils_author/split_3.csv
/Users/grantxie/Downloads/with_nils_author/split_4.csv
/Users/grantxie/Downloads/with_nils_author/split_5.csv
/Users/grantxie/Downloads/with_nils_author/split_6.csv
/Users/grantxie/Downloads/with_nils_author/split_7.csv
/Users/grantxie/Downloads/with_nils_author/split_8.csv
/Users/grantxie/Downloads/with_nils_author/split_9.csv
/Users/grantxie/Downloads/with_nils_author/split_10.csv
/Users/grantxie/Downloads/with_nils_author/split_11.csv
/Users/grantxie/Downloads/with_nils_author/split_12.csv
/Users/grantxie/Downloads/with_nils_author/split_13.csv
/Users/grantxie/Downloads/with_nils_author/split_14.csv
/Users/grantxie/Downloads/with_nils_author/split_15.csv
/Users/grantxie/Downloads/with_nils_author/split_16.csv
/Users/grantxie/Downloads/with_nils_author/split_17.csv
/U

In [20]:
new = pd.DataFrame(list(zip(id_t,qid,mem,kg,arr, label,graph)), columns = ['id', 'node1', 'label', 'node2', 'row', 'node2;label','graph'])

In [21]:
new

Unnamed: 0,id,node1,label,node2,row,node2;label,graph
0,,QNIHPER11081386,label,NIL,0,LINDLEY BARBEE,author
1,,QNIHPER12074317,label,NIL,1,CHRISTINE MITRA KHOSROPOUR,author
2,,QNIHPER1940486,label,Q6831539,2,MICHAEL J GALE,author
3,,QNIHPER6623689,label,NIL,3,DANIEL M. RATNER,author
4,,QNIHPER1898012,label,NIL,4,SHAWN J. SKERRETT,author
...,...,...,...,...,...,...,...
51563,,QNIHPER8107460,label,NIL,51564,RAINA N. FICHOROVA,author
51564,,QNIHPER2272576,label,NIL,51565,DOROTHEA DENISE JENKINS,author
51565,,QNIHPER16431128,label,NIL,51566,JACQUELINE GENOVESI,author
51566,,QNIHPER16505596,label,NIL,51567,JENNIFER JOVANOVIC,author


In [22]:
new = new[new['node2'] != 'NIL']

In [23]:
new = new.reset_index(drop = True)

In [24]:
new

Unnamed: 0,id,node1,label,node2,row,node2;label,graph
0,,QNIHPER1940486,label,Q6831539,2,MICHAEL J GALE,author
1,,QNIHPER2414340,label,Q87796696,5,PATRICK S. STAYTON,author
2,,QNIHPER11537130,label,Q89958961,8,KENNETH NGURE,author
3,,QNIHPER8820221,label,Q61467477,10,ANITHA PASUPATHY,author
4,,QNIHPER8491201,label,Q46001993,11,STEPHANIE M FULLERTON,author
...,...,...,...,...,...,...,...
21905,,QNIHPER1865473,label,Q88874640,51553,THERESA L WALUNAS,author
21906,,QNIHPER77800199,label,Q17232943,51555,HEATHER RYAN,author
21907,,QNIHPER16138148,label,Q59751913,51557,TAD SIMONS,author
21908,,QNIHPER6983267,label,Q88663895,51558,VERA P KRYMSKAYA,author


In [25]:
for i in range(0, len(new)):
    new['id'][i] = 'author' + str(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
new.to_csv('/Users/grantxie/Downloads/nig_kgtk.tsv', index = False, sep = '\t')

Output files

In [28]:
!$kypher \
-i "/Users/grantxie/Downloads/kgtk-master/use-cases/nig_kgtk.tsv" --as nih \
--limit 10

[2021-12-14 01:40:13 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_23 AS graph_23_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	row	graph
qnode0	Q219563	label	UNIVERSITY OF WASHINGTON	0	qnode
qnode1	Q5081831	label	CHARLES R. DREW UNIVERSITY OF MEDICAL & SCIENCE	1	qnode
qnode2	Q622664	label	UNIVERSITY OF CALIFORNIA, SAN DIEGO	2	qnode
qnode3	Q812573	label	BAYLOR UNIVERSITY	4	qnode
qnode4	Q49118	label	BOSTON COLLEGE	5	qnode
qnode5	Q6806451	label	MEDICAL UNIVERSITY OF SOUTH CAROLINA	6	qnode
qnode6	Q457281	label	UNIVERSITY OF ILLINOIS AT URBANA-CHAMPAIGN	7	qnode
qnode7	Q168751	label	DUKE UNIVERSITY	9	qnode
qnode8	Q349055	label	WAYNE STATE UNIVERSITY	10	qnode
qnode9	Q104480607	label	NORTHAMPTON VA MEDICAL CENTER	11	qnode


In [29]:
!$kypher \
-i "/Users/grantxie/Downloads/nih_projects/nih_project_new.id.tsv" --as project \
--limit 5

[2021-12-14 01:43:19 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_31 AS graph_31_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
id	node1	label	node2
QNIHPRO10286324-PNIHtitle-cfa7f1	QNIHPRO10286324	PNIHtitle	"CD38-TARGETED IMMUNOPET OF MYELOMA: PHASE 2 TRIAL OF CLINICAL APPLICATIONS"
QNIHPRO10286324-PNIHorg-17451a	QNIHPRO10286324	PNIHorg	"HOAG MEMORIAL HOSPITAL PRESBYTERIAN"
QNIHPRO10286324-PNIHcost-452920	QNIHPRO10286324	PNIHcost	598926
QNIHPRO10286324-PNIHsub-730793	QNIHPRO10286324	PNIHsub	"Antibodies; base; Biopsy; blind; Blood Tests; Bone marrow biopsy; burden of illness; cancer imaging; Cells; Clinical; clinical application; clinical care; Clinical Trials; clinically significant; Combined Modality Therapy; design; Detection; Detection of Minimal Residual Disease; Disease; dosimetry; early phase trial; experience; first-in-human; fluorodeoxyglucose positron emission tomography; Funding Opportunities; F

In [57]:
!$kypher \
-i "$OUT"/coinvestigators.compact.tsv.gz --as coinvestigator \
--limit 5


[2021-12-14 02:34:21 sqlstore]: DROP graph data table graph_27 from coinvestigator
[2021-12-14 02:34:21 sqlstore]: IMPORT graph directly into table graph_65 from /Users/grantxie/Downloads/NIH/useful_wikidata_files/coinvestigators.compact.tsv.gz ...
[2021-12-14 02:34:21 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_65 AS graph_65_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
node1	label	node2
QNIHPER10001049	Pcoinvestigator_names	BARBARA ANN CHURCH|EDUARDO MERCADO|JONATHAN D. RODGERS
QNIHPER10001128	Pcoinvestigator_names	MARK A. MC NIVEN
QNIHPER10001819	Pcoinvestigator_names	ABRAHAM NA SHAKED|BRIAN DONALD PIENING|JUHI KUMAR|KRZYSZTOF KIRYLUK|MALEK KAMOUN|MARIO C. DENG|MICHAEL P. SNYDER|SANDRA AMARAL
QNIHPER10003131	Pcoinvestigator_names	AARON PAUL BATISTA|BYRON M. YU
QNIHPER10003688	Pcoinvestigator_names	BOB WONG|JANICE MARGARET MORSE


In [43]:
!$kypher \
-i "author_hospital.id.tsv" --as hosp \
--limit 5



[2021-11-14 00:31:33 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_37 AS graph_37_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
id	node1	label	node2
QNIHPER10003954-member-Q39050124	QNIHPER10003954	member	Q39050124
QNIHPER10428590-member-Q625321	QNIHPER10428590	member	Q625321
QNIHPER10004092-member-Q5582923	QNIHPER10004092	member	Q5582923
QNIHPER12571233-member-Q28035413	QNIHPER12571233	member	Q28035413
QNIHPER2098890-member-Q50036737	QNIHPER2098890	member	Q50036737


In [31]:
!$kypher \
-i "/Users/grantxie/Downloads/claims.globe-coordinate.tsv" --as coor \
--limit 10

[2021-12-14 01:52:24 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_22 AS graph_22_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	rank	node2;wikidatatype
Q100-P625-2e35d9-cdfcaf0e-0	Q100	P625	@42.358333333333/-71.0625	normal	globe-coordinate
Q1000-P1332-b48e3a-0ed57897-0	Q1000	P1332	@2.32/11.7	normal	globe-coordinate
Q1000-P1333-3c3826-e820db87-0	Q1000	P1333	@-3.96005/11.15322	normal	globe-coordinate
Q1000-P1334-44067c-eebd06bd-0	Q1000	P1334	@-0.61746/14.5266	normal	globe-coordinate
Q1000-P1335-a8c32c-37c5bb67-0	Q1000	P1335	@-0.62444444/8.70805556	normal	globe-coordinate
Q1000-P625-fa52c4-ce6c2e1f-0	Q1000	P625	@-0.68333055555556/11.5	normal	globe-coordinate
Q100000-P625-58c345-14fbc9fe-0	Q100000	P625	@50.8283/5.7678	normal	globe-coordinate
Q100000001-P625-acd0bb-fb89d825-0	Q100000001	P625	@-35.19629722222222/149.14484166666668	normal	globe-coordinate
Q100000034-P625-e952d9-aea04e31-0	Q1

In [37]:
pwd

'/Users/grantxie/Downloads/NIH'

In [20]:
!$kypher -i author -i nih_author -i inv -i item -i coor -i label -i project -i hosp -i time -i quantities\
--match 'author: (cluster)-[]->(qnode), nih_author:(qnihauthor)-[]->(qnode), inv:(project)-[]->(qnihauthor), \
        item:(qnode)-[:P108]->(org_node), label:(org_node)-[]->(org_name),label:(qnode)-[]->(author_name), \
        coor:(org_node)-[:P625]->(organization_coor), hosp:(qnihauthor)-[]->(hosp_node),\
        label:(hosp_node)-[]->(hosp_name), coor:(hosp_node)-[:P625]->(hosp_coor),item:(paper)-[:P50]->(qnode),item:(paper)-[:P921]->(mai)'\
--opt 'time: (hosp_node)-[:P571]->(hosp_inc)' \
--opt 'time: (org_node)-[:P571]->(org_inc)' \
--opt 'quantities: (hosp_node)-[:P6801]->(count_beds)' \
--opt 'project: (project)-[:PNIHcost]->(cost)' \
--return 'cluster as cluster_id, org_name as org_name, sum(cost) as organization_award, \
        kgtk_geo_coords_lat(organization_coor) as org_lat, kgtk_geo_coords_long(organization_coor) as org_long,\
        kgtk_date_year(org_inc) as org_inc, \
        hosp_name as hosp_name, kgtk_geo_coords_lat(hosp_coor) as hosp_lat, kgtk_geo_coords_long(hosp_coor) as hosp_long,\
        kgtk_date_year(hosp_inc) as hosp_inc, count_beds as count_beds' \
-o bed_inception_1.tsv

[2021-12-16 02:34:09 query]: SQL Translation:
---------------------------------------------
  SELECT graph_19_c1."node1" "_aLias.cluster_id", graph_18_c5."node2" "_aLias.org_name", sum(graph_31_c16."node2") "_aLias.organization_award", kgtk_geo_coords_lat(graph_22_c7."node2") "_aLias.org_lat", kgtk_geo_coords_long(graph_22_c7."node2") "_aLias.org_long", kgtk_date_year(graph_35_c14."node2") "_aLias.org_inc", graph_18_c9."node2" "_aLias.hosp_name", kgtk_geo_coords_lat(graph_22_c10."node2") "_aLias.hosp_lat", kgtk_geo_coords_long(graph_22_c10."node2") "_aLias.hosp_long", kgtk_date_year(graph_35_c13."node2") "_aLias.hosp_inc", graph_38_c15."node2" "_aLias.count_beds"
     FROM graph_18 AS graph_18_c5
     INNER JOIN graph_18 AS graph_18_c6, graph_18 AS graph_18_c9, graph_19 AS graph_19_c1, graph_21 AS graph_21_c11, graph_21 AS graph_21_c12, graph_21 AS graph_21_c4, graph_22 AS graph_22_c10, graph_22 AS graph_22_c7, graph_30 AS graph_30_c2, graph_32 AS graph_32_c3, graph_37 AS graph_37_c8
 

In [41]:
df = pd.read_csv('bed_inception_1.tsv',  sep = '\t')
df['count_beds'].fillna(value=df['count_beds'].mean(), inplace=True)
df.to_csv('bed_incepition_imputed.tsv', index = False, sep = '\t')

In [42]:
df

Unnamed: 0,cluster_id,org_name,organization_award,org_lat,org_long,org_inc,hosp_name,hosp_lat,hosp_long,hosp_inc,count_beds
0,cluster_0_0_0_0_0_0_0_6_93,'Massachusetts Institute of Technology'@en,4658658.0,42.359820,-71.092110,1861.0,'Cambridge Health Alliance'@en,42.374800,-71.104800,1996.0,617.830986
1,cluster_0_0_0_0_0_0_0_6_93,'Whitehead Institute'@en,4658658.0,42.363132,-71.089476,1982.0,'Cambridge Health Alliance'@en,42.374800,-71.104800,1996.0,617.830986
2,cluster_0_0_0_0_0_0_39_106_193,'Vanderbilt University'@en,2175747.0,36.148649,-86.804972,1873.0,'Children’s Hospital at TriStar Centennial'@en,36.153508,-86.807237,,617.830986
3,cluster_0_0_0_0_0_14_19_25_126,"'University of California, San Diego'@en",,32.881000,-117.238000,1960.0,'Jacobs Medical Center'@en,32.877703,-117.226499,2016.0,364.000000
4,cluster_0_0_0_0_0_14_66_154_718,'Stanford University'@en,,37.428229,-122.168858,1891.0,'Stanford University Medical Center'@en,37.434000,-122.175000,1959.0,613.000000
...,...,...,...,...,...,...,...,...,...,...,...
294,cluster_0_1_4_7_9_16_129_368_1416,"'University of California, Berkeley'@en",8727072.0,37.870000,-122.259000,1868.0,'Children\'s Hospital Oakland'@en,37.837500,-122.267000,1912.0,191.000000
295,cluster_0_1_4_7_9_16_129_368_1416,'University of Washington'@en,8727072.0,47.654167,-122.308056,1861.0,'Children\'s Hospital Oakland'@en,37.837500,-122.267000,1912.0,191.000000
296,cluster_0_1_4_7_9_16_21_144_1142,'California Institute of Technology'@en,,34.137500,-118.125000,1891.0,'Huntington Hospital'@en,34.134500,-118.153000,1892.0,625.000000
297,cluster_0_1_4_7_9_16_21_144_1142,'Stanford University'@en,,37.428229,-122.168858,1891.0,'Huntington Hospital'@en,34.134500,-118.153000,1892.0,625.000000


In [43]:
df.to_excel('bed_incepition_imputed_1.xlsx', index = False)

In [27]:
df = pd.read_csv('nih_sum_cost.tsv', sep = '\t')
df['total_cost'].fillna(value=df['total_cost'].mean(), inplace=True)

In [32]:
df.to_excel('/Users/grantxie/Downloads/nih_sum_cost_by_org.xlsx', index = False)

In [20]:
!$kypher \
-i "/Users/grantxie/Downloads/claims.quantity.tsv.gz" --as quantities\
--limit 10

[2021-08-17 05:03:06 sqlstore]: IMPORT graph directly into table graph_38 from /Users/grantxie/Downloads/claims.quantity.tsv.gz ...
[2021-08-17 05:12:08 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_38 AS graph_38_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	rank	node2;wikidatatype
P1004-P4876-b5e617-af81a4c5-0	P1004	P4876	+37427	normal	quantity
P1014-P4876-605b34-fbb0b790-0	P1014	P4876	+53249	preferred	quantity
P1014-P4876-62c45c-db5d6620-0	P1014	P4876	+50825	normal	quantity
P1014-P4876-732e1a-9ea8b31a-0	P1014	P4876	+47267	normal	quantity
P1014-P4876-f7a212-293bd556-0	P1014	P4876	+46591	normal	quantity
P1022-P4876-fa0ca8-40e422f8-0	P1022	P4876	+541	normal	quantity
P1024-P4876-8fb399-9696d4ac-0	P1024	P4876	+857	normal	quantity
P1025-P4876-f0dc63-acd0fe33-0	P1025	P4876	+13000000	normal	quantity
P1042-P1114-27b781-00b6c31d-0	P1042	P1114	+1952404	normal	quantity
P1044-P4876-d72e5b-569d2

In [None]:
!$kypher -i item -i coor -i label\
--match 'item: (c)-[]->(:Q16917), coor:(c)-[:P625]->(d), label:(c)-[]->(e)'\
--force \
--return 'c, d, e' \
-o name1.tsv

In [34]:
!$kypher \
-i "/Users/grantxie/Downloads/nih_match_hospital.tsv" --as hospital\
--limit 10

[2021-08-17 04:22:37 sqlstore]: IMPORT graph directly into table graph_36 from /Users/grantxie/Downloads/nih_match_hospital.tsv ...
[2021-08-17 04:22:37 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_36 AS graph_36_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
node1	node2	coordinates	nearest_label	nearest_coordinates	distance
Q219563	UNIVERSITY OF WASHINGTON	@47.65416666666667/-122.30805555555555	'University of Washington Medical Center'@en	@47.6489232/-122.3065945	2.9628626020887572e-05
Q5081831	CHARLES R. DREW UNIVERSITY OF MEDICAL & SCIENCE	@33.925634/-118.242594	'Martin Luther King, Jr. Multi-Service Ambulatory Care Center'@en	@33.9238/-118.242	3.7163919999995717e-06
Q622664	UNIVERSITY OF CALIFORNIA, SAN DIEGO	@32.881/-117.238	'Jacobs Medical Center'@en	@32.877703/-117.226499	0.00014314320999992013
Q812573	BAYLOR UNIVERSITY	@31.5472/-97.1139	'Scott & White Memorial Hospital'@en	@31.07753/-97.36383	0.28

In [25]:
!$kypher \
-i "/Users/grantxie/Downloads/kgtk-master/use-cases/nih_author_kgtk.tsv" --as nih_author \
--limit 10

[2021-08-16 20:30:03 sqlstore]: IMPORT graph directly into table graph_30 from /Users/grantxie/Downloads/kgtk-master/use-cases/nih_author_kgtk.tsv ...
[2021-08-16 20:30:03 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_30 AS graph_30_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	row	node2;label	graph
author0	QNIHPER1940486	label	Q6831539	2	MICHAEL J GALE	author
author1	QNIHPER2414340	label	Q87796696	5	PATRICK S. STAYTON	author
author2	QNIHPER11537130	label	Q89958961	8	KENNETH NGURE	author
author3	QNIHPER8820221	label	Q61467477	10	ANITHA PASUPATHY	author
author4	QNIHPER8491201	label	Q46001993	11	STEPHANIE M FULLERTON	author
author5	QNIHPER7882742	label	Q89866816	13	MARION PEPPER	author
author6	QNIHPER8146532	label	Q89937644	14	HORACIO O DE LA IGLESIA	author
author7	QNIHPER10499986	label	Q56809498	17	KEVIN HYBISKE	author
author8	QNIHPER2315018	label	Q87887672	18	KARIN E. BORNFELDT	author

In [36]:
!$kypher -i item -i coor -i nih\
--match 'nih:(c)-[]->(d), coor:(c)-[:P625]->(e)'\
--return 'c, d, e' \
-o name_nih_new.tsv

[2021-12-14 01:55:06 query]: SQL Translation:
---------------------------------------------
  SELECT graph_23_c1."node1", graph_23_c1."node2", graph_22_c2."node2"
     FROM graph_22 AS graph_22_c2
     INNER JOIN graph_23 AS graph_23_c1
     ON graph_23_c1."node1" = graph_22_c2."node1"
        AND graph_22_c2."label" = ?
  PARAS: ['P625']
---------------------------------------------


## Calculating closest hospital for organizations (using kdtree)

In [13]:
df = pd.read_csv('name1.tsv', sep = '\t')

In [14]:
df

Unnamed: 0,node1,node2,node2.1
0,Q1000307,@48.7814/9.16432,'Diakonie-Klinikum Stuttgart'@en
1,Q1000479,@42.3373/-71.106,'Boston Children\'s Hospital'@en
2,Q100104907,@-0.9209444444444445/100.45713888888889,'Universitas Andalas Hospital'@en
3,Q100109831,@-0.9508333333333333/100.36755555555555,'BMC hospital'@en
4,Q100118787,@-0.870638888888889/100.38336111111111,'Siti Rahmah Islamic hospital'@en
...,...,...,...
13134,Q99755413,@-19.98027777777778/-43.94416666666667,'Hospital Vila da Serra'@en
13135,Q99762779,@41.66912/-87.81374,'Palos Community Hospital'@en
13136,Q99809368,@48.5721205/-68.2180277,'Centre hospitalier de La Mitis'@en
13137,Q99933648,@46.9526/7.44997,'Viktoriaspital'@en


In [38]:
x = []
y = []

for i in range(0, len(df)):
    x.append(df.node2[i][1:].split('/')[0])
    y.append(df.node2[i][1:].split('/')[1])

In [39]:
x_c = []
y_c = []

for i in range(0, len(df)):
    x_c.append(float(x[i]))
    y_c.append(float(y[i]))

In [40]:
import kdtree

In [41]:
emptyTree = kdtree.create(dimensions=2)

In [42]:
arr = []
for i in range(0, len(df)):
    arr.append((x_c[i], y_c[i]))


In [24]:
tree = kdtree.create(arr)

In [43]:
nih = pd.read_csv('/Users/grantxie/Downloads/cluster_coor.tsv', sep = '\t')

In [44]:
nih

Unnamed: 0,id,node1,label,node2,node2;coor,node2;label
0,,cluster_0_0_2_2_14_112_322,member,QNIHPER10003954,@41.504/-81.608,'Case Western Reserve University'@en
1,,cluster_0_0_2_2_14_112_322,member,QNIHPER10428590,@37.5625/126.945,'Ewha Womans University'@en
2,,cluster_0_0_3_6_21_40_1800,member,QNIHPER10004092,@44.564588/-123.275705,'Oregon State University'@en
3,,cluster_0_0_3_6_21_40_1238,member,QNIHPER12571233,@42.447222222222/-76.483055555556,'Cornell University'@en
4,,cluster_0_0_3_6_21_40_1238,member,QNIHPER2098890,@46.343224/-119.276333,'Pacific Northwest National Laboratory'@en
...,...,...,...,...,...,...
2377,,cluster_0_0_2_2_14_8_259,member,QNIHPER9634058,@35.908611111111/-79.049166666667,'University of North Carolina at Chapel Hill'@en
2378,,cluster_0_0_3_6_7_2_843,member,QNIHPER9728962,@38.907222222222/-77.072777777778,'Georgetown University'@en
2379,,cluster_0_0_3_6_7_2_2378,member,QNIHPER9883306,@38.912/-77.077,'Georgetown University Medical Center'@en
2380,,cluster_0_0_3_6_21_77_1289,member,QNIHPER9807979,@40.73/-73.995,'New York University'@en


In [51]:
nih_vec = []
x = []
y = []
for i in range(0, len(nih)):
    x.append(nih['node2;coor'][i][1:].split('/')[0])
    y.append(nih['node2;coor'][i][1:].split('/')[1])

In [52]:
results = []
for i in range(0, len(nih)):
    results.append(tree.search_nn((float(x[i]), float(y[i]))))

In [53]:
results[1][0].data

(37.5624, 126.941)

In [54]:
total = []
for i in range(0, len(x_c)):
    total.append((x_c[i], y_c[i]))

In [55]:
tree.data[0]

38.248967

In [56]:
total.index(results[0][0].data)

7451

In [57]:
index = []
for i in range(0, len(results)):
    index.append(total.index(results[i][0].data))

In [58]:
nih['nearest_node'] = ''
nih['nearest_label'] = ''
nih['nearest_coordinates'] = ''

In [59]:
for i in range(0, len(nih)):
    nih['nearest_node'][i] = df['node1'][index[i]]
    nih['nearest_label'][i] = df['node2.1'][index[i]]
    nih['nearest_coordinates'][i] = df['node2'][index[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [60]:
nih['distance'] = -1.1

In [61]:
for i in range(0, len(nih)):
    nih['distance'][i] = results[i][1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [62]:
nih.head()

Unnamed: 0,id,node1,label,node2,node2;coor,node2;label,nearest_node,nearest_label,nearest_coordinates,distance
0,,cluster_0_0_2_2_14_112_322,member,QNIHPER10003954,@41.504/-81.608,'Case Western Reserve University'@en,Q39050124,'Case Comprehensive Cancer Center'@en,@41.504/-81.608,0.0
1,,cluster_0_0_2_2_14_112_322,member,QNIHPER10428590,@37.5625/126.945,'Ewha Womans University'@en,Q625321,'Severance Hospital'@en,@37.5624/126.941,1.6e-05
2,,cluster_0_0_3_6_21_40_1800,member,QNIHPER10004092,@44.564588/-123.275705,'Oregon State University'@en,Q5582923,'Good Samaritan Regional Medical Center'@en,@44.6022/-123.252,0.001977
3,,cluster_0_0_3_6_21_40_1238,member,QNIHPER12571233,@42.447222222222/-76.483055555556,'Cornell University'@en,Q28035413,'Cayuga Medical Center'@en,@42.469273/-76.53727,0.003425
4,,cluster_0_0_3_6_21_40_1238,member,QNIHPER2098890,@46.343224/-119.276333,'Pacific Northwest National Laboratory'@en,Q50036737,'Kadlec Clinic'@en,@46.28146/-119.281346,0.00384


In [63]:
output = pd.DataFrame(list(zip(nih['id'], nih.node2, nih.label, nih.nearest_node)) ,columns = ['id', 'node1', 'label', 'node2'])

In [64]:
output.to_csv('author_hospital.tsv', index = False, sep = '\t')

In [66]:
!$kgtk add-id --id-style wikidata \
-i "author_hospital.tsv" -o author_hospital.id.tsv

In [65]:
output

Unnamed: 0,id,node1,label,node2
0,,QNIHPER10003954,member,Q39050124
1,,QNIHPER10428590,member,Q625321
2,,QNIHPER10004092,member,Q5582923
3,,QNIHPER12571233,member,Q28035413
4,,QNIHPER2098890,member,Q50036737
...,...,...,...,...
2377,,QNIHPER9634058,member,Q30288421
2378,,QNIHPER9728962,member,Q5547057
2379,,QNIHPER9883306,member,Q5547057
2380,,QNIHPER9807979,member,Q22061077


In [31]:
group = nih.groupby('node1')

In [32]:
def get_cluster_dist(cluster):
    
    group = nih.groupby('node1')
    df = group.get_group(cluster)
    l = df.nearest_label.unique()
    org = df.groupby('nearest_label')

    c1 = []
    c2 = []
    for i in range(0, len(l)):
        c1.append(l[i][1:len(l[i])-4])
        c2.append(org.get_group(l[i]).count()[1])
        
    df = pd.DataFrame(list(zip(c1,c2)), columns = ['org', 'count'])
    return df.sort_values(by='count', ascending=False).reset_index(drop = True)

In [33]:
get_cluster_dist('cluster_0_0_2_2_14_112_322')

Unnamed: 0,org,count
0,Washington University Medical Center,7
1,Brigham and Women\'s Hospital,5
2,VA Center for Clinical Management Research,5
3,Center for Emergency Medicine of Western Penns...,4
4,NYU Langone Medical Center,4
...,...,...
57,The Peak Mark Clinic,1
58,St. John\'s Episcopal Hospital,1
59,Bloomingdale Insane Asylum,1
60,Morgan Stanley Children\'s Hospital,1


In [34]:
import math
def get_cluster_mean(cluster):
    df_temp = pd.read_csv('name1.tsv', sep = '\t')
    group = nih.groupby('node1')
    df = group.get_group(cluster).reset_index(drop = True)

    df_vec = []
    x = []
    y = []
    for i in range(0, len(df)):
        x.append(float(df['node2;coor'][i][1:].split('/')[0]))
        y.append(float(df['node2;coor'][i][1:].split('/')[1]))
        
    x1 = float(sum(x))/float(len(x))
    y1 = float(sum(y))/float(len(y))
    #print(x1 ,y1)
    res = tree.search_nn([x1, y1])
    #print(df_temp['node2.1'][total.index(results[i][0].data)][1: len(df_temp['node2.1'][total.index(results[i][0].data)])-4])
    #print(res[0])
    
    tot = 0.0
    
    for i in range(0, len(x)):
        tot = tot + 111 * math.sqrt( (res[0].data[0]-x[i])**2 + (res[0].data[1]-y[i])**2 )
    #print(tot/len(x))
    
    return (df_temp['node2.1'][total.index(results[i][0].data)][1: len(df_temp['node2.1'][total.index(results[i][0].data)])-4], tot/len(x))


In [35]:
get_cluster_mean('cluster_0_0_2_2_14_112_322')

('Penn Medicine Princeton Medical Center', 1901.3523863405317)

In [36]:
name = []
dist = []
cl = []
count = []
cluster = nih.node1.unique()
label = []

for i in range(0, len(cluster)):
    group = nih.groupby('node1')
    cl.append(cluster[i])
    count.append(len(group.get_group(cluster[i])))
    temp = get_cluster_mean(cluster[i])
    name.append(temp[0])
    dist.append(temp[1])
    label.append('Near')

In [37]:
output = pd.DataFrame(list(zip(cl,name, label, dist,count,)), columns = ['node1', 'label', 'node2', 'node2;dist', 'node1;count'])

In [38]:
output.sort_values(by='node2;dist', ascending=True).reset_index(drop = True)

Unnamed: 0,node1,label,node2,node2;dist,node1;count
0,cluster_0_0_3_5_15_45_215,Case Comprehensive Cancer Center,Near,0.357465,1
1,cluster_0_0_3_5_30_39_104,Cayuga Medical Center,Near,1.564715,4
2,cluster_0_0_2_2_14_146_1984,Cayuga Medical Center,Near,19.162157,4
3,cluster_0_0_3_5_15_45_1068,Severance Hospital,Near,46.704682,2
4,cluster_0_0_2_3_23_60_1880,Severance Hospital,Near,170.079363,2
...,...,...,...,...,...
144,cluster_0_0_3_6_21_40_623,Spaulding Hospital,Near,4858.154756,9
145,cluster_0_0_2_2_9_93_2350,James Whitcomb Riley Hospital for Children,Near,5135.594781,18
146,cluster_0_0_3_6_11_61_1689,Kadlec Clinic,Near,5234.390724,5
147,cluster_0_0_3_5_30_82_1616,Baylor College of Medicine,Near,6702.576310,10


In [39]:
output.sort_values(by='node2;dist', ascending=True).reset_index(drop = True).to_csv('hospital_mean_distance.tsv', index = False, sep = '\t')

In [40]:
pwd

'/Users/grantxie/Downloads/NIH'

Unnamed: 0,node1,label,node2,node2;dist,node1;count
0,cluster_0_0_2_2_14_112_322,Penn Medicine Princeton Medical Center,Near,1901.352386,111
1,cluster_0_0_3_6_21_40_1800,Norman Regional Hospital,Near,3734.246675,6
2,cluster_0_0_3_6_21_40_1238,Baylor College of Medicine,Near,1504.803572,10
3,cluster_0_0_2_3_22_129_391,University of New Mexico Hospital,Near,891.357219,16
4,cluster_0_0_3_5_30_15_1005,NYU Langone Medical Center,Near,1670.367667,13
...,...,...,...,...,...
144,cluster_0_0_2_3_23_85_782,Cayuga Medical Center,Near,1034.161456,4
145,cluster_0_0_2_3_24_103_2290,Good Samaritan Regional Medical Center,Near,1764.546490,3
146,cluster_0_0_2_3_24_103_1216,Cayuga Medical Center,Near,4119.954933,4
147,cluster_0_0_3_5_30_39_620,Stony Brook University Hospital,Near,3813.832220,7


In [42]:
nih

Unnamed: 0,id,node1,label,node2,node2;coor,node2;label,nearest_label,nearest_coordinates,distance
0,,cluster_0_0_2_2_14_112_322,member,QNIHPER10003954,@41.504/-81.608,'Case Western Reserve University'@en,'Case Comprehensive Cancer Center'@en,@41.504/-81.608,0.000000
1,,cluster_0_0_2_2_14_112_322,member,QNIHPER10428590,@37.5625/126.945,'Ewha Womans University'@en,'Severance Hospital'@en,@37.5624/126.941,0.000016
2,,cluster_0_0_3_6_21_40_1800,member,QNIHPER10004092,@44.564588/-123.275705,'Oregon State University'@en,'Good Samaritan Regional Medical Center'@en,@44.6022/-123.252,0.001977
3,,cluster_0_0_3_6_21_40_1238,member,QNIHPER12571233,@42.447222222222/-76.483055555556,'Cornell University'@en,'Cayuga Medical Center'@en,@42.469273/-76.53727,0.003425
4,,cluster_0_0_3_6_21_40_1238,member,QNIHPER2098890,@46.343224/-119.276333,'Pacific Northwest National Laboratory'@en,'Kadlec Clinic'@en,@46.28146/-119.281346,0.003840
...,...,...,...,...,...,...,...,...,...
2377,,cluster_0_0_2_2_14_8_259,member,QNIHPER9634058,@35.908611111111/-79.049166666667,'University of North Carolina at Chapel Hill'@en,'University of North Carolina Hospitals'@en,@35.90515/-79.050011,0.000013
2378,,cluster_0_0_3_6_7_2_843,member,QNIHPER9728962,@38.907222222222/-77.072777777778,'Georgetown University'@en,'Georgetown University Medical Center'@en,@38.912/-77.077,0.000041
2379,,cluster_0_0_3_6_7_2_2378,member,QNIHPER9883306,@38.912/-77.077,'Georgetown University Medical Center'@en,'Georgetown University Medical Center'@en,@38.912/-77.077,0.000000
2380,,cluster_0_0_3_6_21_77_1289,member,QNIHPER9807979,@40.73/-73.995,'New York University'@en,'St. John\'s Episcopal Hospital'@en,@40.725656/-73.991794,0.000029


In [43]:
nih['lat'] = -1.1
nih['long'] = -1.1

In [44]:
for i in range(0, len(nih)):
    nih.lat[i] = float(nih['node2;coor'][i][1:].split('/')[0])
    nih.long[i] = float(nih['node2;coor'][i][1:].split('/')[1])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [46]:
nih.to_excel('nih.xlsx', index = False)

## Output Tableau files to show main topics (subjects) of papers for each organization

In [50]:
!$kypher \
-i "/Users/grantxie/Downloads/nih_projects/nih_project.id.tsv" --as project_subject\
--limit 10

[2021-12-14 02:08:37 sqlstore]: IMPORT graph directly into table graph_64 from /Users/grantxie/Downloads/nih_projects/nih_project.id.tsv ...
[2021-12-14 02:08:42 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_64 AS graph_64_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2
QNIHPRO10286324-PNIHtitle-cfa7f1	QNIHPRO10286324	PNIHtitle	"CD38-TARGETED IMMUNOPET OF MYELOMA: PHASE 2 TRIAL OF CLINICAL APPLICATIONS"
QNIHPRO10286324-PNIHorg-17451a	QNIHPRO10286324	PNIHorg	"HOAG MEMORIAL HOSPITAL PRESBYTERIAN"
QNIHPRO10286324-PNIHcost-452920	QNIHPRO10286324	PNIHcost	598926
QNIHPRO10286324-PNIHsub-730793	QNIHPRO10286324	PNIHsub	"Antibodies; base; Biopsy; blind; Blood Tests; Bone marrow biopsy; burden of illness; cancer imaging; Cells; Clinical; clinical application; clinical care; Clinical Trials; clinically significant; Combined Modality Therapy; design; Detection; Detection of Minimal Residual Disease

In [53]:
!$kypher -i author -i nih_author -i inv -i item -i coor -i label -i project -i hosp -i time -i quantities\
--match 'author: (cluster)-[]->(qnode), nih_author:(qnihauthor)-[]->(qnode), inv:(project)-[]->(qnihauthor), \
        item:(qnode)-[:P108]->(org_node), label:(org_node)-[]->(org_name),label:(qnode)-[]->(author_name), \
        coor:(org_node)-[:P625]->(organization_coor), hosp:(qnihauthor)-[]->(hosp_node),\
        label:(hosp_node)-[]->(hosp_name), coor:(hosp_node)-[:P625]->(hosp_coor),item:(paper)-[:P50]->(qnode),item:(paper)-[:P921]->(mai)'\
--opt 'time: (hosp_node)-[:P571]->(hosp_inc)' \
--opt 'time: (org_node)-[:P571]->(org_inc)' \
--opt 'quantities: (hosp_node)-[:P6801]->(count_beds)' \
--opt 'project: (project)-[:PNIHcost]->(cost)' \
--return 'cluster as cluster_id, org_name as org_name, sum(cost) as organization_award, \
        kgtk_geo_coords_lat(organization_coor) as org_lat, kgtk_geo_coords_long(organization_coor) as org_long,\
        kgtk_date_year(org_inc) as org_inc, \
        hosp_name as hosp_name, kgtk_geo_coords_lat(hosp_coor) as hosp_lat, kgtk_geo_coords_long(hosp_coor) as hosp_long,\
        kgtk_date_year(hosp_inc) as hosp_inc, count_beds as count_beds' \
-o bed_inception_1.tsv

[2021-12-14 02:10:29 query]: SQL Translation:
---------------------------------------------
  SELECT graph_19_c1."node1" "_aLias.cluster_id", graph_18_c5."node2" "_aLias.org_name", sum(graph_31_c16."node2") "_aLias.organization_award", kgtk_geo_coords_lat(graph_22_c7."node2") "_aLias.org_lat", kgtk_geo_coords_long(graph_22_c7."node2") "_aLias.org_long", kgtk_date_year(graph_35_c14."node2") "_aLias.org_inc", graph_18_c9."node2" "_aLias.hosp_name", kgtk_geo_coords_lat(graph_22_c10."node2") "_aLias.hosp_lat", kgtk_geo_coords_long(graph_22_c10."node2") "_aLias.hosp_long", kgtk_date_year(graph_35_c13."node2") "_aLias.hosp_inc", graph_38_c15."node2" "_aLias.count_beds"
     FROM graph_18 AS graph_18_c5
     INNER JOIN graph_18 AS graph_18_c6, graph_18 AS graph_18_c9, graph_19 AS graph_19_c1, graph_21 AS graph_21_c11, graph_21 AS graph_21_c12, graph_21 AS graph_21_c4, graph_22 AS graph_22_c10, graph_22 AS graph_22_c7, graph_30 AS graph_30_c2, graph_32 AS graph_32_c3, graph_37 AS graph_37_c8
 

In [52]:
!$kypher -i author_test_new -i nih_author -i inv -i item -i coor -i label -i project -i hosp -i time -i quantities -i project_subject\
--match 'author: (cluster)-[]->(qnode), nih_author:(qnihauthor)-[]->(qnode), inv:(project)-[]->(qnihauthor), \
        item:(qnode)-[:P108]->(org_node), label:(org_node)-[]->(org_name),label:(qnode)-[]->(author_name), \
        coor:(org_node)-[:P625]->(organization_coor), project_subject:(project)-[]->(subject), hosp:(qnihauthor)-[]->(hosp_node),\
        label:(hosp_node)-[]->(hosp_name), coor:(hosp_node)-[:P625]->(hosp_coor)'\
--opt 'project_subject: (project)-[:PNIHcost]->(cost)' \
--opt 'project_subject: (project)-[:PNIHtitle]->(project_title)' \
--opt 'project_subject: (project)-[:PNIHsub]->(project_subjects)' \
--return 'subject as subject, \
            count(subject) as subject_count, sum(cost) as cost' \
--order-by "subject_count desc"\
-o subjects_total.tsv

[2021-12-14 02:09:56 query]: SQL Translation:
---------------------------------------------
  SELECT graph_64_c8."node2" "_aLias.subject", count("_aLias.subject") "_aLias.subject_count", sum(graph_64_c12."node2") "_aLias.cost"
     FROM graph_18 AS graph_18_c10
     INNER JOIN graph_18 AS graph_18_c5, graph_18 AS graph_18_c6, graph_21 AS graph_21_c4, graph_22 AS graph_22_c11, graph_22 AS graph_22_c7, graph_30 AS graph_30_c2, graph_32 AS graph_32_c3, graph_37 AS graph_37_c9, graph_63 AS graph_63_c1, graph_64 AS graph_64_c8
     ON graph_21_c4."node2" = graph_18_c5."node1"
        AND graph_21_c4."node2" = graph_22_c7."node1"
        AND graph_30_c2."node1" = graph_32_c3."node2"
        AND graph_30_c2."node1" = graph_37_c9."node1"
        AND graph_32_c3."node1" = graph_64_c8."node1"
        AND graph_37_c9."node2" = graph_18_c10."node1"
        AND graph_37_c9."node2" = graph_22_c11."node1"
        AND graph_63_c1."node2" = graph_18_c6."node1"
        AND graph_63_c1."node2" = graph_21

In [14]:
df = pd.read_csv('file.tsv', sep = '\t')

In [15]:
df['count_beds'].fillna(value=df['count_beds'].mean(), inplace=True)

In [17]:
df.to_excel('nih_0824.xlsx', index = False)

In [18]:
cluster = []
org_name = []
org_node = []
org_lat = []
org_long = []
hosp_name = []
hosp_node = []
hosp_lat =[]
hosp_long= []

subject1 = []
subject2 = []
subject3 = []
subject4 = []
subject5 = []

subject1_count = []
subject2_count = []
subject3_count = []
subject4_count = []
subject5_count = []

subject1_cost  = []
subject2_cost  = []
subject3_cost  = []
subject4_cost  = []
subject5_cost  = []

org_inc = []
hosp_inc = []

hosp_bed = []

last_cluster = ''
last_org = ''

for i in range(0, len(df)):
    
    if df.cluster_id[i] == last_cluster and df['org_name'][i] == last_org:
        continue
    else:
        last_cluster = df.cluster_id[i]
        last_org = df['org_name'][i]
        last_cluster = df.cluster_id[i]
        last_org = df['org_name'][i]
        cluster.append(df['cluster_id'][i])
        org_name.append(df['org_name'][i])
        #org_node.append(df['org_node'][i])
        org_lat.append(df['org_lat'][i])
        org_long.append(df['org_long'][i])
        hosp_name.append(df['hosp_name'][i])
        #hosp_node.append(df['hosp_node'][i])
        hosp_lat.append(df['hosp_lat'][i])
        hosp_long.append(df['hosp_long'][i])
        
        
        subject1.append(df.ms_name[i])
        subject2.append(df.ms_name[i+1])
        subject3.append(df.ms_name[i+2])
        subject4.append(df.ms_name[i+3])
        subject5.append(df.ms_name[i+4])
        
        subject1_count.append(df.subject_count[i])
        subject2_count.append(df.subject_count[i+1])
        subject3_count.append(df.subject_count[i+2])
        subject4_count.append(df.subject_count[i+3])
        subject5_count.append(df.subject_count[i+4])
        
        
        #subject1_cost.append(df.cost[i])
        #subject2_cost.append(df.cost[i+1])
        #subject3_cost.append(df.cost[i+2])
        #subject4_cost.append(df.cost[i+3])
        #subject5_cost.append(df.cost[i+4])
        
        org_inc.append(df.org_inc[i])
        hosp_inc.append(df.hosp_inc[i])
        hosp_bed.append(df.count_beds[i])
    

In [326]:
output = pd.DataFrame(list(zip(cluster, org_name,  org_lat, org_long, org_inc, hosp_name,  hosp_lat, hosp_long, hosp_inc, hosp_bed, subject1, \
subject2, subject3, subject4, subject5, subject1_count,subject2_count,subject3_count,subject4_count,subject5_count)), columns = ['cluster', 'org_name', 'org_lat', 'org_long', 'org_inc', 'hosp_name',  'hosp_lat', 'hosp_long', 'hosp_inc', 'hosp_bed', 'subject1', \
'subject2', 'subject3', 'subject4', 'subject5', 'count1', 'count2','count3','count4','count5'])

In [327]:

output['hosp_bed'].fillna(value=output['hosp_bed'].mean(), inplace=True)

In [328]:
output

Unnamed: 0,cluster,org_name,org_lat,org_long,org_inc,hosp_name,hosp_lat,hosp_long,hosp_inc,hosp_bed,subject1,subject2,subject3,subject4,subject5,count1,count2,count3,count4,count5
0,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,632.616279,'phosphorylation'@en,'CRISPR'@en,'RNA sequencing'@en,'database'@en,'Toll-like receptor'@en,3570,3330,2028,2028,1620
1,cluster_0_0_0_3_12_19_26_39_990,'Yale University'@en,41.311111,-72.926667,1701.0,'UCSF Medical Center'@en,37.763200,-122.458000,1907.0,796.000000,'CRISPR'@en,'RNA sequencing'@en,'database'@en,'Toll-like receptor'@en,'statistics'@en,3330,2028,2028,1620,1404
2,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,632.616279,'RNA sequencing'@en,'database'@en,'Toll-like receptor'@en,'statistics'@en,'Caenorhabditis elegans'@en,2028,2028,1620,1404,1248
3,cluster_0_0_0_3_12_20_131_236_1292,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,632.616279,'Toll-like receptor'@en,'statistics'@en,'Caenorhabditis elegans'@en,'transcriptome'@en,'macromolecule'@en,1620,1404,1248,1248,1092
4,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,632.616279,'statistics'@en,'Caenorhabditis elegans'@en,'transcriptome'@en,'macromolecule'@en,'ribozyme'@en,1404,1248,1248,1092,945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,cluster_0_0_0_3_12_20_27_233_494,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.000000,'Human Induced Pluripotent Stem Cells'@en,'autism'@en,'transcriptome'@en,'CRISPR'@en,'cell biology'@en,66,66,66,44,44
1016,cluster_0_1_1_12_29_57_94_232_1261,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.000000,'BH3 interacting domain death agonist'@en,'Bcl-xL'@en,'Chemical modulation of chaperone-mediated aut...,'Fanconi syndrome'@en,'autoinhibition'@en,42,42,42,42,42
1017,cluster_0_1_1_12_27_75_124_219_527,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.000000,'ovarian cancer'@en,'ATRX chromatin remodeler'@en,"'ATRX, chromatin remodeler'@en",'Caenorhabditis elegans'@en,'DNA methylation'@en,30,22,22,22,22
1018,cluster_0_0_0_3_12_20_27_233_494,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.000000,'ATRX chromatin remodeler'@en,"'ATRX, chromatin remodeler'@en",'Caenorhabditis elegans'@en,'DNA methylation'@en,'DNA-binding E3 ubiquitin-protein ligase SNT2 ...,22,22,22,22,22


In [329]:
output.to_excel('top5_subject_0820.xlsx', index = False)

In [None]:
!$kypher -i item -i nih_author -i item -i label\
--match 'item:(paper)-[:P921]->(ms),\
        item:(paper)-[:P50]->(qnode),item:(paper)-[:P921]->(ms_node),\
        label:(ms_node)-[]->(ms_name)'\
--return 'paper,ms, ms_name'\
--limit 100

In [271]:
!$kypher -i item -i nih_author -i item -i label -i author\
--match 'author: ()-[]->(qnode),item:(paper)-[:P50]->(qnode)'\
--return 'distinct "" as id, qnode as node1, "P50" as label, paper as node2'\
-o paper.tsv

[2021-08-20 10:29:24 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT ? "_aLias.id", graph_30_c1."node2" "_aLias.node1", ? "_aLias.label", graph_21_c2."node1" "_aLias.node2"
     FROM graph_21 AS graph_21_c2
     INNER JOIN graph_30 AS graph_30_c1
     ON graph_30_c1."node2" = graph_21_c2."node2"
        AND graph_21_c2."label" = ?
  PARAS: ['', 'P50', 'P50']
---------------------------------------------


In [272]:
!$kgtk add-id --id-style wikidata -i "/Users/grantxie/Downloads/NIH/paper.tsv" -o paper.id.tsv

In [273]:
!$kypher \
-i "paper.id.tsv" --as paper_test_new\
--limit 10

[2021-08-20 10:31:34 sqlstore]: IMPORT graph directly into table graph_44 from /Users/grantxie/Downloads/NIH/paper.id.tsv ...
[2021-08-20 10:31:35 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_44 AS graph_44_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2
Q100287043-P50-Q27316717	Q100287043	P50	Q27316717
Q100287043-P50-Q28728777	Q100287043	P50	Q28728777
Q100287043-P50-Q33579346	Q100287043	P50	Q33579346
Q100287043-P50-Q34769869	Q100287043	P50	Q34769869
Q100287043-P50-Q36126966	Q100287043	P50	Q36126966
Q100287043-P50-Q36950067	Q100287043	P50	Q36950067
Q100287043-P50-Q38741540	Q100287043	P50	Q38741540
Q100287043-P50-Q38751767	Q100287043	P50	Q38751767
Q100287043-P50-Q44000882	Q100287043	P50	Q44000882
Q100287043-P50-Q54264722	Q100287043	P50	Q54264722


In [281]:

!$kypher -i item -i nih_author -i item -i label -i test -i paper_test\
--match 'paper_test:()-[]->(:Q27316717)'\


[2021-08-20 10:35:22 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_43 AS graph_43_c1
     WHERE graph_43_c1."node2" = ?
  PARAS: ['Q27316717']
---------------------------------------------
id	node1	label	node2


In [302]:
!$kypher -i item -i nih_author -i item -i label -i test -i paper_test_new -i author_test_new\
--match 'author_test_new: ()-[]->(qnode), paper_test_new:(qnode)-[:P50]->(paper),item:(paper)-[:P921]->(ms_node),\
        label:(ms_node)-[]->(ms_name), label:(qnode)-[]->(name)'\
--return 'distinct name, ms_name, count(ms_node)'\
--order 'count(ms_node) desc'\
-o test4.tsv

[2021-08-20 10:50:28 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_18_c5."node2", graph_18_c4."node2", count(graph_21_c3."node2")
     FROM graph_18 AS graph_18_c4
     INNER JOIN graph_18 AS graph_18_c5, graph_21 AS graph_21_c3, graph_44 AS graph_44_c2, graph_45 AS graph_45_c1
     ON graph_21_c3."node2" = graph_18_c4."node1"
        AND graph_44_c2."node2" = graph_21_c3."node1"
        AND graph_45_c1."node2" = graph_18_c5."node1"
        AND graph_45_c1."node2" = graph_44_c2."node1"
        AND graph_21_c3."label" = ?
        AND graph_44_c2."label" = ?
     GROUP BY graph_18_c5."node2", graph_18_c4."node2"
     ORDER BY count(graph_21_c3."node2") DESC
  PARAS: ['P921', 'P50']
---------------------------------------------
[2021-08-20 10:50:28 sqlstore]: CREATE INDEX on table graph_45 column node2 ...
[2021-08-20 10:50:28 sqlstore]: ANALYZE INDEX on table graph_45 column node2 ...


In [298]:
!$kypher -i item -i nih_author -i item -i label -i paper_test_new\
--match 'author: (a)-[]->(b)'\
--return 'distinct "" as id, a as node1, "same" as label, b as node2'\
-o author_kgtk_distinct.tsv

[2021-08-20 10:47:18 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT ? "_aLias.id", graph_30_c1."node1" "_aLias.node1", ? "_aLias.label", graph_30_c1."node2" "_aLias.node2"
     FROM graph_30 AS graph_30_c1
  PARAS: ['', 'same']
---------------------------------------------


In [300]:
!$kgtk add-id --id-style wikidata -i "/Users/grantxie/Downloads/NIH/author_kgtk_distinct.tsv" -o author_kgtk_distinct.id.tsv

In [301]:
!$kypher \
-i "author_kgtk_distinct.id.tsv" --as author_test_new\
--limit 10

[2021-08-20 10:48:48 sqlstore]: DROP graph data table graph_45 from author_test_new
[2021-08-20 10:48:48 sqlstore]: IMPORT graph directly into table graph_45 from /Users/grantxie/Downloads/NIH/author_kgtk_distinct.id.tsv ...
[2021-08-20 10:48:48 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_45 AS graph_45_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2
QNIHPER1940486-same-Q6831539	QNIHPER1940486	same	Q6831539
QNIHPER2414340-same-Q87796696	QNIHPER2414340	same	Q87796696
QNIHPER11537130-same-Q89958961	QNIHPER11537130	same	Q89958961
QNIHPER8820221-same-Q61467477	QNIHPER8820221	same	Q61467477
QNIHPER8491201-same-Q46001993	QNIHPER8491201	same	Q46001993
QNIHPER7882742-same-Q89866816	QNIHPER7882742	same	Q89866816
QNIHPER8146532-same-Q89937644	QNIHPER8146532	same	Q89937644
QNIHPER10499986-same-Q56809498	QNIHPER10499986	same	Q56809498
QNIHPER2315018-same-Q87887672	QNIHPER2315018	same	Q87887672
QN

In [220]:
!$kypher \
-i "/Users/grantxie/test_kgtk.tsv" --as test\
--limit 10

[2021-08-20 05:55:27 sqlstore]: IMPORT graph via csv.reader into table graph_41 from /Users/grantxie/test_kgtk.tsv ...
[2021-08-20 05:55:27 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_41 AS graph_41_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2	row	node2;label	graph
author0	QNIHPER1940486	label	Q6831539	2	MICHAEL J GALE	author
author1	QNIHPER2414340	label	Q87796696	5	PATRICK S. STAYTON	author
author2	QNIHPER11537130	label	Q89958961	8	KENNETH NGURE	author
author3	QNIHPER8820221	label	Q61467477	10	ANITHA PASUPATHY	author


In [221]:
!$kypher -i item -i nih_author -i item -i label -i test\
--match 'test: ()-[]->(qnode),item:(paper)-[:P50]->(qnode),item:(paper)-[:P921]->(ms_node),\
        label:(ms_node)-[]->(ms_name), label:(qnode)-[]->(name)'\
--return 'distinct name, ms_name, count(ms_node)'\
--order 'count(ms_node) desc'\
-o test1.tsv

[2021-08-20 05:55:52 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_18_c5."node2", graph_18_c4."node2", count(graph_21_c3."node2")
     FROM graph_18 AS graph_18_c4
     INNER JOIN graph_18 AS graph_18_c5, graph_21 AS graph_21_c2, graph_21 AS graph_21_c3, graph_41 AS graph_41_c1
     ON graph_21_c2."node1" = graph_21_c3."node1"
        AND graph_21_c3."node2" = graph_18_c4."node1"
        AND graph_41_c1."node2" = graph_18_c5."node1"
        AND graph_41_c1."node2" = graph_21_c2."node2"
        AND graph_21_c2."label" = ?
        AND graph_21_c3."label" = ?
     GROUP BY graph_18_c5."node2", graph_18_c4."node2"
     ORDER BY count(graph_21_c3."node2") DESC
  PARAS: ['P50', 'P921']
---------------------------------------------
[2021-08-20 05:55:52 sqlstore]: CREATE INDEX on table graph_41 column node2 ...
[2021-08-20 05:55:52 sqlstore]: ANALYZE INDEX on table graph_41 column node2 ...


## file for tableau display

In [44]:
!$kypher -i author -i nih_author -i inv -i item -i coor -i label -i project -i hosp -i time -i quantities -i author_test_new -i paper_test_new -i nih0825_new\
--match 'author: (cluster)-[]->(qnode), nih_author:(qnihauthor)-[]->(qnode), inv:(project)-[]->(qnihauthor), \
        item:(qnode)-[:P108]->(org_node), label:(org_node)-[]->(org_name),label:(qnode)-[]->(author_name), \
        coor:(org_node)-[:P625]->(organization_coor), hosp:(qnihauthor)-[]->(hosp_node),\
        label:(hosp_node)-[]->(hosp_name), coor:(hosp_node)-[:P625]->(hosp_coor),author_test_new: ()-[]->(qnode), paper_test_new:(qnode)-[:P50]->(paper),item:(paper)-[:P921]->(ms_node),\
        label:(ms_node)-[]->(ms_name), label:(qnode)-[]->(name), nih0825_new:(project)-[:PNIHstart]->(start1)'\
--opt 'time: (hosp_node)-[:P571]->(hosp_inc)' \
--opt 'time: (org_node)-[:P571]->(org_inc)' \
--opt 'quantities: (hosp_node)-[:P6801]->(count_beds)' \
--opt 'nih0825_new: (project)-[:PNIHcost]->(cost)' \
--return 'cluster as cluster_id, org_name as org_name, \
        kgtk_geo_coords_lat(organization_coor) as org_lat, kgtk_geo_coords_long(organization_coor) as org_long,\
        kgtk_date_year(org_inc) as org_inc, \
        hosp_name as hosp_name, kgtk_geo_coords_lat(hosp_coor) as hosp_lat, kgtk_geo_coords_long(hosp_coor) as hosp_long,\
        kgtk_date_year(hosp_inc) as hosp_inc, count_beds as count_beds, ms_name as ms_name, count(ms_node) as subject_count, sum(cost) as cost, start1 as start' \
--order 'org_name desc, subject_count desc'\
-o final.tsv

[2021-08-26 03:00:17 query]: SQL Translation:
---------------------------------------------
  SELECT graph_19_c1."node1" "_aLias.cluster_id", graph_18_c5."node2" "_aLias.org_name", kgtk_geo_coords_lat(graph_22_c7."node2") "_aLias.org_lat", kgtk_geo_coords_long(graph_22_c7."node2") "_aLias.org_long", kgtk_date_year(graph_35_c18."node2") "_aLias.org_inc", graph_18_c9."node2" "_aLias.hosp_name", kgtk_geo_coords_lat(graph_22_c10."node2") "_aLias.hosp_lat", kgtk_geo_coords_long(graph_22_c10."node2") "_aLias.hosp_long", kgtk_date_year(graph_35_c17."node2") "_aLias.hosp_inc", graph_38_c19."node2" "_aLias.count_beds", graph_18_c14."node2" "_aLias.ms_name", count(graph_21_c13."node2") "_aLias.subject_count", sum(graph_49_c20."node2") "_aLias.cost", graph_49_c16."node2" "_aLias.start"
     FROM graph_18 AS graph_18_c14
     INNER JOIN graph_18 AS graph_18_c15, graph_18 AS graph_18_c5, graph_18 AS graph_18_c6, graph_18 AS graph_18_c9, graph_19 AS graph_19_c1, graph_21 AS graph_21_c13, graph_21 AS

In [43]:
!$kypher \
-i "/Users/grantxie/Downloads/kgtk-files-nih-V5.0/nih_project_0825.id.tsv" --as nih0825_new\
--limit 10

[2021-08-26 02:59:59 sqlstore]: IMPORT graph directly into table graph_49 from /Users/grantxie/Downloads/kgtk-files-nih-V5.0/nih_project_0825.id.tsv ...
[2021-08-26 03:00:11 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_49 AS graph_49_c1
     LIMIT ?
  PARAS: [10]
---------------------------------------------
id	node1	label	node2
QNIHPRO10286324-PNIHtitle-cfa7f1	QNIHPRO10286324	PNIHtitle	"CD38-TARGETED IMMUNOPET OF MYELOMA: PHASE 2 TRIAL OF CLINICAL APPLICATIONS"
QNIHPRO10286324-PNIHstart-f85450	QNIHPRO10286324	PNIHstart	"02/01/2021"
QNIHPRO10286324-PNIHorg-QNIHORG205901	QNIHPRO10286324	PNIHorg	QNIHORG205901
QNIHPRO10286324-PNIHcost-452920	QNIHPRO10286324	PNIHcost	598926
QNIHPRO10286324-PNIHsub-730793	QNIHPRO10286324	PNIHsub	"Antibodies; base; Biopsy; blind; Blood Tests; Bone marrow biopsy; burden of illness; cancer imaging; Cells; Clinical; clinical application; clinical care; Clinical Trials; clinically significant; Combined Modalit

In [62]:
test = pd.read_csv('final.tsv', sep ='\t')

In [63]:
test

Unnamed: 0,cluster_id,org_name,org_lat,org_long,org_inc,hosp_name,hosp_lat,hosp_long,hosp_inc,count_beds,ms_name,subject_count,cost,start
0,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'phosphorylation'@en,5100,3583893760,06/01/2021
1,cluster_0_0_0_3_12_19_26_39_990,'Yale University'@en,41.311111,-72.926667,1701.0,'UCSF Medical Center'@en,37.763200,-122.458000,1907.0,796.0,'CRISPR'@en,4662,3295411734,06/01/2021
2,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'RNA sequencing'@en,3198,3663737922,04/01/2021
3,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'database'@en,3198,3663737922,04/01/2021
4,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'statistics'@en,2214,2536433946,04/01/2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15238,cluster_0_1_1_12_27_75_124_219_527,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.0,'multiple drug resistance'@en,16,5784638,07/01/2017
15239,cluster_0_1_1_12_27_75_124_219_527,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.0,'phosphorylation'@en,16,5784638,07/01/2017
15240,cluster_0_1_1_12_27_75_124_219_527,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.0,'substance dependence'@en,16,5784638,07/01/2017
15241,cluster_0_1_1_12_27_75_124_219_527,'Albert Einstein College of Medicine'@en,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",40.847986,-73.843853,1899.0,225.0,'team science'@en,16,5784638,07/01/2017


In [46]:
for i in range(0, len(df)):
    df.start[i] = df.start[i][len(df.start[i])-4:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [47]:
df['count_beds'].fillna(value=df['count_beds'].mean(), inplace=True)

In [49]:
df.to_excel('nih_0826_new2.xlsx', index = False)

In [12]:
for i in range(0, len(df)):
    df.start[i] = df.start[i][len(df.start[i])-4:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
df.head()

Unnamed: 0,cluster_id,org_name,org_lat,org_long,org_inc,hosp_name,hosp_lat,hosp_long,hosp_inc,count_beds,ms_name,subject_count,cost,start
0,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'phosphorylation'@en,850,554722240.0,2021
1,cluster_0_0_0_3_12_20_131_236_1292,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'Toll-like receptor'@en,360,121399080.0,2021
2,cluster_0_0_0_3_12_19_26_39_990,'Yale University'@en,41.311111,-72.926667,1701.0,'UCSF Medical Center'@en,37.7632,-122.458,1907.0,796.0,'CRISPR'@en,222,460578516.0,2021
3,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'corpus striatum'@en,180,117470592.0,2021
4,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,41.304278,-72.935676,2009.0,,'dopamine'@en,160,104418304.0,2021


In [14]:

df['count_beds'].fillna(value=df['count_beds'].mean(), inplace=True)

In [15]:
df.to_excel('nih_0825.xlsx', index = False)

## Adding internet coverage by county to map background


In [10]:
import pandas as pd
df = pd.read_csv('countycode.tsv', sep = '\t')
df.to_excel('FIPS.xlsx')

In [11]:
df['FIPS'] = ''

In [12]:
cov = pd.read_csv('/Users/grantxie/Downloads/broadband_data_2020October.csv')

In [13]:
cov = cov[18:]

In [14]:
cov = cov.reset_index(drop = True)


In [58]:
cov.head()

Unnamed: 0,Data is to be used only for analysis purposes related to broadband mapping,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,AL,1001,Autauga County,0.8057,0.391
1,AL,1003,Baldwin County,0.8362,0.452
2,AL,1005,Barbour County,0.6891,0.324
3,AL,1007,Bibb County,0.3368,0.136
4,AL,1009,Blount County,0.758,0.199


In [77]:
cov

Unnamed: 0,Data is to be used only for analysis purposes related to broadband mapping,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,AL,1001,Autauga County,0.8057,0.391
1,AL,1003,Baldwin County,0.8362,0.452
2,AL,1005,Barbour County,0.6891,0.324
3,AL,1007,Bibb County,0.3368,0.136
4,AL,1009,Blount County,0.758,0.199
...,...,...,...,...,...
3137,WY,56037,Sweetwater County,0.9422,0.4
3138,WY,56039,Teton County,0.9508,0.623
3139,WY,56041,Uinta County,0.9963,0.431
3140,WY,56043,Washakie County,0.8903,0.571


In [78]:
for i in range(0, len(cov)):
    if len(cov['Unnamed: 1'][i]) == 4:
        cov['Unnamed: 1'][i] = '0' + str(cov['Unnamed: 1'][i])

In [60]:
for i in range(0, len(df)):
    df['n1'][i] = cov['Unnamed: 3'][i]
    df['n2'][i] = cov['Unnamed: 4'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [74]:
df

Unnamed: 0,county,qnode,code,FIPS,n1,n2
0,Autauga County,Q156168,1001,01001,0.8057,0.391
1,Baldwin County,Q156163,1003,01003,0.8362,0.452
2,Barbour County,Q109437,1005,01005,0.6891,0.324
3,Bibb County,Q461204,1007,01007,0.3368,0.136
4,Blount County,Q111250,1009,01009,0.758,0.199
...,...,...,...,...,...,...
3124,Sweetwater County,Q484194,56037,56037,0.8503,0.21
3125,Teton County,Q488912,56039,56039,0.7941,0.373
3126,Uinta County,Q483973,56041,56041,0.9972,0.198
3127,Washakie County,Q112846,56043,56043,0.8722,0.381


In [85]:
out

Unnamed: 0,cluster,org_name,org_node,org_lat,org_long,org_inc,hosp_name,hosp_node,hosp_lat,hosp_long,hosp_inc,hosp_bed,subject1,subject2,subject3,subject4,subject5,FIPS,n1,n2
0,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'phosphorylation'@en,'Toll-like receptor'@en,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,01001,0.8057,0.391
1,cluster_0_0_0_3_12_20_131_236_1292,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'Toll-like receptor'@en,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,01003,0.8362,0.452
2,cluster_0_0_0_3_12_19_26_39_990,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'UCSF Medical Center'@en,Q7864122,37.763200,-122.458000,1907.0,796.000000,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,'database'@en,01005,0.6891,0.324
3,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,'database'@en,'Alzheimer\'s disease'@en,01007,0.3368,0.136
4,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'RNA sequencing'@en,'database'@en,'Alzheimer\'s disease'@en,'statistics'@en,'inflammation'@en,01009,0.758,0.199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,cluster_0_0_0_3_12_20_27_233_494,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'RNA sequencing'@en,'apoptotic process'@en,'molecular chaperones'@en,'schizophrenia'@en,'BH3 interacting domain death agonist'@en,17045,0.9221,0.173
618,cluster_0_1_1_12_29_57_94_232_1261,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'apoptotic process'@en,'molecular chaperones'@en,'schizophrenia'@en,'BH3 interacting domain death agonist'@en,'Bcl-xL'@en,17047,0.7801,0.188
619,cluster_0_0_0_3_12_20_27_233_494,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'schizophrenia'@en,'BH3 interacting domain death agonist'@en,'Bcl-xL'@en,'Chemical modulation of chaperone-mediated aut...,'Fanconi syndrome'@en,17049,0.9528,0.586
620,cluster_0_1_1_12_29_57_94_232_1261,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'BH3 interacting domain death agonist'@en,'Bcl-xL'@en,'Chemical modulation of chaperone-mediated aut...,'Fanconi syndrome'@en,'autoinhibition'@en,17051,0.5575,0.198


In [89]:
row = pd.DataFrame([[np.nan] * len(out.columns)], columns=out.columns)
out = out.append(row, ignore_index=True)

In [106]:
for i in range(0, 3000):
    out = out.append(row, ignore_index=True)
    

In [99]:
out = pd.read_excel('/Users/grantxie/Downloads/NIH/top5_subject.xlsx')

In [100]:
out['FIPS'] = ''
out['n1'] = 0.0
out['n2'] = 0.0

In [101]:
out['county'] = ''

In [102]:
out.append(pd.Series(), ignore_index=True)
out

  """Entry point for launching an IPython kernel.


Unnamed: 0,cluster,org_name,org_node,org_lat,org_long,org_inc,hosp_name,hosp_node,hosp_lat,hosp_long,...,subject5_count,subject1_cost,subject2_cost,subject3_cost,subject4_cost,subject5_cost,FIPS,n1,n2,county
0,cluster_0_1_3_11_25_80_153_279_1164,"'University of California, Los Angeles'@en",Q174710,34.072222,-118.444167,1919.0,'Spaulding Hospital'@en,Q30270420,42.375748,-71.107052,...,24200,1.938386e+11,1.057301e+11,8.268639e+10,1.626617e+10,1.491066e+10,,0.0,0.0,
1,cluster_0_1_3_11_25_80_153_279_1164,'Harvard University'@en,Q13371,42.374444,-71.116944,1636.0,'Spaulding Hospital'@en,Q30270420,42.375748,-71.107052,...,66024,1.938386e+11,1.057301e+11,1.803773e+12,8.268639e+10,7.284469e+11,,0.0,0.0,
2,cluster_0_0_2_2_11_1_15_90_574,'University of Texas MD Anderson Cancer Center...,Q1525831,29.707800,-95.397500,1941.0,'University of Texas MD Anderson Cancer Center...,Q1525831,29.707800,-95.397500,...,28998,1.482875e+11,7.544451e+10,7.544451e+10,6.763990e+10,2.341381e+10,,0.0,0.0,
3,cluster_0_1_1_12_27_75_144_358_1672,'Harvard University'@en,Q13371,42.374444,-71.116944,1636.0,'Spaulding Hospital'@en,Q30270420,42.375748,-71.107052,...,44016,1.803773e+12,8.268639e+10,7.284469e+11,5.203192e+11,4.856313e+11,,0.0,0.0,
4,cluster_0_1_4_7_21_36_48_78_451,'University of Pittsburgh Graduate School of P...,Q7896126,40.442600,-79.958200,1948.0,'Center for Emergency Medicine of Western Penn...,Q5059612,40.439200,-79.958900,...,64240,6.762670e+10,4.830478e+10,3.355911e+10,2.898287e+10,2.796593e+10,,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,cluster_0_1_3_9_28_49_65_104_870,'University of Washington'@en,Q219563,47.654167,-122.308056,1861.0,'University of Washington Medical Center'@en,Q7896576,47.648923,-122.306595,...,90,3.228399e+07,3.228399e+07,3.228399e+07,3.228399e+07,3.228399e+07,,0.0,0.0,
618,cluster_0_1_3_11_25_81_158_385_1488,'Harvard University'@en,Q13371,42.374444,-71.116944,1636.0,'UCSF Medical Center'@en,Q7864122,37.763200,-122.458000,...,65,5.368876e+07,5.368876e+07,5.368876e+07,5.368876e+07,5.368876e+07,,0.0,0.0,
619,cluster_0_1_3_11_25_81_158_385_1488,"'University of California, San Francisco'@en",Q1061104,37.763319,-122.458539,1873.0,'UCSF Medical Center'@en,Q7864122,37.763200,-122.458000,...,65,5.368876e+07,5.368876e+07,5.368876e+07,5.368876e+07,5.368876e+07,,0.0,0.0,
620,cluster_0_0_2_1_19_31_45_398_1684,"'University of California, San Diego'@en",Q622664,32.881000,-117.238000,1960.0,'Jacobs Medical Center'@en,Q38250501,32.877703,-117.226499,...,314600,8.909118e+06,8.909118e+06,8.909118e+06,8.909118e+06,1.938386e+11,,0.0,0.0,


In [107]:
for i in range(0, len(df)):
    out['FIPS'][i] = df['FIPS'][i]
    out['n1'][i] =df['n1'][i]
    out['n2'][i] = df['n2'][i]
    out['county'][i] = df['county'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [94]:
out

Unnamed: 0,cluster,org_name,org_node,org_lat,org_long,org_inc,hosp_name,hosp_node,hosp_lat,hosp_long,hosp_inc,hosp_bed,subject1,subject2,subject3,subject4,subject5,FIPS,n1,n2
0,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'phosphorylation'@en,'Toll-like receptor'@en,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,01001,0.8057,0.391
1,cluster_0_0_0_3_12_20_131_236_1292,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'Toll-like receptor'@en,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,01003,0.8362,0.452
2,cluster_0_0_0_3_12_19_26_39_990,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'UCSF Medical Center'@en,Q7864122,37.763200,-122.458000,1907.0,796.000000,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,'database'@en,01005,0.6891,0.324
3,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,'database'@en,'Alzheimer\'s disease'@en,01007,0.3368,0.136
4,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'RNA sequencing'@en,'database'@en,'Alzheimer\'s disease'@en,'statistics'@en,'inflammation'@en,01009,0.758,0.199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3618,,,,,,,,,,,,,,,,,,,,
3619,,,,,,,,,,,,,,,,,,,,
3620,,,,,,,,,,,,,,,,,,,,
3621,,,,,,,,,,,,,,,,,,,,


In [108]:
for i in range(0, len(cov)):
    out.FIPS[i] = cov['Unnamed: 1'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [82]:
out[out['FIPS'] != '']

Unnamed: 0,cluster,org_name,org_node,org_lat,org_long,org_inc,hosp_name,hosp_node,hosp_lat,hosp_long,hosp_inc,hosp_bed,subject1,subject2,subject3,subject4,subject5,FIPS,n1,n2
0,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'phosphorylation'@en,'Toll-like receptor'@en,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,01001,0.8057,0.391
1,cluster_0_0_0_3_12_20_131_236_1292,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'Toll-like receptor'@en,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,01003,0.8362,0.452
2,cluster_0_0_0_3_12_19_26_39_990,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'UCSF Medical Center'@en,Q7864122,37.763200,-122.458000,1907.0,796.000000,'CRISPR'@en,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,'database'@en,01005,0.6891,0.324
3,cluster_0_0_0_5_34_82_165_315_1172,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'corpus striatum'@en,'dopamine'@en,'RNA sequencing'@en,'database'@en,'Alzheimer\'s disease'@en,01007,0.3368,0.136
4,cluster_0_1_1_12_27_47_75_129_446,'Yale University'@en,Q49112,41.311111,-72.926667,1701.0,'Smilow Cancer Hospital'@en,Q50037095,41.304278,-72.935676,2009.0,657.884393,'RNA sequencing'@en,'database'@en,'Alzheimer\'s disease'@en,'statistics'@en,'inflammation'@en,01009,0.758,0.199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,cluster_0_0_0_3_12_20_27_233_494,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'RNA sequencing'@en,'apoptotic process'@en,'molecular chaperones'@en,'schizophrenia'@en,'BH3 interacting domain death agonist'@en,17045,0.9221,0.173
618,cluster_0_1_1_12_29_57_94_232_1261,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'apoptotic process'@en,'molecular chaperones'@en,'schizophrenia'@en,'BH3 interacting domain death agonist'@en,'Bcl-xL'@en,17047,0.7801,0.188
619,cluster_0_0_0_3_12_20_27_233_494,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'schizophrenia'@en,'BH3 interacting domain death agonist'@en,'Bcl-xL'@en,'Chemical modulation of chaperone-mediated aut...,'Fanconi syndrome'@en,17049,0.9528,0.586
620,cluster_0_1_1_12_29_57_94_232_1261,'Albert Einstein College of Medicine'@en,Q2030894,40.850852,-73.844949,1955.0,"'Calvary Hospital, Bronx'@en",Q5024190,40.847986,-73.843853,1899.0,225.000000,'BH3 interacting domain death agonist'@en,'Bcl-xL'@en,'Chemical modulation of chaperone-mediated aut...,'Fanconi syndrome'@en,'autoinhibition'@en,17051,0.5575,0.198


In [109]:
out.to_csv('/Users/grantxie/Downloads/NIH/nih_0909_new.tsv', index = False, sep ='\t')