In [1]:
import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 100)


In [2]:
def add_taxonomy(dataframe):
    """Attach each provider's primary taxonomy code as a ``Taxonomy_Code`` column.

    Walks the 15 ``Healthcare Provider Taxonomy Code_<i>`` /
    ``Healthcare Provider Primary Taxonomy Switch_<i>`` column pairs. For every
    row whose switch equals ``'Y'`` the code in the same slot is recorded against
    that row's ``NPI``. When an NPI is flagged in several slots, the
    highest-numbered slot overwrites the earlier ones.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``NPI`` and all 15 code/switch column pairs.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` left-merged on ``NPI`` with the collected codes; rows with
        no flagged slot get a missing ``Taxonomy_Code``.
    """
    primary_code_by_npi = {}
    for slot in range(1, 16):
        is_primary = dataframe[f'Healthcare Provider Primary Taxonomy Switch_{slot}'] == 'Y'
        flagged_npis = dataframe.loc[is_primary, 'NPI'].tolist()
        flagged_codes = dataframe.loc[is_primary, f'Healthcare Provider Taxonomy Code_{slot}']
        # Later slots (and later rows) overwrite earlier entries for the same NPI.
        primary_code_by_npi.update(zip(flagged_npis, flagged_codes))

    taxonomy_df = pd.DataFrame(
        list(primary_code_by_npi.items()),
        columns=['NPI', 'Taxonomy_Code'],
    )
    return dataframe.merge(taxonomy_df, how='left', on='NPI')

In [3]:
def Convert_strings_to_nan(value):
    """Coerce ``value`` to ``float``, returning NaN when it cannot be converted.

    Parameters
    ----------
    value : object
        Typically a postal-code cell: a number, a numeric string, or free text.

    Returns
    -------
    float
        ``float(value)`` when the conversion succeeds, otherwise ``numpy.nan``.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text such as '37232-0001'.
        # TypeError: objects float() rejects outright, e.g. None.
        return np.nan

In [4]:
def fix_zipcode(series):
    """Normalize numeric ZIP / ZIP+4 values to 5-character, zero-padded strings.

    Values with more than five digits are treated as ZIP+4 and left-padded to
    nine digits; shorter values are left-padded to five. The first five
    characters are returned. Missing values become ``'00000'``.

    The values are cast to integers before formatting, so float input such as
    ``2138.0`` yields ``'02138'`` instead of picking up the ``'.0'`` suffix of
    the float's string form.

    Parameters
    ----------
    series : pandas.Series
        Numeric values, or strings that ``pandas.to_numeric`` can parse.

    Returns
    -------
    pandas.Series
        Strings of length five, aligned with the index of ``series``.
    """
    numeric = pd.to_numeric(series).fillna(0)
    # Integer cast first: formatting a float directly would include '.0'.
    digits = numeric.astype('int64').astype(str)
    # More than five digits means ZIP+4 whose leading zeros were lost.
    padded = digits.str.zfill(9).where(digits.str.len() > 5, digits.str.zfill(5))
    return padded.str[:5]

In [5]:
# Columns read from the NPPES file: provider identity, practice-location
# address, and the 15 taxonomy code / primary-switch pairs (Code_1, Switch_1,
# Code_2, Switch_2, ... in that order).
columns_to_keep = [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'Provider Credential Text',
    'Provider First Line Business Practice Location Address',
    'Provider Second Line Business Practice Location Address',
    'Provider Business Practice Location Address City Name',
    'Provider Business Practice Location Address State Name',
    'Provider Business Practice Location Address Postal Code',
    *(
        f"Healthcare Provider {field}_{slot}"
        for slot in range(1, 16)
        for field in ("Taxonomy Code", "Primary Taxonomy Switch")
    ),
]

In [6]:
# ZIP-to-CBSA crosswalk. Elsewhere in this notebook only its 'ZIP' and 'CBSA'
# columns are referenced.
cbsa = pd.read_csv('../data/ZIP_CBSA_122023.xlsx - Export Worksheet.csv')
# Taxonomy lookup table. The (commented-out) load cells join on its 'Code'
# column to pick up 'Classification'.
taxonomy_code_classification =  pd.read_csv('../data/nucc_taxonomy_240.csv')

In [7]:
# Add a 5-character string ZIP column to the crosswalk (mutates `cbsa` in place).
cbsa['zipcodes'] = fix_zipcode(cbsa['ZIP'])


In [8]:
# db = sqlite3.connect('../data/npi.sqlite')
# hop = []
# for chunk in pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize=10000):
#      chunk = chunk[(chunk['transaction_count'] > 50) & (chunk['average_day_wait'] < 50)]
#      hop.append(chunk)
# hop_df = pd.concat(hop, ignore_index=True)
# hop_df.to_sql('hop_npi', db, if_exists='replace', index=False)
# db.execute('CREATE INDEX IF NOT EXISTS from_npi ON hop_npi(from_npi)')
# db.close()

In [9]:
# db = sqlite3.connect('../data/npi.sqlite')

# chunks = []
# for chunk in pd.read_csv('../data/npidata_pfile_20050523-20240211.csv', usecols=columns_to_keep, chunksize=10000):
#     chunk_taxonomy = add_taxonomy(chunk)
#     chunk_merged = pd.merge(left=chunk_taxonomy, right=taxonomy_code_classification[['Code', 'Classification']].set_index('Code'), how='left', left_on='Taxonomy_Code', right_index=True)
#     chunk_merged['Provider Business Practice Location Address Postal Code'] = chunk_merged['Provider Business Practice Location Address Postal Code'].apply(Convert_strings_to_nan)
#     chunk_merged['zipcodes'] = fix_zipcode(chunk_merged['Provider Business Practice Location Address Postal Code'])
#     chunk_merged_cbsa = pd.merge(left=chunk_merged, right=cbsa[['zipcodes', 'CBSA']].set_index('zipcodes'), how='left', left_on='zipcodes', right_index=True)
#     chunk_merged_cbsa['CBSA'] = chunk_merged_cbsa['CBSA'].fillna(0).astype(int)
#     chunks.append(chunk_merged_cbsa)

# npi = pd.concat(chunks, ignore_index=True)

# npi.to_sql('npi_nppes', db, if_exists = 'replace', index = False)

# db.execute('CREATE INDEX IF NOT EXISTS nppes_npi ON npi_nppes(NPI)')
# db.close()

In [10]:
# Query confirming the number of rows and columns in the hop_npi table
# db = sqlite3.connect('../data/npi.sqlite')
# query = "SELECT * FROM hop_npi" #WHERE patient_count > 1000"
# hop_teams = pd.read_sql(query, db)
# db.close()
# hop_teams

In [11]:
# Query confirming the number of rows and columns in the npi_nppes table
# db = sqlite3.connect('../data/npi.sqlite')
# query = "SELECT * FROM npi_nppes" #WHERE patient_count > 1000"
# nppes_teams = pd.read_sql(query, db)
# db.close()
# nppes_teams

In [12]:
# Find every organization whose legal business name mentions Vanderbilt.
# SQLite's LIKE is case-insensitive for ASCII, so this also matches 'VANDERBILT'.
query = """
SELECT *
FROM npi_nppes
WHERE "Provider Organization Name (Legal Business Name)" LIKE '%Vanderbilt%'
"""
# A sqlite3 connection used directly as a context manager only commits or
# rolls back; closing() is what actually closes it when the block exits.
with closing(sqlite3.connect('../data/npi.sqlite')) as db:
    finding_vanderbilt = pd.read_sql(query, db)

finding_vanderbilt

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,Healthcare Provider Taxonomy Code_1,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Taxonomy Code_2,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Taxonomy Code_3,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Taxonomy Code_4,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Taxonomy Code_5,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Taxonomy Code_6,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Taxonomy Code_7,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Taxonomy Code_8,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Taxonomy Code_9,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Taxonomy Code_10,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Taxonomy Code_11,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Taxonomy_Code,Classification,zipcodes,CBSA
0,1124017959,2.0,VANDERBILT UNIVERSITY MEDICAL SCHOOL,,,,,,,VANDERBILT UNIVERSITY DIVISION OF MEDICAL GENE...,DD 2205 MCN,NASHVILLE,TN,372320001.0,170300000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,170300000X,"Genetic Counselor, MS",37232,34980
1,1598738205,2.0,VANDERBILT STALLWORTH REHABILITATION HOSPITAL LP,,,,,,,2201 CHILDRENS WAY,,NASHVILLE,TN,372123164.0,283X00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,283X00000X,Rehabilitation Hospital,37212,34980
2,1992770119,2.0,VANDERBILT MEDICAL CENTER,,,,,,,S-3414 MEDICAL CENTER NORTH,C/O INTERNAL MEDICINE DEPT,NASHVILLE,TN,372320001.0,282N00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,282N00000X,General Acute Care Hospital,37232,34980
3,1952356065,2.0,VANDERBILT UNIVERSITY,,,,,,,3601 THE VANDERBILT CLINIC,,NASHVILLE,TN,372325100.0,204E00000X,N,204F00000X,N,207L00000X,N,207P00000X,N,207Q00000X,N,207V00000X,N,207W00000X,N,207X00000X,N,207Y00000X,N,208000000X,N,2084N0400X,N,2084P0800X,N,208600000X,N,208800000X,N,207R00000X,Y,207R00000X,Internal Medicine,37232,34980
4,1821030842,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,500 PARK AVENUE,,LEBANON,TN,370873721.0,273R00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,273R00000X,Psychiatric Unit,37087,34980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,1265210843,2.0,"VANDERBILT INTEGRATED PROVIDERS, LLC",,,,,,,1440 CEDAR LN STE 200,,TULLAHOMA,TN,373882486.0,261QR1300X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,261QR1300X,Clinic/Center,37388,43180
154,1164200747,2.0,"VANDERBILT INTEGRATED PROVIDERS, LLC",,,,,,,1805 N JACKSON ST STE 100,,TULLAHOMA,TN,373882291.0,261QR1300X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,261QR1300X,Clinic/Center,37388,46100
155,1164200747,2.0,"VANDERBILT INTEGRATED PROVIDERS, LLC",,,,,,,1805 N JACKSON ST STE 100,,TULLAHOMA,TN,373882291.0,261QR1300X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,261QR1300X,Clinic/Center,37388,43180
156,1891573473,2.0,"VANDERBILT INTEGRATED PROVIDERS, LLC",,,,,,,1330 CEDAR LN STE 100,,TULLAHOMA,TN,373882284.0,261QR1300X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,261QR1300X,Clinic/Center,37388,46100


In [13]:
# Rows whose legal business name is exactly 'VANDERBILT UNIVERSITY MEDICAL CENTER'.
query = """
SELECT *
FROM npi_nppes
WHERE "Provider Organization Name (Legal Business Name)" = 'VANDERBILT UNIVERSITY MEDICAL CENTER'
"""
# A sqlite3 connection used directly as a context manager only commits or
# rolls back; closing() is what actually closes it when the block exits.
with closing(sqlite3.connect('../data/npi.sqlite')) as db:
    The_vanderbilt = pd.read_sql(query, db)

The_vanderbilt

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,Healthcare Provider Taxonomy Code_1,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Taxonomy Code_2,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Taxonomy Code_3,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Taxonomy Code_4,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Taxonomy Code_5,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Taxonomy Code_6,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Taxonomy Code_7,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Taxonomy Code_8,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Taxonomy Code_9,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Taxonomy Code_10,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Taxonomy Code_11,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Taxonomy_Code,Classification,zipcodes,CBSA
0,1821030842,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,500 PARK AVENUE,,LEBANON,TN,370873721.0,273R00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,273R00000X,Psychiatric Unit,37087,34980
1,1215979190,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,500 PARK AVENUE,,LEBANON,TN,370873721.0,273Y00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,273Y00000X,Rehabilitation Unit,37087,34980
2,1306889597,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,1411 W. BADDOUR PARKWAY,,LEBANON,TN,370872513.0,282N00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,282N00000X,General Acute Care Hospital,37087,34980
3,1700800877,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,2200 CHILDRENS WAY,ROOM 2106A,NASHVILLE,TN,372329650.0,3336C0003X,N,3336S0011X,N,3336I0012X,Y,,,,,,,,,,,,,,,,,,,,,,,,,3336I0012X,Pharmacy,37232,34980
4,1700950284,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,1215 21ST AVE S,3108 MEDICAL CENTER EAST,NASHVILLE,TN,372320014.0,282NC0060X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,282NC0060X,General Acute Care Hospital,37232,34980
5,1558408633,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,1601 23RD AVE S,,NASHVILLE,TN,372123133.0,283Q00000X,N,282N00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,282N00000X,General Acute Care Hospital,37212,34980
6,1396882205,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,1211 MEDICAL CENTER DRIVE,,NASHVILLE,TN,372320004.0,273R00000X,N,282N00000X,N,282NC2000X,N,283Q00000X,N,291U00000X,N,3416A0800X,N,3416L0300X,N,282N00000X,Y,,,,,,,,,,,,,,,282N00000X,General Acute Care Hospital,37232,34980
7,1215078027,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,1601 23RD AVE S,SUITE 1096,NASHVILLE,TN,372320001.0,101Y00000X,N,261QM0801X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,261QM0801X,Clinic/Center,37232,34980
8,1053449660,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,20 RACHEL DRIVE,,NASHVILLE,TN,372143609.0,261Q00000X,N,282N00000X,N,261QE0700X,Y,,,,,,,,,,,,,,,,,,,,,,,,,261QE0700X,Clinic/Center,37214,34980
9,1740319847,2.0,VANDERBILT UNIVERSITY MEDICAL CENTER,,,,,,,2906 FOSTER CREIGHTON DR STE 100,,NASHVILLE,TN,372043733.0,261Q00000X,N,282N00000X,N,261QE0700X,Y,,,,,,,,,,,,,,,,,,,,,,,,,261QE0700X,Clinic/Center,37204,34980


In [29]:
# Nashville Family Medicine providers, ranked by total referral transactions.
#
# npi_nppes can hold several rows for one NPI (one per CBSA matched to its ZIP),
# so the providers are de-duplicated in a subquery before joining; otherwise
# each hop_npi row would be summed once per duplicate and the totals inflated.
# String literals use single quotes: SQLite reads double quotes as identifiers.
query = """
SELECT nppes."NPI" AS "NPI",
       nppes."Provider Organization Name (Legal Business Name)"
           AS "Provider Organization Name (Legal Business Name)",
       nppes."Classification" AS "Classification",
       SUM(hop."patient_count") AS "Amount of Patients",
       SUM(hop."transaction_count") AS Total_referrals
FROM (
    SELECT DISTINCT "NPI",
                    "Provider Organization Name (Legal Business Name)",
                    "Classification"
    FROM npi_nppes
    WHERE "Classification" = 'Family Medicine'
      AND "Provider Business Practice Location Address City Name" LIKE '%NASHVILLE%'
) AS nppes
INNER JOIN hop_npi AS hop
ON nppes."NPI" = hop.from_npi
GROUP BY 1, 2, 3
ORDER BY Total_referrals DESC;
"""
# A sqlite3 connection used directly as a context manager only commits or
# rolls back; closing() is what actually closes it when the block exits.
with closing(sqlite3.connect('../data/npi.sqlite')) as db:
    family_med = pd.read_sql(query, db)

family_med

Unnamed: 0,NPI,CBSA,Provider Organization Name (Legal Business Name),Classification,Amount of Patients,Total_referrals
0,1700873171,34980,COLUMBIA MEDICAL GROUP-SOUTHERN HILLS INC,Family Medicine,25578,46937
1,1518962968,26260,,Family Medicine,6597,16790
2,1518962968,99999,,Family Medicine,6597,16790
3,1386865442,26260,FERGUSON HEALTH CARE INC,Family Medicine,5369,13653
4,1386865442,99999,FERGUSON HEALTH CARE INC,Family Medicine,5369,13653
...,...,...,...,...,...,...
151,1790710978,34980,,Family Medicine,48,58
152,1639221641,34980,,Family Medicine,18,57
153,1215364633,34980,"AMERICA CARES TRUST, INC",Family Medicine,18,56
154,1487941522,34980,,Family Medicine,23,56


In [27]:
# Nashville Orthopaedic Surgery providers, ranked by summed patient count.
# Grouping includes cbsa, so an NPI listed under several CBSAs appears once per
# CBSA, each row carrying that NPI's full totals.
# String literals use single quotes: SQLite reads double quotes as identifiers.
query = """
SELECT "NPI", "cbsa" ,"Provider Organization Name (Legal Business Name)", "Classification", "from_npi", SUM("patient_count") AS "Amount of Patients", SUM("transaction_count") AS Total_Patients_referrals
FROM npi_nppes AS nppes
INNER JOIN hop_npi AS npi
ON nppes.NPI = npi.from_npi
WHERE "Classification" = 'Orthopaedic Surgery' AND "Provider Business Practice Location Address City Name" LIKE '%NASHVILLE%'
GROUP BY 1,2,3,4
ORDER BY 6 DESC;
"""
# A sqlite3 connection used directly as a context manager only commits or
# rolls back; closing() is what actually closes it when the block exits.
with closing(sqlite3.connect('../data/npi.sqlite')) as db:
    ortho_surgery = pd.read_sql(query, db)

ortho_surgery

Unnamed: 0,NPI,Provider Organization Name (Legal Business Name),Classification,from_npi,Amount of Patients,Total_Patients_refered
0,1538153937,TENNESSEE ORTHOPAEDIC ALLIANCE PA,Orthopaedic Surgery,1538153937,187401,306549
1,1972577351,PREMIER ORTHOPAEDICS & SPORTS MEDICINE PLC,Orthopaedic Surgery,1972577351,36614,63115
2,1164834057,"TRISTAR JOINT REPLACEMENT INSTITUTE, LLC",Orthopaedic Surgery,1164834057,20673,31116
3,1528050671,,Orthopaedic Surgery,1528050671,4094,6639
4,1245222397,,Orthopaedic Surgery,1245222397,3785,6026
...,...,...,...,...,...,...
95,1497855456,,Orthopaedic Surgery,1497855456,81,278
96,1346753837,"ACTION SPINE & JOINT, LLC.",Orthopaedic Surgery,1346753837,74,396
97,1588701999,,Orthopaedic Surgery,1588701999,53,92
98,1720354343,,Orthopaedic Surgery,1720354343,47,77


In [26]:
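# NOTE(review): if this cell is revived, parenthesise the OR. AND binds tighter
# than OR, so as written the Nashville filter applies only to 'Family Medicine'
# and every 'Orthopaedic Surgery' row is returned regardless of city.
# query = """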
# SELECT "NPI", "Provider Organization Name (Legal Business Name)", "Classification", "from_npi", SUM("patient_count"), SUM("transaction_count")
# FROM npi_nppes AS nppes
# INNER JOIN hop_npi AS npi
# ON nppes.NPI = npi.from_npi
# WHERE "Classification" = 'Orthopaedic Surgery' OR "Classification" = 'Family Medicine' AND "Provider Business Practice Location Address City Name" LIKE "%NASHVILLE%"
# GROUP BY 1,2,3,4
# ORDER BY 5 DESC;
# """
# with sqlite3.connect('../data/npi.sqlite') as db:
#     two_classifications_2 = pd.read_sql(query, db)
    
# two_classifications_2

Unnamed: 0,NPI,Provider Organization Name (Legal Business Name),Classification,from_npi,"SUM(""patient_count"")","SUM(""transaction_count"")"
0,1144513375,"ST. VINCENT MEDICAL GROUP, INC.",Orthopaedic Surgery,1144513375,1244365,2012909
1,1376592964,EMERGEORTHO PA,Orthopaedic Surgery,1376592964,606427,1043588
2,1437382074,"COMMUNITY MEDICAL ASSOCIATES, INC.",Orthopaedic Surgery,1437382074,488356,737114
3,1497704217,"ORTHOCAROLINA, PA",Orthopaedic Surgery,1497704217,479617,770421
4,1205922432,ILLINOIS BONE AND JOINT INSTITUTE LLC,Orthopaedic Surgery,1205922432,462404,818764
...,...,...,...,...,...,...
25817,1972866457,"ALLEN ORTHOPEDICS,LLC",Orthopaedic Surgery,1972866457,18,56
25818,1629034137,,Orthopaedic Surgery,1629034137,16,87
25819,1770945149,,Family Medicine,1770945149,14,66
25820,1073886578,"INTEGRATED PHYSICAL MEDICINE, INC.",Orthopaedic Surgery,1073886578,13,109


In [20]:
# query = """
# SELECT *
# FROM hop_npi
# LIMIT 1;
# """
# with sqlite3.connect('../data/npi.sqlite') as db:
#     npi_columns = pd.read_sql(query, db)
    
# npi_columns

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508085911,1730166125,58,67,23.925,43.923


In [4]:
# db = sqlite3.connect('../data/npi.sqlite')

# query = 'SELECT "Provider Organization Name (Legal Business Name)", "Provider Business Practice Location Address City Name", "NPI", "Classification", "Taxonomy_Code" FROM npi_nppes WHERE "Provider Business Practice Location Address City Name" LIKE "%NASHVILLE%"'

# Nashville_Practices = pd.read_sql(query, db)

# db.close()
#Nashville_Practices

In [None]:
# #db = sqlite3.connect('data/npi.sqlite')
# for chunk in pd.read_csv('data/NPPES_Data_Dissemination_February_2024/npidata_pfile_20050523-20240211.csv', 
#                           usecols= columns_to_keep,
#                               chunksize = 10000):
#       chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
#     chunk_taxonomy = add_taxonomy(chunk)
#     chunk_merged = pd.merge(left = chunk_taxonomy, 
#                             right = taxonomy_code_classification[['Code', 'Classification']].set_index('Code'), 
#                             how = 'left',
#                             left_on = 'Taxonomy_Code',
#                             right_index = True)
#     chunk_merged['Provider Business Practice Location Address Postal Code'] = chunk_merged['Provider Business Practice Location Address Postal Code'].apply(Convert_strings_to_nan)
#     chunk_merged['zipcodes']=fix_zipcode(
#         chunk_merged['Provider Business Practice Location Address Postal Code']
#         )
#     chunk_merged_cbsa = pd.merge(left = chunk_merged, 
#          right = cbsa[['zipcodes','CBSA']].set_index('zipcodes'), 
#          how = 'left',
#          left_on = 'zipcodes', 
#          right_index = True)
    
#     chunk_merged_cbsa['CBSA']= chunk_merged_cbsa['CBSA'].fillna(0).astype(int)
    
#     chunk_merged_cbsa.to_sql('nppes', 
#                 db, 
#                 if_exists = 'append', 
#                 index = False)  

# #db.execute('CREATE INDEX NPI ON nppes(NPI)')
# db.close()

In [None]:
# chunks = []

# for chunk in pd.read_csv('../data/npidata_pfile_20050523-20240211.csv', 
#                          usecols=columns_to_keep, 
#                          chunksize=10000):
#     #chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
#     chunk_taxonomy = add_taxonomy(chunk)
#     chunk_merged = pd.merge(left = chunk_taxonomy, right = taxonomy_code_classification[['Code', 'Classification']].set_index('Code'), how ='left', left_on ='Taxonomy_Code', right_index = True)
#     chunk_merged['Provider Business Practice Location Address Postal Code'] = chunk_merged['Provider Business Practice Location Address Postal Code'].apply(Convert_strings_to_nan)
#     chunk_merged['zipcodes'] = fix_zipcode(chunk_merged['Provider Business Practice Location Address Postal Code'])
#     chunk_merged_cbsa = pd.merge(left = chunk_merged, right = cbsa[['zipcodes','CBSA']].set_index('zipcodes'), how ='left', left_on ='zipcodes', right_index = True)
#     chunk_merged_cbsa['CBSA'] = chunk_merged_cbsa['CBSA'].fillna(0).astype(int)
#     chunks.append(chunk_merged_cbsa)

# # Concatenate all chunks into a single DataFrame
# npi = pd.concat(chunks, ignore_index=True)

In [None]:
# hop = []
# chunks = pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', 
#                               chunksize = 10000)
# for chunk in chunks:
#     chunk = chunk[chunk['transaction_count']>50]
#     chunk = chunk[chunk['average_day_wait']<50]
#     hop.append(chunk)
    
# hop = pd.concat(hop, ignore_index = True)

In [None]:
# db = sqlite3.connect('../data/npi.sqlite')

# hop = []
# for chunk in pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize=10000):
# chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
#     chunk = chunk[(chunk['transaction_count'] > 50) & (chunk['average_day_wait'] < 50)]
#     hop.append(chunk)

# hop_df = pd.concat(hop, ignore_index=True)

# hop_df.to_sql('hop_npi', db, if_exists='replace', index=False)

# db.execute('CREATE INDEX IF NOT EXISTS from_npi ON hop_npi(from_npi)')
# db.close()

In [None]:
# db = sqlite3.connect('../data/npi.sqlite')

# chunks = []
# for chunk in pd.read_csv('../data/npidata_pfile_20050523-20240211.csv', usecols=columns_to_keep, chunksize=10000):
# chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
#     chunk_taxonomy = add_taxonomy(chunk)
#     chunk_merged = pd.merge(left=chunk_taxonomy, right=taxonomy_code_classification[['Code', 'Classification']].set_index('Code'), how='left', left_on='Taxonomy_Code', right_index=True)
#     chunk_merged['Provider Business Practice Location Address Postal Code'] = chunk_merged['Provider Business Practice Location Address Postal Code'].apply(Convert_strings_to_nan)
#     chunk_merged['zipcodes'] = fix_zipcode(chunk_merged['Provider Business Practice Location Address Postal Code'])
#     chunk_merged_cbsa = pd.merge(left=chunk_merged, right=cbsa[['zipcodes', 'CBSA']].set_index('zipcodes'), how='left', left_on='zipcodes', right_index=True)
#     chunk_merged_cbsa['CBSA'] = chunk_merged_cbsa['CBSA'].fillna(0).astype(int)
#     chunks.append(chunk_merged_cbsa)

# npi = pd.concat(chunks, ignore_index=True)

# npi.to_sql('npi_nppes', db, if_exists = 'replace', index = False)

# db.execute('CREATE INDEX IF NOT EXISTS NPI ON npi_nppes(NPI)')
# db.close()

In [None]:
#What are the major Nashville Hospitals?
    #Vanderbilt
    #Centennial
    #St. Thomas (midtown, west)
    #Nashville General
    #VA