In [38]:
import numpy as np
import pandas as pd
import sqlite3
pd.set_option('display.max_columns', 200)

In [2]:
#define a few functions 

# Functions to use to add taxonomy codes to nppes

def add_taxonomy(dataframe):
    result_values = {}
    for i in range(1, 16):
        codes_columns = f'Healthcare Provider Taxonomy Code_{i}'
        switch_columns = f'Healthcare Provider Primary Taxonomy Switch_{i}'
        
        # Check the condition and append values to the result list
        keys = dataframe['NPI'][dataframe[switch_columns]=='Y'].tolist()
        values = dataframe[codes_columns][dataframe[switch_columns]=='Y']
        for key, value in zip(keys,values):
            result_values[key] = value
            
    taxonomy_df = pd.DataFrame(list(result_values.items()), columns=['NPI', 'Taxonomy_Code'])
    nppes_merged = dataframe.merge(taxonomy_df, on = 'NPI', how = 'left')
    return nppes_merged
                


# function to clean up zipcodes

def Convert_strings_to_nan(value):
    try:
        return float(value)
    except ValueError:
        return np.nan

def fix_zipcode(series):
    # fill na with zeroes
    #change to int so the zeroes at the end gets removed
    # then change to strings and use zfill to add leading zeroes
    series = series.fillna(0).astype(float).astype(int).astype(str).str.zfill(5)
    # now i can split the strings at 5th item
    return series.str[:5]


columns_to_keep = ['NPI', 
                   'Entity Type Code', 
                   'Provider Organization Name (Legal Business Name)',
                   'Provider Last Name (Legal Name)',
                    'Provider First Name',
                    'Provider Middle Name',
                    'Provider Name Prefix Text',
                    'Provider Name Suffix Text',
                    'Provider Credential Text',       
                    'Provider First Line Business Practice Location Address',
                    'Provider Second Line Business Practice Location Address',
                    'Provider Business Practice Location Address City Name',
                    'Provider Business Practice Location Address State Name',
                    'Provider Business Practice Location Address Postal Code',
                    "Healthcare Provider Taxonomy Code_1",
                    "Healthcare Provider Primary Taxonomy Switch_1",
                    "Healthcare Provider Taxonomy Code_2",
                    "Healthcare Provider Primary Taxonomy Switch_2",
                    "Healthcare Provider Taxonomy Code_3",
                    "Healthcare Provider Primary Taxonomy Switch_3",
                    "Healthcare Provider Taxonomy Code_4",
                    "Healthcare Provider Primary Taxonomy Switch_4",
                    "Healthcare Provider Taxonomy Code_5",
                    "Healthcare Provider Primary Taxonomy Switch_5",
                    "Healthcare Provider Taxonomy Code_6",
                    "Healthcare Provider Primary Taxonomy Switch_6",
                    "Healthcare Provider Taxonomy Code_7",
                    "Healthcare Provider Primary Taxonomy Switch_7",
                    "Healthcare Provider Taxonomy Code_8",
                    "Healthcare Provider Primary Taxonomy Switch_8",
                    "Healthcare Provider Taxonomy Code_9",
                    "Healthcare Provider Primary Taxonomy Switch_9",
                    "Healthcare Provider Taxonomy Code_10",
                    "Healthcare Provider Primary Taxonomy Switch_10",
                    "Healthcare Provider Taxonomy Code_11",
                    "Healthcare Provider Primary Taxonomy Switch_11",
                    "Healthcare Provider Taxonomy Code_12",
                    "Healthcare Provider Primary Taxonomy Switch_12",
                    "Healthcare Provider Taxonomy Code_13",
                    "Healthcare Provider Primary Taxonomy Switch_13",
                    "Healthcare Provider Taxonomy Code_14",
                    "Healthcare Provider Primary Taxonomy Switch_14",
                    "Healthcare Provider Taxonomy Code_15",
                    "Healthcare Provider Primary Taxonomy Switch_15"
                    ]

In [7]:
#load files
cbsa = pd.read_excel('../data/ZIP_CBSA.xlsx')
taxonomy_code_classification =  pd.read_csv('../data/nucc_taxonomy_240.csv')

In [8]:
#fix zipcodes in cbsa data
cbsa['zipcodes'] = fix_zipcode(cbsa['ZIP'])

In [13]:
#read the npi data in chunk, filter them by some conditions and then write to sql.
db = sqlite3.connect('../data/npi.sqlite')
for chunk in pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', 
                              chunksize = 10000):
    chunk = chunk[chunk['transaction_count']>50]
    chunk = chunk[chunk['average_day_wait']<50]
    chunk.to_sql('npi', 
                db, 
                if_exists = 'append', 
                index = False)  
    
db.execute('CREATE INDEX from_npi ON npi(from_npi)')
db.close()

In [16]:
# load the nppes data to sqlite database

db = sqlite3.connect('../data/npi.sqlite')
for chunk in pd.read_csv('../data/npidata.csv', 
                          usecols= columns_to_keep,
                              chunksize = 10000):
    chunk_taxonomy = add_taxonomy(chunk)
    chunk_merged = pd.merge(left = chunk_taxonomy, 
                            right = taxonomy_code_classification[['Code', 'Classification']].set_index('Code'), 
                            how = 'left',
                            left_on = 'Taxonomy_Code',
                            right_index = True)
    chunk_merged['Provider Business Practice Location Address Postal Code'] = chunk_merged['Provider Business Practice Location Address Postal Code'].apply(Convert_strings_to_nan)
    chunk_merged['zipcodes']=fix_zipcode(
        chunk_merged['Provider Business Practice Location Address Postal Code']
        )
    chunk_merged_cbsa = pd.merge(left = chunk_merged, 
         right = cbsa[['zipcodes','CBSA']].set_index('zipcodes'), 
         how = 'left',
         left_on = 'zipcodes', 
         right_index = True)
    
    chunk_merged_cbsa['CBSA']= chunk_merged_cbsa['CBSA'].fillna(0).astype(int)
    
    chunk_merged_cbsa.to_sql('nppes', 
                db, 
                if_exists = 'append', 
                index = False)  

db.execute('CREATE INDEX NPI ON nppes(NPI)')
db.close()

  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',
  for chunk in pd.read_csv('../data/npidata.csv',


OperationalError: there is already a table named NPI

In [26]:
db = sqlite3.connect('../data/npi.sqlite')

query = '''
SELECT *
FROM nppes
LIMIT 150000
'''
postal_code = pd.read_sql(query, db)

db.close()

In [27]:
postal_code

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,...,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Taxonomy_Code,Classification,zipcodes,CBSA
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,...,,,,,,,207X00000X,Orthopaedic Surgery,68847,28260
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,...,,,,,,,207RC0000X,Internal Medicine,32204,27260
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,...,,,,,,,251G00000X,"Hospice Care, Community Based",28304,22180
3,1306849450,,,,,,,,,,...,,,,,,,,,00000,0
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,...,,,,,,,207RH0003X,Internal Medicine,77090,26420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,1841285731,1.0,,NEVILLS,KAREN,C,,,NP,531 WASHINGTON ST,...,,,,,,,363LF0000X,Nurse Practitioner,13601,48060
149996,1841285731,1.0,,NEVILLS,KAREN,C,,,NP,531 WASHINGTON ST,...,,,,,,,363LF0000X,Nurse Practitioner,13601,99999
149997,1750376646,2.0,DANIEL S DEBLASIO MD PC,,,,,,,830 WASHINGTON ST,...,,,,,,,2085R0001X,Radiology,13601,48060
149998,1750376646,2.0,DANIEL S DEBLASIO MD PC,,,,,,,830 WASHINGTON ST,...,,,,,,,2085R0001X,Radiology,13601,99999


In [24]:
db = sqlite3.connect('../data/npi.sqlite')
query = "SELECT * FROM npi WHERE patient_count > 1000"
hop_teams_selected = pd.read_sql(query, db)
db.close()

In [25]:
hop_teams_selected

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508008921,1730173154,1179,1303,0.000,0.000
1,1508874454,1730174293,1587,1814,10.043,34.372
2,1508815333,1730180589,1295,1742,10.356,35.110
3,1508804048,1730182023,1171,1744,19.586,39.585
4,1508077892,1730183286,2659,3089,25.703,48.484
...,...,...,...,...,...,...
530547,1417140864,1497853683,1271,4411,0.182,4.222
530548,1417189226,1497859649,1144,1186,0.000,0.000
530549,1417117912,1497869101,1654,3173,0.014,0.555
530550,1417919531,1497893614,1652,4291,0.303,6.786


In [32]:
db = sqlite3.connect('../data/npi.sqlite')

query = '''
SELECT * 
FROM nppes 
LIMIT 100000
'''

hop_teams_selected = pd.read_sql(query, db)
db.close()

In [33]:
hop_teams_selected

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,...,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Taxonomy_Code,Classification,zipcodes,CBSA
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,...,,,,,,,207X00000X,Orthopaedic Surgery,68847,28260
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,...,,,,,,,207RC0000X,Internal Medicine,32204,27260
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,...,,,,,,,251G00000X,"Hospice Care, Community Based",28304,22180
3,1306849450,,,,,,,,,,...,,,,,,,,,00000,0
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,...,,,,,,,207RH0003X,Internal Medicine,77090,26420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1609877745,1.0,,STONE,PAUL,D.,,,"PHARM.D., R.PH.",2001 N JEFFERSON AVE,...,,,,,,,183500000X,Pharmacist,75455,99999
99996,1518968650,1.0,,SAEGER,SCOTT,W,,,D.C.,294 HARLEYSVILLE PIKE,...,,,,,,,111N00000X,Chiropractor,18964,37980
99997,1427059567,1.0,,PESICK-CAINE,SHELLY,DEBBIE,,,M.D.,6559 WILSON MILLS RD,...,,,,,,,208000000X,Pediatrics,44143,17460
99998,1336140474,1.0,,COVINGTON,BENJAMIN,WILSON,DR.,IV,M.D.,730 EUREKA ST,...,,,,,,,207Q00000X,Family Medicine,76086,19100


In [35]:
hop_teams_selected['Classification'].value_counts()

Internal Medicine                13186
Family Medicine                   7894
Dentist                           4280
Nurse Practitioner                3771
Pediatrics                        3482
                                 ...  
Clinical Pharmacology                1
Technician                           1
Dietary Manager                      1
Chronic Disease Hospital             1
Military Health Care Provider        1
Name: Classification, Length: 174, dtype: int64

In [39]:
hop_teams_selected

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,Healthcare Provider Taxonomy Code_1,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Taxonomy Code_2,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Taxonomy Code_3,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Taxonomy Code_4,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Taxonomy Code_5,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Taxonomy Code_6,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Taxonomy Code_7,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Taxonomy Code_8,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Taxonomy Code_9,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Taxonomy Code_10,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Taxonomy Code_11,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Taxonomy_Code,Classification,zipcodes,CBSA
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,688472944.0,207X00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,207X00000X,Orthopaedic Surgery,68847,28260
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0,207RC0000X,Y,207RC0000X,N,,,,,,,,,,,,,,,,,,,,,,,,,,,207RC0000X,Internal Medicine,32204,27260
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0,251G00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,251G00000X,"Hospice Care, Community Based",28304,22180
3,1306849450,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,00000,0
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,770901243.0,174400000X,N,207RH0003X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,207RH0003X,Internal Medicine,77090,26420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1609877745,1.0,,STONE,PAUL,D.,,,"PHARM.D., R.PH.",2001 N JEFFERSON AVE,,MOUNT PLEASANT,TX,754552338.0,183500000X,N,183500000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,183500000X,Pharmacist,75455,99999
99996,1518968650,1.0,,SAEGER,SCOTT,W,,,D.C.,294 HARLEYSVILLE PIKE,POB 64684,SOUDERTON,PA,189642137.0,111N00000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,111N00000X,Chiropractor,18964,37980
99997,1427059567,1.0,,PESICK-CAINE,SHELLY,DEBBIE,,,M.D.,6559 WILSON MILLS RD,BLDG D SUITE 101,MAYFIELD VILLAGE,OH,441436402.0,208000000X,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,208000000X,Pediatrics,44143,17460
99998,1336140474,1.0,,COVINGTON,BENJAMIN,WILSON,DR.,IV,M.D.,730 EUREKA ST,,WEATHERFORD,TX,760866546.0,207Q00000X,Y,207P00000X,N,208M00000X,N,,,,,,,,,,,,,,,,,,,,,,,,,207Q00000X,Family Medicine,76086,19100


In [40]:
db.close()