In [1]:
#Load data into sqlite database
import numpy as np
import pandas as pd
import sqlite3

#### Define a few functions

In [2]:
# Helper that adds each provider's primary taxonomy code to an NPPES dataframe

def add_taxonomy(dataframe):
    """Return ``dataframe`` with a ``Taxonomy_Code`` column holding the primary code.

    The 15 ``Healthcare Provider Taxonomy Code_<i>`` /
    ``Healthcare Provider Primary Taxonomy Switch_<i>`` column pairs are
    scanned in order. For every row whose switch equals ``'Y'`` the matching
    code is recorded against that row's NPI. If an NPI is flagged in more
    than one slot, the highest-numbered slot wins.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``NPI`` and all 15 code/switch column pairs.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` left-merged on ``NPI`` with the NPI -> code lookup;
        NPIs without any flagged slot get a missing ``Taxonomy_Code``.
    """
    primary_by_npi = {}
    for slot in range(1, 16):
        is_primary = dataframe[f'Healthcare Provider Primary Taxonomy Switch_{slot}'] == 'Y'
        flagged_npis = dataframe.loc[is_primary, 'NPI'].tolist()
        flagged_codes = dataframe.loc[is_primary, f'Healthcare Provider Taxonomy Code_{slot}']
        # Later slots overwrite earlier ones for the same NPI.
        primary_by_npi.update(zip(flagged_npis, flagged_codes))

    lookup = pd.DataFrame(list(primary_by_npi.items()), columns=['NPI', 'Taxonomy_Code'])
    return dataframe.merge(lookup, on='NPI', how='left')

In [3]:

# function to clean up zipcodes

def Convert_strings_to_nan(value):
    """Convert ``value`` to a float, returning NaN when it cannot be converted.

    Parameters
    ----------
    value : object
        Anything ``float()`` may accept (numbers, numeric strings) or reject
        (non-numeric strings, ``None``, other objects).

    Returns
    -------
    float
        ``float(value)`` on success, otherwise ``numpy.nan``.
    """
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: unparsable string. TypeError: None or another object
        # float() does not accept at all.
        return np.nan


In [4]:
# Newer zip-code cleaner: also handles values longer than five digits (e.g. 8-digit codes) by left-padding them to nine digits
def fix_zipcode(series):
    """Normalise postal codes to 5-character, zero-padded strings.

    Values are coerced to numbers first, so anything that is not purely
    numeric (for example ``'37203-1234'``) is treated as missing. Missing
    values become ``'00000'``. Numbers with more than five digits are
    left-padded to nine digits before the first five are taken; shorter
    numbers are left-padded to five digits.

    The numeric value is cast to an integer before formatting so that float
    input such as ``501.0`` yields ``'00501'`` rather than ``'501.0'``.

    Parameters
    ----------
    series : pandas.Series
        Raw postal codes (numbers or strings).

    Returns
    -------
    pandas.Series
        5-character zip-code strings, aligned with ``series``.
    """
    numeric = pd.to_numeric(series, errors='coerce').fillna(0)
    # Integer cast drops the trailing '.0' a float would carry into str().
    digits = numeric.astype('int64').astype(str)
    is_long = digits.str.len() > 5
    padded = digits.str.zfill(9).where(is_long, digits.str.zfill(5))
    # Keep only the 5-digit part.
    return padded.str[:5]

In [5]:
# Columns read from the NPPES file: provider identity, practice-location
# address, and the 15 taxonomy code / primary-switch column pairs.
columns_to_keep = [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'Provider Credential Text',
    'Provider First Line Business Practice Location Address',
    'Provider Second Line Business Practice Location Address',
    'Provider Business Practice Location Address City Name',
    'Provider Business Practice Location Address State Name',
    'Provider Business Practice Location Address Postal Code',
    # Code_1, Switch_1, Code_2, Switch_2, ... Code_15, Switch_15
    *(
        f'Healthcare Provider {field}_{slot}'
        for slot in range(1, 16)
        for field in ('Taxonomy Code', 'Primary Taxonomy Switch')
    ),
]


In [6]:
# Load the two reference tables: the ZIP -> CBSA file and the NUCC taxonomy file.
cbsa, taxonomy_code_classification = map(
    pd.read_csv,
    ['./data/ZIP_CBSA_122023.csv', './data/nucc_taxonomy_240.csv'],
)


In [7]:
# Normalise the ZIP column of the cbsa data into a 5-character 'zipcodes' column.
# Done inline (manually) because that is simpler here: values longer than five
# characters are left-padded to nine, shorter ones to five, and the first five
# characters are kept.
cbsa['zipcodes'] = (
    cbsa['ZIP']
    .map(lambda raw_zip: str(raw_zip).zfill(9 if len(str(raw_zip)) > 5 else 5))
    .str.slice(stop=5)
)


# Read the hop-teaming data in chunks, filter each chunk, and write it to the
# 'npi' table of the SQLite database.
db = sqlite3.connect('data/npi_final.sqlite')
try:
    for chunk_number, chunk in enumerate(
            pd.read_csv('./data/DocGraph_Hop_Teaming_2018.csv',
                        chunksize = 10000)):
        # Keep rows with transaction_count above 50 and average_day_wait below 50.
        chunk = chunk[(chunk['transaction_count'] > 50)
                      & (chunk['average_day_wait'] < 50)]
        chunk.to_sql('npi',
                     db,
                     # The first chunk replaces any table left by an earlier run,
                     # so re-running this cell does not duplicate rows.
                     if_exists = 'replace' if chunk_number == 0 else 'append',
                     index = False)

    # IF NOT EXISTS keeps a re-run from failing when the index is already there.
    db.execute('CREATE INDEX IF NOT EXISTS from_npi ON npi(from_npi)')
finally:
    # Close the connection even if reading or writing a chunk fails.
    db.close()


# Load the NPPES data into the 'nppes' table of the SQLite database, adding
# the primary taxonomy code, its classification, a 5-digit zipcode and the CBSA.

# Lookup tables do not change between chunks, so build them once.
taxonomy_lookup = taxonomy_code_classification[['Code', 'Classification']].set_index('Code')
# NOTE(review): if a zipcode appears on more than one cbsa row, the left merge
# below emits one output row per match, duplicating NPIs - confirm whether
# cbsa should be de-duplicated first.
cbsa_lookup = cbsa[['zipcodes', 'CBSA']].set_index('zipcodes')

db = sqlite3.connect('./data/npi_final.sqlite')
try:
    for chunk_number, chunk in enumerate(
            pd.read_csv('./data/npidata_pfile_20050523-20240211.csv',
                        low_memory=False,
                        usecols= columns_to_keep,
                        chunksize = 10000)):
        chunk_taxonomy = add_taxonomy(chunk)
        chunk_merged = pd.merge(left = chunk_taxonomy,
                                right = taxonomy_lookup,
                                how = 'left',
                                left_on = 'Taxonomy_Code',
                                right_index = True)
        chunk_merged['zipcodes'] = fix_zipcode(
            chunk_merged['Provider Business Practice Location Address Postal Code'])

        chunk_merged_cbsa = pd.merge(left = chunk_merged,
                                     right = cbsa_lookup,
                                     how = 'left',
                                     left_on = 'zipcodes',
                                     right_index = True)

        # Zipcodes with no CBSA match get 0 so the column can be stored as int.
        chunk_merged_cbsa['CBSA'] = chunk_merged_cbsa['CBSA'].fillna(0).astype(int)

        chunk_merged_cbsa.to_sql('nppes',
                                 db,
                                 # The first chunk replaces any table left by an earlier
                                 # run, so re-running this cell does not duplicate rows.
                                 if_exists = 'replace' if chunk_number == 0 else 'append',
                                 index = False)

    # IF NOT EXISTS keeps a re-run from failing when the index is already there.
    db.execute('CREATE INDEX IF NOT EXISTS nppes_npi ON nppes(NPI)')
finally:
    # Close the connection even if a chunk fails part-way through.
    db.close()

In [12]:
# Read a five-row sample of the nppes table back from the database file that
# the load step writes (data/npi_final.sqlite).
db = sqlite3.connect('./data/npi_final.sqlite')
try:
    query = "SELECT * FROM nppes LIMIT 5"

    hopteaming = pd.read_sql(query, db)
finally:
    # Close the connection even if the query fails.
    db.close()

In [27]:
# Preview the sampled nppes rows (despite its name, `hopteaming` is filled from the nppes table).
hopteaming.head()


Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,...,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,Taxonomy_Code,Classification,zipcodes,CBSA
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,...,,,,,,,207X00000X,Orthopaedic Surgery,68847,28260
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,...,,,,,,,207RC0000X,Internal Medicine,32204,27260
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,...,,,,,,,251G00000X,"Hospice Care, Community Based",28304,22180
3,1306849450,,,,,,,,,,...,,,,,,,,,0,0
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,...,,,,,,,207RH0003X,Internal Medicine,77090,26420


In [15]:
# Read a five-row sample of the npi table back from the database file that
# the load step writes (data/npi_final.sqlite).
db = sqlite3.connect('./data/npi_final.sqlite')
try:
    query = "SELECT * FROM npi LIMIT 5"

    npi = pd.read_sql(query, db)
finally:
    # Close the connection even if the query fails.
    db.close()

In [16]:
# Preview the sampled npi rows.
npi.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508085911,1730166125,58,67,23.925,43.923
1,1508167040,1730166125,51,51,28.196,52.876
2,1508863549,1730166125,340,391,18.302,42.422
3,1508867870,1730166125,50,79,12.658,26.402
4,1508011040,1730166224,132,145,8.579,28.053


In [17]:
# Show column dtypes and non-null counts for the sampled npi rows.
npi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   from_npi           5 non-null      int64  
 1   to_npi             5 non-null      int64  
 2   patient_count      5 non-null      int64  
 3   transaction_count  5 non-null      int64  
 4   average_day_wait   5 non-null      float64
 5   std_day_wait       5 non-null      float64
dtypes: float64(2), int64(4)
memory usage: 372.0 bytes


In [28]:
# Check that a column whose name contains spaces can be selected by quoting it.
# Reads from the database file that the load step writes (data/npi_final.sqlite).
# Note: there is no LIMIT, so this pulls the column for every nppes row.
db = sqlite3.connect('./data/npi_final.sqlite')
try:
    query = 'SELECT "Provider First Line Business Practice Location Address" FROM nppes'

    rename_test = pd.read_sql(query, db)
finally:
    # Close the connection even if the query fails.
    db.close()

In [29]:
# Preview the first rows of the selected address column.
rename_test.head()

Unnamed: 0,Provider First Line Business Practice Location Address
0,3500 CENTRAL AVE
1,1824 KING STREET
2,3418 VILLAGE DR
3,
4,17323 RED OAK DR
