Converting Detailed Drug Dataset to a Crosstab of NPI vs Drug Claims

In [2]:
import pandas as pd
import numpy as np

REQUIRES the Detailed Drug Dataset and Provider Summary Dataset to be downloaded from https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/PartD2017.html

In [3]:
dd_df = pd.read_csv("data\Detailed_Drugs.txt", sep = '\t', header = 0, nrows = 1000000, low_memory = False)
#reads the Large txt file 1 million rows
npi_drug_cross_df = pd.crosstab(dd_df["npi"], dd_df["drug_name"] , values=dd_df["total_claim_count"], 
                                  aggfunc=np.sum) #uses drug_name which is the brand names
final_cross_df = npi_drug_cross_df.fillna(0) #removes NaN
final_cross_df.shape

(36541, 2080)

In [4]:
headers = dd_df.columns.tolist()
#saves headers

Running and Saving takes about 10-15 minutes 

In [5]:

skip = 1000000 
for x in range(25):
    #reads 25 million rows 1 million at a time
    dd_df = pd.read_csv("data\Detailed_Drugs.txt", sep = '\t', header=None, skiprows = skip, nrows = 1000000, low_memory = False)
    dd_df.columns=headers
    dd_df = dd_df.sort_values(by=["npi","specialty_description","drug_name"])#sort
    npi_drug_cross_df = npi_drug_cross_df.fillna(0) #Replaces NaN w/ 0
    final_cross_df = pd.concat([final_cross_df, npi_drug_cross_df])
    skip +=1000000
    print(skip) #every print is 1 iteration

2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000


In [6]:
final_cross_df.shape

(950066, 2080)

Grabbing Data From the Prescriber Summary Dataset

Manipulation of Prescriber Summary Data (Database 2) Year: 2017

In [7]:
ps_df = pd.read_csv("data\Prescriber_Info.txt", sep = '\t', low_memory = False)
#another large Dataset

In [8]:
ps_df = ps_df.sort_values(by=["npi","specialty_description"]) #sort

In [9]:
ps_df = ps_df.set_index("npi")

In [10]:
ps_df = ps_df.fillna(0) #if the column doesn't normally take integer/float input, it was still replaced by 0.0
#doesn't really matter because those columns will be dropped

In [11]:
ps_df.head(10)

Unnamed: 0_level_0,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_zip5,...,beneficiary_male_count,beneficiary_race_white_count,beneficiary_race_black_count,beneficiary_race_asian_pi_count,beneficiary_race_hispanic_count,beneficiary_race_nat_ind_count,beneficiary_race_other_count,beneficiary_nondual_count,beneficiary_dual_count,beneficiary_average_risk_score
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003000126,ENKESHAFI,ARDALAN,0,M.D.,M,I,900 SETON DR,0,CUMBERLAND,21502.0,...,92.0,220.0,14.0,0.0,0.0,0.0,0.0,143.0,91.0,2.1685
1003000142,KHALIL,RASHID,0,M.D.,M,I,4126 N HOLLAND SYLVANIA RD,SUITE 220,TOLEDO,43623.0,...,92.0,195.0,58.0,0.0,0.0,0.0,0.0,143.0,133.0,1.8029
1003000167,ESCOBAR,JULIO,E,DDS,M,I,5 PINE CONE RD,0,DAYTON,89403.0,...,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0598
1003000175,REYES-VASQUEZ,BELINDA,0,D.D.S.,F,I,322 N AZUSA AVE STE 202,0,LA PUENTE,91744.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1003000282,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,3881 STEWARTS LANE,NASHVILLE,37243.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5148
1003000407,GIRARDI,DAVID,J,D.O.,M,I,100 HOSPITAL RD,0,BROOKVILLE,15825.0,...,31.0,117.0,0.0,0.0,0.0,0.0,0.0,71.0,46.0,1.8494
1003000423,VELOTTA,JENNIFER,A,M.D.,F,I,11100 EUCLID AVE,0,CLEVELAND,44106.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,17.0,0.788
1003000480,ROTHCHILD,KEVIN,B,MD,M,I,12605 E 16TH AVE,0,AURORA,80045.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8413
1003000522,WEIGAND,FREDERICK,J,MD,M,I,1565 SAXON BLVD,SUITE 102,DELTONA,32725.0,...,203.0,393.0,0.0,0.0,38.0,0.0,0.0,402.0,57.0,1.2057
1003000530,SEMONCHE,AMANDA,M,DO,F,I,1021 PARK AVE,SUITE 203,QUAKERTOWN,18951.0,...,179.0,437.0,0.0,0.0,0.0,0.0,0.0,391.0,70.0,1.2281


In [12]:
ps_df_new = ps_df[["specialty_description", "brand_claim_count", "brand_drug_cost", "generic_claim_count", "generic_drug_cost", "nppes_provider_city", "nppes_provider_zip5", "total_claim_count", "total_drug_cost"]] #taking necessary columns only

Concatnate subsections Togethor

In [22]:
ps_df_new.shape

(1162898, 9)

In [23]:
ps_dd_df = pd.concat([final_cross_df.iloc[0:200000],ps_df_new ], axis=1, join='inner')

In [None]:
#ps_dd_df= pd.concat([ps_df_new, final_cross_df], axis = 1, join="inner")

In [24]:
ps_dd_df.shape

(200000, 2089)

In [25]:
ps_dd_df.head()

Unnamed: 0_level_0,1ST TIER UNIFINE PENTIPS,1ST TIER UNIFINE PENTIPS PLUS,ABACAVIR,ABACAVIR-LAMIVUDINE,ABACAVIR-LAMIVUDINE-ZIDOVUDINE,ABELCET,ABILIFY,ABILIFY MAINTENA,ACAMPROSATE CALCIUM,ACARBOSE,...,ZYTIGA,specialty_description,brand_claim_count,brand_drug_cost,generic_claim_count,generic_drug_cost,nppes_provider_city,nppes_provider_zip5,total_claim_count,total_drug_cost
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003000126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Internal Medicine,0.0,0.0,552.0,5941.53,CUMBERLAND,21502.0,677,32639.57
1003000142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Anesthesiology,235.0,106048.28,1711.0,34140.73,TOLEDO,43623.0,1946,140189.01
1003000167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Dentist,0.0,0.0,0.0,0.0,DAYTON,89403.0,55,302.01
1003000282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Nurse Practitioner,28.0,6576.28,62.0,984.93,NASHVILLE,37243.0,90,7561.21
1003000407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Family Practice,0.0,0.0,1919.0,36600.82,BROOKVILLE,15825.0,2788,108601.73


Secondary DataSet of Providers Grouped by Specialty

In [34]:
specialty_df = ps_dd_df.groupby("specialty_description").sum()
specialty_df.insert(0, "counts", ps_dd_df.groupby("specialty_description").size()) # creates a count column

In [37]:
specialty_df.insert(2088, "generic_brand_ratio_cost", specialty_df["brand_drug_cost"]/specialty_df["generic_drug_cost"], True)

In [38]:
specialty_df.reset_index("specialty_description", inplace=True)
specialty_df.sort_values("generic_brand_ratio_cost", inplace = True, na_position = "first")


In [39]:
specialty_df.tail()

Unnamed: 0,specialty_description,counts,1ST TIER UNIFINE PENTIPS,1ST TIER UNIFINE PENTIPS PLUS,ABACAVIR,ABACAVIR-LAMIVUDINE,ABACAVIR-LAMIVUDINE-ZIDOVUDINE,ABELCET,ABILIFY,ABILIFY MAINTENA,...,ZYPREXA ZYDIS,ZYTIGA,brand_claim_count,brand_drug_cost,generic_claim_count,generic_drug_cost,nppes_provider_zip5,total_claim_count,total_drug_cost,generic_brand_ratio_cost
38,Hematology,148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,55.0,18674.0,117990400.0,53010.0,5239498.29,7551682.0,75728,135705500.0,22.519405
79,Pharmacist,629,0.0,0.0,105.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,29165.0,26037890.0,53406.0,1113744.77,51890821.0,101565,36869030.0,23.378682
20,Critical Care (Intensivists),272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,130390.0,91695900.0,95564.0,3638684.97,13647523.0,228781,95666300.0,25.200286
15,Colon & Rectal Surgery,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,66.0,301547.8,225.0,9049.95,1063050.0,541,315242.2,33.320385
60,"Neuromusculoskeletal Medicine, Sports Medicine",5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,85.0,495407.7,145.0,2238.75,95535.0,230,497646.4,221.287616


In [40]:
ps_dd_df.insert(2088, "generic_brand_ratio_cost", ps_dd_df["brand_drug_cost"]/ps_dd_df["generic_drug_cost"], True)


In [42]:
ps_dd_df.head()

Unnamed: 0_level_0,1ST TIER UNIFINE PENTIPS,1ST TIER UNIFINE PENTIPS PLUS,ABACAVIR,ABACAVIR-LAMIVUDINE,ABACAVIR-LAMIVUDINE-ZIDOVUDINE,ABELCET,ABILIFY,ABILIFY MAINTENA,ACAMPROSATE CALCIUM,ACARBOSE,...,specialty_description,brand_claim_count,brand_drug_cost,generic_claim_count,generic_drug_cost,nppes_provider_city,nppes_provider_zip5,total_claim_count,generic_brand_ratio_cost,total_drug_cost
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1013968437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Physician Assistant,0.0,0.0,248.0,1489.28,WASHINGTON,20032.0,275,0.0,4296.06
1013968478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Family Practice,2447.0,829763.78,14537.0,283164.05,SPARTANBURG,29307.0,17178,2.930329,1122690.65
1013968486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,General Surgery,0.0,0.0,0.0,0.0,NEW LONDON,3257.0,142,,2311.34
1013968528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Dentist,0.0,0.0,172.0,507.84,MIDLAND,48640.0,184,0.0,573.99
1013968536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Internal Medicine,2635.0,546116.13,9480.0,245128.65,LANSING,48912.0,12163,2.227876,793488.42


Remove any inf, or NaN, or 0 values from Generic Brand Ratio Costs because those lack predictive value and creates a Log transformed value for Generic and Brand Drug costs - We are looking at people with significant amounts of both Generic and Brand name Drugs prescribed

In [43]:
ps_dd_df.replace(np.inf, np.nan, inplace = True) #remove inf
ps_dd_df.dropna( inplace = True)
ps_dd_df.drop(ps_dd_df[ps_dd_df["generic_brand_ratio_cost"]==0].index , inplace = True)
# remove NaN and 0 and inf

In [44]:
ps_dd_df["log_brand_cost"] = ps_dd_df["brand_drug_cost"].apply(lambda x : np.log10(x)) #creates a log of the cost data
ps_dd_df["log_generic_cost"] = ps_dd_df["generic_drug_cost"].apply(lambda x : np.log10(x))

In [45]:
ps_dd_df.head(10)

Unnamed: 0_level_0,1ST TIER UNIFINE PENTIPS,1ST TIER UNIFINE PENTIPS PLUS,ABACAVIR,ABACAVIR-LAMIVUDINE,ABACAVIR-LAMIVUDINE-ZIDOVUDINE,ABELCET,ABILIFY,ABILIFY MAINTENA,ACAMPROSATE CALCIUM,ACARBOSE,...,brand_drug_cost,generic_claim_count,generic_drug_cost,nppes_provider_city,nppes_provider_zip5,total_claim_count,generic_brand_ratio_cost,total_drug_cost,log_brand_cost,log_generic_cost
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003000142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,106048.28,1711.0,34140.73,TOLEDO,43623.0,1946,3.10621,140189.01,5.025504,4.533273
1003000282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6576.28,62.0,984.93,NASHVILLE,37243.0,90,6.676901,7561.21,3.81798,2.993405
1003000423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7798.28,152.0,6751.57,CLEVELAND,44106.0,200,1.155032,14549.85,3.891999,3.829405
1003000522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,145938.02,4281.0,111188.48,DELTONA,32725.0,4981,1.312528,258258.7,5.164168,5.04606
1003000530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,273094.17,6150.0,103858.19,QUAKERTOWN,18951.0,7165,2.629491,379661.02,5.436312,5.016441
1003000837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,19969.83,87.0,2154.6,SANTA MARIA,93454.0,102,9.268463,22124.43,4.300374,3.333367
1003000902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,294144.2,5856.0,131679.02,LOUISVILLE,40212.0,7054,2.233797,428096.87,5.46856,5.119517
1003000936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,222148.38,2080.0,43950.19,COLUMBIA,29203.0,2573,5.054549,266098.57,5.346643,4.642961
1003001017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9090.95,448.0,22250.89,LA PALMA,90623.0,563,0.408566,31341.84,3.958609,4.347347
1003001132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,956.3,99.0,1649.05,SAN LUIS OBISPO,93409.0,116,0.57991,2605.35,2.980594,3.217234


In [46]:
specialty_df.replace(np.inf, np.nan, inplace = True)
specialty_df.dropna(inplace =True)
specialty_df.drop(specialty_df[specialty_df["generic_brand_ratio_cost"]==0].index , inplace = True)
#remove NaN and 0 and inf
specialty_df.head(10)

Unnamed: 0,specialty_description,counts,1ST TIER UNIFINE PENTIPS,1ST TIER UNIFINE PENTIPS PLUS,ABACAVIR,ABACAVIR-LAMIVUDINE,ABACAVIR-LAMIVUDINE-ZIDOVUDINE,ABELCET,ABILIFY,ABILIFY MAINTENA,...,ZYPREXA ZYDIS,ZYTIGA,brand_claim_count,brand_drug_cost,generic_claim_count,generic_drug_cost,nppes_provider_zip5,total_claim_count,total_drug_cost,generic_brand_ratio_cost
51,Maxillofacial Surgery,175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,205.0,4613.24,25571.0,159812.41,6383263.0,48715,363979.69,0.028867
102,Speech Language Pathologist,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,135.0,881.4,255.0,11481.4,432195.0,390,12362.8,0.076768
21,Dentist,21838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5346.0,994492.81,767590.0,5380959.06,1091100000.0,1914652,17941015.2,0.184817
2,Advanced Heart Failure and Transplant Cardiology,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,126.0,25547.58,2106.0,137339.4,995711.0,3301,296155.43,0.186018
91,Psychoanalyst,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,84.0,1841.94,666.0,8889.66,120912.0,750,10731.6,0.2072
53,"Medical Genetics, Ph.D. Medical Genetics",10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,265.0,2000.9,880.0,8531.75,167190.0,1285,37216.7,0.234524
48,Interventional Radiology,126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,779.0,295366.92,40737.0,1242042.42,5774627.0,53199,4917702.2,0.237807
70,Orthopaedic Surgery,234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4127.0,1802032.61,22260.0,5145060.14,12038840.0,35700,7384127.91,0.350245
69,Oral Surgery (Dentist only),1179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1199.0,351666.64,111813.0,930565.04,62352490.0,309719,2619857.46,0.377907
82,Physical Therapist in Private Practice,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1344.0,349566.66,17179.0,778931.95,1306780.0,21241,1872433.09,0.448777


In [47]:
specialty_df["log_brand_cost"] = specialty_df["brand_drug_cost"].apply(lambda x : np.log10(x)) #creates a log of the cost data
specialty_df["log_generic_cost"] = specialty_df["generic_drug_cost"].apply(lambda x : np.log10(x))

In [48]:
specialty_df["Index"] = range(len(specialty_df))
specialty_df = specialty_df.set_index(specialty_df["Index"]) # resets index w/o moving rows

In [49]:
specialty_df.head(10)

Unnamed: 0_level_0,specialty_description,counts,1ST TIER UNIFINE PENTIPS,1ST TIER UNIFINE PENTIPS PLUS,ABACAVIR,ABACAVIR-LAMIVUDINE,ABACAVIR-LAMIVUDINE-ZIDOVUDINE,ABELCET,ABILIFY,ABILIFY MAINTENA,...,brand_drug_cost,generic_claim_count,generic_drug_cost,nppes_provider_zip5,total_claim_count,total_drug_cost,generic_brand_ratio_cost,log_brand_cost,log_generic_cost,Index
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Maxillofacial Surgery,175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4613.24,25571.0,159812.41,6383263.0,48715,363979.69,0.028867,3.664006,5.203611,0
1,Speech Language Pathologist,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,881.4,255.0,11481.4,432195.0,390,12362.8,0.076768,2.945173,4.059995,1
2,Dentist,21838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,994492.81,767590.0,5380959.06,1091100000.0,1914652,17941015.2,0.184817,5.997602,6.73086,2
3,Advanced Heart Failure and Transplant Cardiology,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,25547.58,2106.0,137339.4,995711.0,3301,296155.43,0.186018,4.40735,5.137795,3
4,Psychoanalyst,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1841.94,666.0,8889.66,120912.0,750,10731.6,0.2072,3.265275,3.948885,4
5,"Medical Genetics, Ph.D. Medical Genetics",10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2000.9,880.0,8531.75,167190.0,1285,37216.7,0.234524,3.301225,3.931038,5
6,Interventional Radiology,126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,295366.92,40737.0,1242042.42,5774627.0,53199,4917702.2,0.237807,5.470362,6.094136,6
7,Orthopaedic Surgery,234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1802032.61,22260.0,5145060.14,12038840.0,35700,7384127.91,0.350245,6.255763,6.71139,7
8,Oral Surgery (Dentist only),1179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,351666.64,111813.0,930565.04,62352490.0,309719,2619857.46,0.377907,5.546131,5.968747,8
9,Physical Therapist in Private Practice,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,349566.66,17179.0,778931.95,1306780.0,21241,1872433.09,0.448777,5.54353,5.8915,9


Saves Data to CSV

In [50]:
ps_dd_df.to_csv("data\ps_dd_df.csv")

In [51]:
specialty_df.to_csv("data\specialty_df.csv", index =False)

(101, 2093)