Creating Multiple Datasets and Manipulating Data

In [48]:
import pandas as pd
import numpy as np

In [49]:
pd.set_option("display.max_columns", 150) #increase column options

Grabbing Data From the Prescriber Summary Dataset

Manipulation of Prescriber Summary Data (Database 2) Year: 2017

In [50]:
ps_df = pd.read_csv("data\Prescriber_Info.txt", sep = '\t', low_memory = False)
#large Dataset

In [51]:
ps_df = ps_df.sort_values(by=["npi","specialty_description"]) 
#sort by NPI then Specialty

In [52]:
ps_df = ps_df.set_index("npi")

In [53]:
ps_df = ps_df.fillna(0) 
#if the column doesn't normally take integer/float input, it was
#still replaced by 0.0
#doesn't really matter because those columns will be dropped

In [54]:
ps_df = ps_df[["specialty_description","nppes_provider_city", "nppes_provider_state", "nppes_provider_zip5","total_claim_count", "brand_claim_count", "generic_claim_count", "brand_drug_cost", "generic_drug_cost", "total_drug_cost", "total_30_day_fill_count", "total_day_supply" ,"bene_count" ]]
#taking necessary columns only
#Removed: "medicare_prvdr_enroll_status"  "beneficiary_average_risk_score" "average_age_of_beneficiaries" "other_claim_count", "mapd_claim_count", "pdp_claim_count", "lis_claim_count","nonlis_claim_count", "opioid_claim_count", "la_opioid_claim_count", "antibiotic_claim_count", "antipsych_claim_count_ge65""other_drug_cost", "pdp_drug_cost", "mapd_drug_cost", "lis_drug_cost" ,"nonlis_drug_cost", "opioid_drug_cost", "la_opioid_drug_cost", "antibiotic_drug_cost", "antipsych_drug_cost_ge65"

In [55]:
ps_df.head()

Unnamed: 0_level_0,specialty_description,nppes_provider_city,nppes_provider_state,nppes_provider_zip5,total_claim_count,brand_claim_count,generic_claim_count,brand_drug_cost,generic_drug_cost,total_drug_cost,total_30_day_fill_count,total_day_supply,bene_count
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1003000126,Internal Medicine,CUMBERLAND,MD,21502.0,677,0.0,552.0,0.0,5941.53,32639.57,695.5,14788,234.0
1003000142,Anesthesiology,TOLEDO,OH,43623.0,1946,235.0,1711.0,106048.28,34140.73,140189.01,2054.8,58605,276.0
1003000167,Dentist,DAYTON,NV,89403.0,55,0.0,0.0,0.0,0.0,302.01,55.0,430,33.0
1003000175,Dentist,LA PUENTE,CA,91744.0,18,0.0,18.0,0.0,113.81,113.81,18.0,150,0.0
1003000282,Nurse Practitioner,NASHVILLE,TN,37243.0,90,28.0,62.0,6576.28,984.93,7561.21,110.0,2681,11.0


Secondary DataSets of Providers

Specialty_region_df : grouped by state

Specialty_city_df : grouped by city

Specialty_df : grouped by specialty

city_specialty_agg : grouped by specialties in each city

Note: Disregard Zip Codes

In [56]:
specialty_region_df = ps_df.groupby(["nppes_provider_state"]).sum()
specialty_region_df.insert(0, "counts", ps_df.groupby(["nppes_provider_state"]).size()) # creates a count column

In [57]:
specialty_city_df = ps_df.groupby(["nppes_provider_city"]).sum()
specialty_city_df.insert(0, "counts", ps_df.groupby(["nppes_provider_city"]).size()) # creates a count column

In [58]:
specialty_df = ps_df.groupby("specialty_description").sum()
specialty_df.insert(0, "counts", ps_df.groupby( "specialty_description").size()) # creates a count column

In [59]:
city_specialty_agg = ps_df.groupby(["nppes_provider_city", "specialty_description"]).sum()
city_specialty_agg.insert(0, "counts", ps_df.groupby([ "nppes_provider_city","specialty_description"]).size()) # creates a count column

Adding a new Column of Brand to Generic Spending as Brand/(Generic+Brand)

In [60]:
specialty_df["brand_percent_spending"]= specialty_df["brand_drug_cost"]/(specialty_df["generic_drug_cost"]+specialty_df["brand_drug_cost"])
specialty_region_df["brand_percent_spending"]= specialty_region_df["brand_drug_cost"]/(specialty_region_df["generic_drug_cost"]+specialty_region_df["brand_drug_cost"])
specialty_city_df["brand_percent_spending"]= specialty_city_df["brand_drug_cost"]/(specialty_city_df["generic_drug_cost"]+specialty_city_df["brand_drug_cost"])
city_specialty_agg["brand_percent_spending"]= city_specialty_agg["brand_drug_cost"]/(city_specialty_agg["generic_drug_cost"]+city_specialty_agg["brand_drug_cost"])

In [61]:
ps_df["brand_percent_spending"]= ps_df["brand_drug_cost"]/(ps_df["generic_drug_cost"] + ps_df["brand_drug_cost"])
#needs to come after

Remove any inf, or NaN, or 0 values from Generic Brand Ratio Costs because those lack predictive value and creates a Log transformed value for Generic and Brand Drug costs

We are looking at people with significant amounts of both Generic and Brand name Drugs prescribed

In [62]:
ps_df.replace(np.inf, np.nan, inplace = True) #remove inf
ps_df.dropna( inplace = True) # remove NaN
ps_df.drop(ps_df[ps_df["brand_percent_spending"]==0].index , inplace = True)
#removes 0 values
ps_df.drop(ps_df[ps_df["brand_percent_spending"]==1].index , inplace = True)
#removes no generic drugs
ps_df["log_brand_cost"] = ps_df["brand_drug_cost"].apply(lambda x : np.log10(x+1)) 
ps_df["log_generic_cost"] = ps_df["generic_drug_cost"].apply(lambda x : np.log10(x+1))
#creates a log of the cost data
#ps_df.head(10)

In [63]:
specialty_df.replace(np.inf, np.nan, inplace = True)
specialty_df.dropna(inplace =True)
specialty_df.drop(specialty_df[specialty_df["brand_percent_spending"]==0].index , inplace = True)
specialty_df.drop(specialty_df[specialty_df["brand_percent_spending"]==1].index , inplace = True)
#remove NaN and 0 and inf
specialty_df["log_brand_cost"] = specialty_df["brand_drug_cost"].apply(lambda x : np.log10(x+1))
specialty_df["log_generic_cost"] = specialty_df["generic_drug_cost"].apply(lambda x : np.log10(x+1))
#creates a log of the cost data
#specialty_df.head(!0)

In [64]:
specialty_region_df.replace(np.inf, np.nan, inplace = True)
specialty_region_df.dropna(inplace =True)
specialty_region_df.drop(specialty_region_df[specialty_region_df["brand_percent_spending"]==0].index , inplace = True)
specialty_region_df.drop(specialty_region_df[specialty_region_df["brand_percent_spending"]==1].index , inplace = True)
#remove NaN and 0 and inf
specialty_region_df["log_brand_cost"] = specialty_region_df["brand_drug_cost"].apply(lambda x : np.log10(x+1)) 
specialty_region_df["log_generic_cost"] = specialty_region_df["generic_drug_cost"].apply(lambda x : np.log10(x+1))
#creates a log of the cost data
#specialty_region.head(10)

In [65]:
specialty_city_df.replace(np.inf, np.nan, inplace = True)
specialty_city_df.dropna(inplace =True)
specialty_city_df.drop(specialty_city_df[specialty_city_df["brand_percent_spending"]==0].index , inplace = True)
specialty_city_df.drop(specialty_city_df[specialty_city_df["brand_percent_spending"]==1].index , inplace = True)
#remove NaN and 0 and inf
#specialty_city_df.head(10)

In [66]:
city_specialty_agg.replace(np.inf, np.nan, inplace = True)
city_specialty_agg.dropna(inplace =True)
city_specialty_agg.drop(city_specialty_agg[city_specialty_agg["brand_percent_spending"]==0].index , inplace = True)
city_specialty_agg.drop(city_specialty_agg[city_specialty_agg["brand_percent_spending"]==1].index , inplace = True)
#remove NaN and 0 and inf
#city_specialty_agg.head(10)

Sorting Code By Brand to Generic Ratio

In [67]:

specialty_df.sort_values("brand_percent_spending", inplace = True, na_position = "first")
specialty_city_df.sort_values("brand_percent_spending", inplace = True)
city_specialty_agg.sort_values("brand_percent_spending", inplace = True)
#used to create indexer

Indexing Cities and Specialties

In [68]:
specialty_city_df["Index"] = range(len(specialty_city_df))
city_indexer = specialty_city_df["Index"]
city_dict = city_indexer.to_dict()
#city index

In [69]:
specialty_df["Index"] = range(len(specialty_df))
specialty_indexer = specialty_df["Index"]
specialty_dict = specialty_indexer.to_dict()
#specialty_index

UNUSED INPUTS

Medicare Enrollment Status Mapping

In [70]:
#set(ps_df["medicare_prvdr_enroll_status"].tolist())

In [71]:
#ps_df["medicare_prvdr_enroll_status"] = ps_df["medicare_prvdr_enroll_status"].map({"E": 1, "N":0, "O":2 })

Create a ML dataset

In [72]:
ML_dataset = ps_df[["specialty_description", "nppes_provider_city" ,"total_claim_count", "total_drug_cost","total_day_supply", "total_30_day_fill_count", "bene_count", "brand_percent_spending"]]
#inputs for the dataset

In [73]:
ML_dataset["specialty_description"] = ML_dataset["specialty_description"].map(specialty_dict)
ML_dataset["nppes_provider_city"] = ML_dataset["nppes_provider_city"].map(city_dict)

#mapping specialty and city to proper index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [74]:
ML_dataset.head()

Unnamed: 0_level_0,specialty_description,nppes_provider_city,total_claim_count,total_drug_cost,total_day_supply,total_30_day_fill_count,bene_count,brand_percent_spending
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1003000142,39,9174,1946,140189.01,58605,2054.8,276.0,0.756466
1003000282,108,9331,90,7561.21,2681,110.0,11.0,0.869739
1003000423,32,8930,200,14549.85,7587,307.2,65.0,0.53597
1003000522,70,3003,4981,258258.7,337309,11494.1,459.0,0.567573
1003000530,88,7343,7165,379661.02,333309,11406.5,461.0,0.724479


Test For NaN values

In [75]:
ML_dataset.dtypes

specialty_description        int64
nppes_provider_city          int64
total_claim_count            int64
total_drug_cost            float64
total_day_supply             int64
total_30_day_fill_count    float64
bene_count                 float64
brand_percent_spending     float64
dtype: object

In [76]:
ML_dataset.bene_count = ML_dataset.bene_count.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [77]:
np.where(ML_dataset.values >= np.finfo(np.float64).max)

(array([], dtype=int64), array([], dtype=int64))

In [78]:
np.where(ML_dataset.values <= np.finfo(np.float64).min)

(array([], dtype=int64), array([], dtype=int64))

In [79]:
ML_dataset[ML_dataset["brand_percent_spending"]==np.nan]# was filled so NaN became 0

Unnamed: 0_level_0,specialty_description,nppes_provider_city,total_claim_count,total_drug_cost,total_day_supply,total_30_day_fill_count,bene_count,brand_percent_spending
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [80]:
ML_dataset[ML_dataset["brand_percent_spending"]==np.inf]

Unnamed: 0_level_0,specialty_description,nppes_provider_city,total_claim_count,total_drug_cost,total_day_supply,total_30_day_fill_count,bene_count,brand_percent_spending
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


ML Dataset Aggregated by City and Specialty

In [81]:
city_specialty_agg.reset_index(["specialty_description", "nppes_provider_city"], inplace = True)

In [82]:
city_specialty_agg = city_specialty_agg[["specialty_description", "nppes_provider_city", "brand_percent_spending","total_claim_count", "total_drug_cost"]]
#inputs for Agg ML Dataset

In [83]:
city_specialty_agg["specialty_index"] = city_specialty_agg["specialty_description"].map(specialty_dict)
city_specialty_agg["city_index"] = city_specialty_agg["nppes_provider_city"].map(city_dict)
#mapping specialty and city to proper index

Manipulation Index of Datasets

In [84]:
specialty_df.reset_index("specialty_description", inplace=True)
specialty_df = specialty_df.set_index(specialty_df["Index"],drop = True)
#resets index w/o moving rows

In [85]:
specialty_city_df.reset_index("nppes_provider_city", inplace = True)

Saves Data to CSV

In [86]:
ps_df.to_csv("data\ps_df.csv")

In [87]:
specialty_df.to_csv("data\specialty_df.csv", index =False)

In [88]:
specialty_region_df.to_csv("data\specialty_region_df.csv")

In [89]:
specialty_city_df.to_csv("data\specialty_city_df.csv", index = False)

In [90]:
ML_dataset.to_csv("data\ML_dataset.csv", index = False)

In [91]:
city_specialty_agg.to_csv("data\ML_dataset_2.csv", index = False)

Metrics of DataFrame

In [92]:
#metrics_df = pd.DataFrame(columns = ["Input","R Squared", "Mean Absolute Error", "Mean Squared Error", "Keep Metric"])

In [93]:
#metrics_df.to_csv("data\metrics_df.csv", index = False)

In [94]:
ML_dataset.head()

Unnamed: 0_level_0,specialty_description,nppes_provider_city,total_claim_count,total_drug_cost,total_day_supply,total_30_day_fill_count,bene_count,brand_percent_spending
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1003000142,39,9174,1946,140189.01,58605,2054.8,276,0.756466
1003000282,108,9331,90,7561.21,2681,110.0,11,0.869739
1003000423,32,8930,200,14549.85,7587,307.2,65,0.53597
1003000522,70,3003,4981,258258.7,337309,11494.1,459,0.567573
1003000530,88,7343,7165,379661.02,333309,11406.5,461,0.724479


In [102]:
def colfix(df, L=5): return df.rename(columns=lambda x: x.replace('_', ' '))

colfix(ML_dataset)

Unnamed: 0_level_0,specialty description,nppes provider city,total claim count,total drug cost,total day supply,total 30 day fill count,bene count,brand percent spending
npi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1003000142,39,9174,1946,140189.01,58605,2054.8,276,0.756466
1003000282,108,9331,90,7561.21,2681,110.0,11,0.869739
1003000423,32,8930,200,14549.85,7587,307.2,65,0.535970
1003000522,70,3003,4981,258258.70,337309,11494.1,459,0.567573
1003000530,88,7343,7165,379661.02,333309,11406.5,461,0.724479
1003000837,108,8444,102,22124.43,4710,160.0,13,0.902614
1003000902,70,8309,7054,428096.87,344274,11939.0,420,0.690766
1003000936,88,8894,2573,266098.57,115667,3897.8,318,0.834835
1003001017,91,9415,563,31341.84,14043,608.1,151,0.290058
1003001132,70,9353,116,2605.35,9055,307.0,29,0.367052
