This notebook processes the data for the code "84206" (84206.csv) and splits it into testing and training data stored at `/data/84206/*.csv`

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# need to upload csv to jupyter files section first
df = pd.read_csv("../data/84206.csv") #importing data

print(len(df)) #9496 rows at start

9496


#### **Feature Trimming**
---

In [3]:
#Feature Trimming
df = df.drop(columns=['name', 'url', 'street_address', 'zip_code', 'publish_date'])
#left in city, since without unique codes, there's only 4 features

#df = df.drop('code', axis=1) #dropping code for now, since it's redundant with just 84206

df.head()

Unnamed: 0,code,npi_number,payer,price,city,state
0,84206,1003281452,Aetna,173.0,Henderson,NV
1,84206,1003281452,Amerigroup,24.0,Henderson,NV
2,84206,1003281452,Blue Cross,158.0,Henderson,NV
3,84206,1003281452,CASH,908.0,Henderson,NV
4,84206,1003281452,Cigna,391.0,Henderson,NV


#### **Feature Cleaning** *~ npi_number*
---

In [4]:
# Feature Cleaning ~ npi_number

unqNPIs = df.npi_number.unique() #gathering unique npi's
toFix = [] #list for storing npi's that need to be cleaned

for npi in unqNPIs: #gathering npi's needing cleaning
  if '-' in npi:
    toFix.append(npi)

for ogNPI in toFix: #for each messy npi
  dashInd = ogNPI.index('-')
  cleanNPI = ogNPI[0:dashInd] #get the substring from the front to before the dash

  toChange = df.index[df['npi_number']==ogNPI].tolist() #get index of rows where npi matches original
  for j in toChange:
    df.iat[j,1] = cleanNPI #set each value to cleaned npi


In [5]:
df['npi_number'] = df['npi_number'].astype(int) #changing from string to int
df.head()

Unnamed: 0,code,npi_number,payer,price,city,state
0,84206,1003281452,Aetna,173.0,Henderson,NV
1,84206,1003281452,Amerigroup,24.0,Henderson,NV
2,84206,1003281452,Blue Cross,158.0,Henderson,NV
3,84206,1003281452,CASH,908.0,Henderson,NV
4,84206,1003281452,Cigna,391.0,Henderson,NV


#### **Feature Cleaning** ~ price

In [6]:
codeCheck = str(df.iloc[0][0]) #pulling code as string
unqPrice = df.price.unique() #gathering unique prices
priceWithCode = []

for p in unqPrice:
  if codeCheck in str(p): #if code is substring of price
    badPrices = df.index[df['price']==p].tolist() #get index of rows with price
    df = df.drop(index=badPrices) #drop rows with bad prices

df = df.drop('code', axis=1) #dropping code for now, since it's redundant with just 84206
df.head()

Unnamed: 0,npi_number,payer,price,city,state
0,1003281452,Aetna,173.0,Henderson,NV
1,1003281452,Amerigroup,24.0,Henderson,NV
2,1003281452,Blue Cross,158.0,Henderson,NV
3,1003281452,CASH,908.0,Henderson,NV
4,1003281452,Cigna,391.0,Henderson,NV


In [7]:
print(df.dtypes)

npi_number      int32
payer          object
price         float64
city           object
state          object
dtype: object


#### **Feature Cleaning** *~ state and city*
---

In [8]:
LState = df.index[df['state']=='L '].tolist() #fixing Alabama from 'L ' to 'AL'
for entry in LState:
  df.iat[entry,4] = 'AL'

#unqStates = df.state.unique()
#print(unqStates)

nullStates = df[df['state'].isnull()].index.tolist() #states = 629 null, cities = 629 null (same)
df = df.drop(index=nullStates)
  # better to drop, since we can't pull the city and state without looking up the hospital ourselves

unqCities = df.city.unique() #duplicates with different casing
#print(len(unqCities)) #477

df['city'] = df['city'].str.lower()
unqCities = df.city.unique()
#print(len(unqCities)) #448

for city in unqCities: #some cities have blank space in index 0
  if(city[0] == ' '):
    changeCity = df.index[df['city']==city].tolist()
    for c in changeCity:
      df.iat[c,3] = city[1:] #remove blank space

# unqCities = df.city.unique()
# print(len(unqCities)) #453

#### **Ordinal Encoding** *~ payer, city, state*


---



In [9]:
enc = OrdinalEncoder(dtype=int) #integer ordinal encoding, default is float
enc.fit(df[['payer','city', 'state']])
df[['payer','city', 'state']] = enc.transform(df[['payer','city', 'state']])
df.head()

Unnamed: 0,npi_number,payer,price,city,state
0,1003281452,332,173.0,169,32
1,1003281452,350,24.0,169,32
2,1003281452,736,158.0,169,32
3,1003281452,785,908.0,169,32
4,1003281452,1086,391.0,169,32


In [10]:
print(df.dtypes)

npi_number      int32
payer           int32
price         float64
city            int32
state           int32
dtype: object


In [11]:
nullPrices = df[df['price'].isnull()].index.tolist()
df = df.drop(index=nullPrices) #dropping rows with null prices

In [12]:
df = df[['npi_number', 'payer', 'city', 'state', 'price']] #reordering columns for easier split later
df.head()

Unnamed: 0,npi_number,payer,city,state,price
0,1003281452,332,169,32,173.0
1,1003281452,350,169,32,24.0
2,1003281452,736,169,32,158.0
3,1003281452,785,169,32,908.0
4,1003281452,1086,169,32,391.0


#### **Standardization / Normalization**
---
Since the data values range greatly between features, standardizing makes it easier for the model to learn.


In [13]:
scaler = StandardScaler() # default centers mean to zero & scales to unit variance
normNumpy = scaler.fit_transform(df) #returns as numpy array

#### **Train Test Split (80-20)**


---



In [14]:
npTrain, npTest = train_test_split(normNumpy, test_size=0.2)

#convert back to pandas
pdTrain = pd.DataFrame(npTrain, columns=['npi_number', 'payer', 'city', 'state', 'price'])
pdTest = pd.DataFrame(npTest, columns=['npi_number', 'payer', 'city', 'state', 'price'])
#pdTrain.head()

In [15]:
yTrain = pdTrain[['price']].copy()
#yTrain.head()
yTest = pdTest[['price']].copy()

In [16]:
xTrain = pdTrain.drop('price', axis=1)
#xTrain.head()
xTest = pdTest.drop('price', axis=1)

#### **Downloading Files**


---



In [17]:
xTrain.to_csv("../data/84206/xTrain.csv", index=False)
yTrain.to_csv("../data/84206/yTrain.csv", index=False)

xTest.to_csv("../data/84206/xTest.csv", index=False)
yTest.to_csv("../data/84206/yTest.csv", index=False)