In [197]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [183]:
# need to upload csv to jupyter files section first
df = pd.read_csv("/content/pricing.csv") #importing data

#print(len(df)) #9496 rows at start

#### **Feature Trimming**
---

In [184]:
#Feature Trimming
df = df.drop(columns=['Unnamed: 0', 'name', 'url', 'street_address', 'zip_code', 'publish_date'])
  #left in city, since without unique codes, there's only 4 features

df = df.drop('code', axis=1) #dropping code for now, since it's redundant with just 84206

df.head()

Unnamed: 0,npi_number,payer,price,city,state
0,1003281452,Aetna,173.0,Henderson,NV
1,1003281452,Amerigroup,24.0,Henderson,NV
2,1003281452,Blue Cross,158.0,Henderson,NV
3,1003281452,CASH,908.0,Henderson,NV
4,1003281452,Cigna,391.0,Henderson,NV


#### **Feature Cleaning** *~ npi_number*
---

In [185]:
# Feature Cleaning ~ npi_number

unqNPIs = df.npi_number.unique() #gathering unique npi's
toFix = [] #list for storing npi's that need to be cleaned

for npi in unqNPIs: #gathering npi's needing cleaning
  if '-' in npi:
    toFix.append(npi)

for i in range(len(toFix)): #for each messy npi
  ogNPI = toFix[i]
  dashInd = ogNPI.index('-')
  cleanNPI = ogNPI[0:dashInd] #get the substring from the front to before the dash

  toChange = df.index[df['npi_number']==ogNPI].tolist() #get index of rows where npi matches original
  for j in toChange:
    df.iat[j,0] = cleanNPI #set each value to cleaned npi

df['npi_number'] = df['npi_number'].astype(int) #changing from string to int
df.head()

Unnamed: 0,npi_number,payer,price,city,state
0,1003281452,Aetna,173.0,Henderson,NV
1,1003281452,Amerigroup,24.0,Henderson,NV
2,1003281452,Blue Cross,158.0,Henderson,NV
3,1003281452,CASH,908.0,Henderson,NV
4,1003281452,Cigna,391.0,Henderson,NV


In [186]:
print(df.dtypes)

npi_number      int64
payer          object
price         float64
city           object
state          object
dtype: object


#### **Feature Cleaning** *~ state and city*
---

In [187]:
LState = df.index[df['state']=='L '].tolist() #fixing Alabama from 'L ' to 'AL'
for entry in LState:
  df.iat[entry,4] = 'AL'

#unqStates = df.state.unique()
#print(unqStates)

nullStates = df[df['state'].isnull()].index.tolist() #states = 629 null, cities = 629 null (same)
df = df.drop(index=nullStates)
  # better to drop, since we can't pull the city and state without looking up the hospital ourselves

unqCities = df.city.unique() #duplicates with different casing
#print(len(unqCities)) #477

df['city'] = df['city'].str.lower()
unqCities = df.city.unique()
#print(len(unqCities)) #448

for city in unqCities: #some cities have blank space in index 0
  if(city[0] == ' '):
    changeCity = df.index[df['city']==city].tolist()
    for c in changeCity:
      df.iat[c,3] = city[1:] #remove blank space

#unqCities = df.city.unique()
#print(len(unqCities)) #453

#### **Ordinal Encoding** *~ payer, city, state*


---



In [195]:
enc = OrdinalEncoder(dtype=int) #integer ordinal encoding, default is float
enc.fit(df[['payer','city', 'state']])
df[['payer','city', 'state']] = enc.transform(df[['payer','city', 'state']])
df.head()

Unnamed: 0,npi_number,payer,price,city,state
0,1003281452,332,173.0,169,31
1,1003281452,350,24.0,169,31
2,1003281452,736,158.0,169,31
3,1003281452,785,908.0,169,31
4,1003281452,1086,391.0,169,31


In [196]:
print(df.dtypes)

npi_number      int64
payer           int64
price         float64
city            int64
state           int64
dtype: object


In [201]:
df = df[['npi_number', 'payer', 'city', 'state', 'price']] #reordering columns for easier split later
df.head()

Unnamed: 0,npi_number,payer,city,state,price
0,1003281452,332,169,31,173.0
1,1003281452,350,169,31,24.0
2,1003281452,736,169,31,158.0
3,1003281452,785,169,31,908.0
4,1003281452,1086,169,31,391.0


#### **Standardization / Normalization**
---
Since the data values range greatly between features, standardizing makes it easier for the model to learn.


In [206]:
scaler = StandardScaler() # default centers mean to zero & scales to unit variance
normNumpy = scaler.fit_transform(df) #returns as numpy array

#### **Train Test Split (80-20)**


---



In [215]:
npTrain, npTest = train_test_split(normNumpy, test_size=0.2)

#convert back to pandas
pdTrain = pd.DataFrame(npTrain, columns=['npi_number', 'payer', 'city', 'state', 'price'])
pdTest = pd.DataFrame(npTest, columns=['npi_number', 'payer', 'city', 'state', 'price'])
#pdTrain.head()

In [221]:
yTrain = pdTrain[['price']].copy()
#yTrain.head()
yTest = pdTest[['price']].copy()

In [219]:
xTrain = pdTrain.drop('price', axis=1)
#xTrain.head()
xTest = pdTest.drop('price', axis=1)

#### **Downloading Files**


---



In [222]:
xTrain.to_csv("xTrain.csv", index = False)
yTrain.to_csv("yTrain.csv", index = False)

xTest.to_csv("xTest.csv", index = False)
yTest.to_csv("yTest.csv", index = False)