# Forest of Doom
### By Ryan Dickson
### Edited by Mohinder Dick
> References go here for presentation version. 

In [None]:
from pyspark.sql.functions import *
from dateutil.parser import parse
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import StringIndexer, HashingTF, VectorAssembler, IDF
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

## Verify Spark

Launching the notebook provides the Spark context in the variable *sc*. You can use the context to preview the configuration.

In [None]:
print sc._conf.getAll()
test = sc.parallelize(range(1000))
print '\nsum of 1 to 1000: ', test.reduce(lambda a, b: a+b )

## Get the training data

In [None]:
# Pick the training CSV location: HDFS when running on YARN, otherwise a
# local file path.
sparkMaster = sc._conf.get('spark.master')
if sparkMaster in ('yarn-client', 'yarn-cluster'):
    urn = 'hdfs://sparkdl04:8020/palooza/data/visit_train_panda.csv'
else:
    urn = 'file:///C:/Users/dickm/Documents/Projects/ML/Source/UPMC/Pharmacy/visit_train_panda.csv'

rawChargesRDD = sc.textFile(urn)

# The first line of the file is the header row; keep it so it can be
# filtered out of the data lines.
header = rawChargesRDD.take(1)[0]
chargesRDD = rawChargesRDD.filter(lambda row: row != header)

# Strip the double quotes and split each remaining line on commas.
chargesRDDSplit = chargesRDD.map(lambda row: row.replace('"', '').split(','))
chargesRDDSplit.cache()

## Get the test data

Mo knows where it's at!

In [None]:
# Pick the test CSV location: HDFS when running on YARN, otherwise a local
# file path.
if sc._conf.get('spark.master') == 'yarn-client' or sc._conf.get('spark.master') == 'yarn-cluster':
    urnTest = 'hdfs://sparkdl04:8020/palooza/data/validate/visit_test_panda.csv'
else:
    urnTest = 'file:///C:/Users/dickm/Documents/Projects/ML/Source/UPMC/Pharmacy/visit_test_panda.csv'

chargesRDDTEST = sc.textFile(urnTest)

# Filter and split the TEST lines (not the training RDD), so the evaluation
# really runs on the held-out file.
# NOTE(review): `header` is the header line captured from the training file;
# this assumes the test file starts with the same header row -- confirm.
chargesRDDTEST = chargesRDDTEST.filter(lambda line: line!=header)
chargesRDDSplitTEST = chargesRDDTEST.map(lambda line: line.replace('"', '').split(','))
chargesRDDSplitTEST.cache()

In [None]:
#look at the data
print 'The number of training records is ', chargesRDD.count()

### Raw Features

* VisitID - Identifier for patient visit.
* Hospital - Admitting hospital.
* Dept_Code - department code.
* PaymentType - I am guessing a payment type for visit.
* Age - Age of the patient in years.
* Race - De-identified race of the patient.
* Gender - Gender ("M" - male, "F" - female)
* FC - ?
* ArriveDate - Date of admission. 
* DischargeDate - Date of discharge
* LOS - length of patient stay in days.
* DXCODE - Diagnosis code.
* Description - Description of diagnosis
* DispenseID - ?
* DOC - ?

## Build Random Forest Model
### We want to predict the length of stay (LOS) given the patient demographics, dxcode and deptcode, and day of the week admitted.

Assumptions of feature relevance...
* Dept_code categorical, some departments would have more serious patients than others
* Day of the week, Patients admitted over weekend may require a longer length of stay to be seen by necessary staff
* Dxcode, multiple per patient, may need to use PCA to reduce number (PCA is not always good before random forest, see references)
* Demographics: age, gender and race 


We go through the following pipeline:
* Encode/Extract features
* Train the model
* Evaluate the model on unseen data
* Draw conclusions and make recommendations

We should not need to normalize the features for Random Forest, but bucketing may be needed to help with outliers

In [None]:
# Group the diagnosis codes (column 11) by visit id (column 0).
# groupByKey() already yields exactly one pair per key, so no distinct() is
# needed on the grouped pairs; dropping it avoids an extra shuffle.
dxcodeRDD = chargesRDDSplit.map(lambda line: (line[0], line[11])).groupByKey().cache()

# Flatten the per-visit groups and de-duplicate to get the set of codes that
# occur anywhere in the training data.
dxCodes = dxcodeRDD.values().flatMap(list).distinct()
dxCodeCount = dxCodes.count()

In [None]:
def merge(x, y):
    """Combine two per-visit records into one.

    Used as the fold function of ``foldByKey(None, merge)``, so it is called
    both to add a single row to an accumulator and to combine two partial
    accumulators. It therefore has to treat ``x`` and ``y`` symmetrically.

    Parameters:
        x: accumulated record (dict) for the visit, or ``None`` for the
           fold's zero value.
        y: record (dict) to merge in; may itself be an already-merged record.

    Returns:
        ``y`` when ``x`` is ``None``, ``x`` when ``y`` is ``None``, otherwise
        a new dict holding ``x``'s fields with
        * ``dxCount`` = ``x['dxCount'] + y['dxCount']`` and
        * ``dxcode``  = the sorted union of both code lists (a missing
          ``dxcode`` key is treated as an empty list).
        Neither input is modified.
    """
    if x is None:
        return y
    if y is None:
        return x

    # Copy instead of mutating the accumulator in place, so records handed to
    # the fold are never changed behind the caller's back.
    merged = dict(x)

    # Add the other side's full count: adding a constant 1 undercounts when
    # `y` is itself the result of earlier merges.
    merged['dxCount'] = x['dxCount'] + y['dxCount']

    # De-duplicate the codes; sorting keeps the result deterministic.
    merged['dxcode'] = sorted(set(x.get('dxcode', [])) | set(y.get('dxcode', [])))
    return merged


def _toVisitRecord(line):
    """Turn one split CSV row into a ``(visitId, record)`` pair.

    Column positions follow the raw feature list: 0 VisitID, 1 Hospital,
    2 Dept_Code, 4 Age, 5 Race, 6 Gender, 7 FC, 8 ArriveDate, 10 LOS,
    11 DXCODE.
    """
    # Parse the arrival date once and reuse it for both derived fields.
    arrived = parse(line[8])
    return (line[0], dict(
        los=float(line[10]),
        age=int(line[4]),
        hospital_visit=line[1],
        dept_code=line[2],
        race=line[5],
        gender_female=1 if line[6]=='F' else 0,    #Encode gender as boolean
        dxcode=[line[11]],
        admit_day=arrived.weekday(),
        admit_month=arrived.month,
        dxCount=1,
        fc=line[7]
    ))


def mapAndFold(rdd):
    """Collapse an RDD of split CSV rows into one record per visit.

    Parameters:
        rdd: RDD whose elements are lists of column strings (one per CSV row).

    Returns:
        A pair RDD keyed by visit id (column 0). Each value is a dict built
        by ``_toVisitRecord``; rows sharing a visit id are combined with
        ``merge`` via ``foldByKey`` (zero value ``None``).
    """
    return rdd.map(_toVisitRecord).foldByKey(None, merge)


In [None]:
# Fold the training rows into one record per visit, then drop the visit-id
# keys and turn the record dicts into a DataFrame.
chargesByVisitRDD = mapAndFold(chargesRDDSplit)
visitRecords = chargesByVisitRDD.values()
df = visitRecords.toDF()

In [None]:
# Fold the TEST rows into one record per visit and build the test DataFrame
# from that folded test RDD (not from the training RDD).
chargesByVisitRDDTEST = mapAndFold(chargesRDDSplitTEST)
dfTEST = chargesByVisitRDDTEST.values().toDF()

In [None]:
# Categorical string columns paired with the index column each one is
# encoded into.
indexedColumns = [
    ("hospital_visit", "hospitalIndex"),
    ("dept_code", "deptIndex"),
    ("race", "raceIndex"),
]
stringIndexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in indexedColumns
]

# Hash each visit's list of dx codes into a vector with twice as many slots
# as dxCodeCount, then re-weight that vector with IDF.
hashingTF = HashingTF(numFeatures=2*dxCodeCount, inputCol="dxcode", outputCol="dxCodes")
idf = IDF(inputCol="dxCodes", outputCol="idfDxCodes", minDocFreq=10)

# Indexers first, then the hashing and IDF stages.
mungeStages = stringIndexers + [hashingTF, idf]
mungePipeline = Pipeline(stages=mungeStages)

# Fit on the training DataFrame and apply the fitted pipeline to it.
mungingModel = mungePipeline.fit(df)
trainingData = mungingModel.transform(df)

In [None]:
# Apply the pipeline model that was fitted on the training DataFrame to the
# test DataFrame, so both sets share the same encodings.
testingData = mungingModel.transform(dfTEST)

## Munge it!
StringIndexers need to have seen every value, so we will need to fit on the combined training and test data if unseen labels are present.


In [None]:
# Columns packed, in this order, into the single "features" vector.
featureColumns = [
    "age",
    "deptIndex",
    "gender_female",
    "raceIndex",
    "hospitalIndex",
    "admit_day",
    "admit_month",
    "dxCount",
    "idfDxCodes",
]
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")

# Keep only the assembled feature vector and the label column.
assembledTrainingDF = assembler.transform(trainingData)
transformedTrainingDF = assembledTrainingDF.select('features', 'los')

In [None]:
# Assemble the test features with the same assembler and keep only the
# feature vector and the `los` label.
transformedTestingDF = assembler.transform(testingData).select('features','los')

### Train Model
Now we train the random forest on the assembled training features. We pass a fixed ***seed*** to the regressor so that the trained forest is repeatable between runs.

In [None]:
# Configure the forest in one place. The hyper-parameter values are
# hand-picked (magic numbers?).
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="los",
    maxBins=1000,
    seed=1234,
    numTrees=100,
    maxDepth=10,  # Max of Spark is 30
    minInstancesPerNode=5,
    featureSubsetStrategy='all'
)

# Train on the assembled training features.
model = rf.fit(transformedTrainingDF)

### Evaluate Model
We evaluate the model on the unseen dataset that was not used to train the model. For reference, here are the stats for los in the training data.

In [None]:
# Show summary statistics for the `los` column of the training DataFrame.
df.describe('los').show()

In [None]:
# Score the test features, keeping only the true label and the prediction.
scoredTestDF = model.transform(transformedTestingDF)
predictions = scoredTestDF.select('los', 'prediction')

# Compute the root mean squared error between `los` and `prediction`.
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="los", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)