# Project COVID19 Prediction Modeling with Logistic Regression and Decision Trees 2

## By Zachary Wing

## Initialize Spark and create a session 

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession\
    .builder\
    .appName("CovidDataSets")\
    .getOrCreate()

display("Setup Complete")


'Setup Complete'

## Import Data

We uploaded the entire model data here

In [22]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import RFormula

#
# Read in the data file and show the schema.   This is the file with the 10,000 recrods.  
# ADJUST PATH AND FILE NAME APPROPRIATELY FOR YOUR TESTING 
# 

bInput = spark.read.format("csv")\
.option("header","true")\
.option("inferSchema","true")\
.load("Model_Data.csv")

print("Count of Records in File:  " + str(bInput.count()))
print()
print("SCHEMA:")
bInput.printSchema()
print()
print("DATA FROM CSV FILE:")
bInput.show(20)


Count of Records in File:  2182

SCHEMA:
root
 |-- state: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- Rev_Age: integer (nullable = true)
 |-- Patient_Contact: integer (nullable = true)
 |-- Have_Onset_Date: integer (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- patient_id: long (nullable = true)
 |-- country: string (nullable = true)
 |-- province: string (nullable = true)
 |-- city: string (nullable = true)
 |-- infection_case: string (nullable = true)
 |-- infected_by: long (nullable = true)
 |-- symptom_onset_date: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- confirmed_date: string (nullable = true)
 |-- released_date: string (nullable = true)


DATA FROM CSV FILE:
+--------+------+-------+---------------+---------------+--------+----------+-------+--------+------------+--------------------+-----------+------------------+----+--------------+-------------+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|pati

## Data Selection

We Chose to use the same columns that we used in Project 2 but we also kept 'province' and 'confirmed_date'

In [23]:
covid = bInput.select('state', 'sex', 'Rev_Age', 'Patient_Contact', 'Have_Onset_Date', 'Duration', 'province', 'confirmed_date')


print("Count of Records in File:  " + str(covid.count()))
print()
print("SCHEMA:")
covid.printSchema()
print()
print("DATA FROM CSV FILE:")
covid.show(20)

Count of Records in File:  2182

SCHEMA:
root
 |-- state: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- Rev_Age: integer (nullable = true)
 |-- Patient_Contact: integer (nullable = true)
 |-- Have_Onset_Date: integer (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- province: string (nullable = true)
 |-- confirmed_date: string (nullable = true)


DATA FROM CSV FILE:
+--------+------+-------+---------------+---------------+--------+--------+--------------+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|province|confirmed_date|
+--------+------+-------+---------------+---------------+--------+--------+--------------+
|released|  male|     50|              0|              1|      13|   Seoul|       1/23/20|
|released|  male|     30|              0|              0|      32|   Seoul|       1/30/20|
|released|  male|     50|              1|              0|      20|   Seoul|       1/30/20|
|released|  male|     20|              0|          

## Split Data

In [24]:

#
# This sets up the data to run through the model.   
# The columns "features" and "label" are added to the existing data set.
# These columns provide the data (vectors and label) needed for processing the data
# 

supervised = RFormula(formula="state ~ .")
fittedRF = supervised.fit(covid)
preparedDF = fittedRF.transform(covid)
print()
print("DATA AFTER FITTING:")
preparedDF.show(30)

# 
# Split the data into a Training set (70%) and Test data (30%)
# Set up and run the logistic regression 
# 

trainingData, testData = preparedDF.randomSplit([0.7, 0.3], 5) 




DATA AFTER FITTING:
+--------+------+-------+---------------+---------------+--------+--------+--------------+--------------------+-----+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|province|confirmed_date|            features|label|
+--------+------+-------+---------------+---------------+--------+--------+--------------+--------------------+-----+
|released|  male|     50|              0|              1|      13|   Seoul|       1/23/20|(85,[1,3,4,6,82],...|  1.0|
|released|  male|     30|              0|              0|      32|   Seoul|       1/30/20|(85,[1,4,6,75],[3...|  1.0|
|released|  male|     50|              1|              0|      20|   Seoul|       1/30/20|(85,[1,2,4,6,75],...|  1.0|
|released|  male|     20|              0|              1|      16|   Seoul|       1/30/20|(85,[1,3,4,6,75],...|  1.0|
|released|female|     20|              1|              0|      24|   Seoul|       1/31/20|(85,[0,1,2,4,6,72...|  1.0|
|released|female|     50|          

## LOGISTIC REGRESSION 

In [25]:
lr = LogisticRegression()
lrModel = lr.fit(trainingData)
displaySummary = lrModel.summary

#
# To get the parameters associated with the model, un-comment the "explainParams" below. 
# 
# print(lr.explainParams())

#
# Get the resulting coefficients and Y intercept 
# 

print ("Coefficients = " + str(lrModel.coefficients))
print ("Y intercept = " + str(lrModel.intercept)) 

trainingSummary = lrModel.summary

#
# Obtain the ROC using the "AREA UNDER THE CURVE"
# 
print("")
print("areaUnderROC: " + str(displaySummary.areaUnderROC))
print("")
print('ROC Values')
displaySummary.roc.show()

Coefficients = [0.6399410029122297,-0.026502610142830767,-0.25027579284135754,-1.3815530458879628,-0.8758197453866059,12.38603277702386,32.1073625495697,29.37853438754243,13.647003700722081,37.13405239429356,36.540725054458456,9.954329982025738,18.12704093924284,10.025977413810292,12.860227703273523,25.6951459771725,34.470600184287846,19.724014541023077,12.536718153211323,33.18894393942711,3.134306765652094,32.48399409906083,34.78487918870409,35.66298523256887,35.54711344163335,3.3484749083453615,3.175936314276228,35.188326343046064,37.722663646194796,18.31106670249446,35.84423183352662,36.95022145768533,17.81642885551227,-7.214129454721088,35.16761026794457,0.5759549107195292,37.240671648312514,13.284450339490384,33.12332365238555,-8.021837275863529,15.624423556430612,-6.461836068739912,12.18777441942133,34.411852168266506,-5.740128907094946,34.426747180355726,8.356052200876544,37.27158308644432,-5.496925834055324,8.48517520997022,34.08307900676792,-4.659920371356337,34.38033524933127

### LOGISTIC REGRESSION Cross Validation with TEST Data

Now we will run the Logistic Regression with the Test data set and review the evaluation data. 

In [26]:
# 
# RUN THE TEST DATA 
# 

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import confusion_matrix

testData.show(100)
predictions = lrModel.transform(testData)
testDF = predictions

#
# Create the testDF for output to a CSV file for later comparison. 
#
testDF = testDF.select("state", "sex", "Rev_Age", "Patient_Contact", "Have_Onset_Date", "Duration", "province", 'confirmed_date')

testDF.show()

#
# Create an Evaluator for binary classification, which expects two input columns: rawPrediction and label.
# Evaluates predictions and returns a scalar metric areaUnderROC(larger is better but < 1).
#

evaluator = BinaryClassificationEvaluator()\
  .setLabelCol("label")\
  .setRawPredictionCol("rawPrediction")\
  .setMetricName("areaUnderROC")





+--------+------+-------+---------------+---------------+--------+-----------------+--------------+--------------------+-----+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|         province|confirmed_date|            features|label|
+--------+------+-------+---------------+---------------+--------+-----------------+--------------+--------------------+-----+
|isolated|female|      0|              1|              1|     100|            Seoul|        3/8/20|(85,[0,2,3,4,6,55...|  0.0|
|isolated|female|     10|              0|              0|     100|          Daejeon|       3/30/20|(85,[0,1,4,14,43]...|  0.0|
|isolated|female|     10|              0|              0|     100|          Gwangju|       3/31/20|(85,[0,1,4,17,25]...|  0.0|
|isolated|female|     10|              0|              0|     100|      Gyeonggi-do|       3/26/20|(85,[0,1,4,5,44],...|  0.0|
|isolated|female|     10|              0|              0|     100|            Seoul|       3/23/20|(85,[0,1,4,6

+--------+------+-------+---------------+---------------+--------+----------------+--------------+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|        province|confirmed_date|
+--------+------+-------+---------------+---------------+--------+----------------+--------------+
|isolated|female|      0|              1|              1|     100|           Seoul|        3/8/20|
|isolated|female|     10|              0|              0|     100|         Daejeon|       3/30/20|
|isolated|female|     10|              0|              0|     100|         Gwangju|       3/31/20|
|isolated|female|     10|              0|              0|     100|     Gyeonggi-do|       3/26/20|
|isolated|female|     10|              0|              0|     100|           Seoul|       3/23/20|
|isolated|female|     10|              0|              1|     100|Gyeongsangbuk-do|       2/26/20|
|isolated|female|     10|              1|              0|     100|     Gyeonggi-do|       2/29/20|
|isolated|

### Logistic Regression Confusion Matrix

In [27]:
accuracy = evaluator.evaluate(predictions)
#print("Test Error = %g " % (1.0 - accuracy))
#print("Accuracy = %g " % accuracy)


#
# Obtain data to build Confusion Matrix 
#

truePos = predictions.select("state").where("state = 'released' AND prediction = 1").count()
falsePos = predictions.select("state").where("state = 'isolated' AND prediction = 1").count()
trueNeg = predictions.select("state").where("state = 'isolated' AND prediction = 0").count()
falseNeg = predictions.select("state").where("state = 'released' AND prediction = 0").count()

print()


print("truePos = ", truePos )
print("falsePos = ", falsePos )
print("trueNeg = ", trueNeg )
print("falseNeg = ", falseNeg)

print() 
print("CONFUSION MATRIX:")
print("                   Predicted")
print("                Y=1    |   Y=0")
print(" Actual   y=1   " + str(truePos) + "        " + str(falseNeg))
print("          y=0   " + str(falsePos) + "         " + str(trueNeg)) 

#
# To get the parameters associated with the model, un-comment the "explainParams" below. 
# 
# print(lr.explainParams())


print("Test Error = %g " % ((falsePos+falseNeg)/(falsePos+falseNeg+truePos+trueNeg)))
print("Accuracy = %g " % ((truePos+trueNeg)/(falsePos+falseNeg+truePos+trueNeg)))




truePos =  190
falsePos =  10
trueNeg =  466
falseNeg =  5

CONFUSION MATRIX:
                   Predicted
                Y=1    |   Y=0
 Actual   y=1   190        5
          y=0   10         466
Test Error = 0.0223547 
Accuracy = 0.977645 


## DESCISION TREE - Test Data 
The actual execution of the Decision Tree model is relatively straightforward.   There is a little prep required for the data so that we will have a label and vectors **indexed**.  
* Similar to Logistic Regresion, the data is read in and the "label" and "features" columns are created. 
* The data is then converted into Indexes needed for the Decision Tree model to run 

In [28]:
from pyspark import SparkContext, SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import RFormula




labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(preparedDF)
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(preparedDF)
(trainingDataDT, testDataDT) = preparedDF.randomSplit([0.7, 0.3])

# print("AFTER INDEXING:")
#
# Create a decision tree classifier that will process the columns created above and run the model. 
# 
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
modelDT = pipeline.fit(trainingDataDT)

# Make predictions.
predictionsDT = modelDT.transform(testDataDT)

print()
print("PREDICTIONS DATA:")
#predictions.select('sex', 'Rev_Age', 'Patient_Contact', 'Have_Onset_Date', 'Duration', 'rawPrediction', 'prediction', 'probability').show()

testDFdt = predictionsDT.select("state", "sex", "Rev_Age", "Patient_Contact", "Have_Onset_Date", "Duration", "prediction")
testDFdt.show()


testDFdt.write.format("csv").mode('overwrite')\
.option("header","true")\
.save("decision_tree_dataframe.csv")





PREDICTIONS DATA:
+--------+------+-------+---------------+---------------+--------+----------+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|prediction|
+--------+------+-------+---------------+---------------+--------+----------+
|isolated|female|      0|              0|              0|     100|       0.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              1|              0|     100|       0.0|
|isolated|female|     10|              1|              1|     100|       0.0|
|isolated|female|     20|              0|              0|     100|       0.0|
|isolated|female|     20|              0|              0|     100|       0.0|
|isolated|female|     20|              0|    

### Decision Tree Confusion Matrix

In [29]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionsDT)
#print("Test Error = %g " % (1.0 - accuracy))
#print("Accuracy = %g " % accuracy)

truePosDT = predictionsDT.select("state").where("state = 'released' AND prediction = 1").count()
falsePosDT = predictionsDT.select("state").where("state = 'isolated' AND prediction = 1").count()
trueNegDT = predictionsDT.select("state").where("state = 'isolated' AND prediction = 0").count()
falseNegDT = predictionsDT.select("state").where("state = 'released' AND prediction = 0").count()


print()


print("truePos = ", truePosDT )
print("falsePos = ", falsePosDT )
print("trueNeg = ", trueNegDT )
print("falseNeg = ", falseNegDT)

print() 
print("CONFUSION MATRIX:")
print("                   Predicted")
print("                Y=1    |   Y=0")
print(" Actual   y=1   " + str(truePosDT) + "        " + str(falseNegDT))
print("          y=0   " + str(falsePosDT) + "         " + str(trueNegDT)) 


print("Test Error = %g " % ((falsePosDT+falseNegDT)/(falsePosDT+falseNegDT+truePosDT+trueNegDT)))
print("Accuracy = %g " % ((truePosDT+trueNegDT)/(falsePosDT+falseNegDT+truePosDT+trueNegDT)))





truePos =  194
falsePos =  4
trueNeg =  431
falseNeg =  25

CONFUSION MATRIX:
                   Predicted
                Y=1    |   Y=0
 Actual   y=1   194        25
          y=0   4         431
Test Error = 0.0443425 
Accuracy = 0.955657 


## Conclusion Part 1 Comparison to Project 2

I ran the exact same calculations as I did in Project 2 of Logistic Regression and Decision Tree. However, I add the province and the confirmed sick date to this dataset. Both the Logistic Regression and the Decision Tree for this dataset had a higher accuracy and lower error in predictions. In this data set the Logistic Regression was more accurate than the Decision Tree.

## Switch Response Variable

in this seciton we are going to switch the response variable from the 'state' to the 'sex' of the patient.

## Split Data Part 2

In [30]:

#
# This sets up the data to run through the model.   
# The columns "features" and "label" are added to the existing data set.
# These columns provide the data (vectors and label) needed for processing the data
# 

supervised = RFormula(formula="sex ~ .")
fittedRF = supervised.fit(covid)
preparedDF = fittedRF.transform(covid)
print()
print("DATA AFTER FITTING:")
preparedDF.show(30)

# 
# Split the data into a Training set (70%) and Test data (30%)
# Set up and run the logistic regression 
# 

trainingData, testData = preparedDF.randomSplit([0.7, 0.3], 5) 




DATA AFTER FITTING:
+--------+------+-------+---------------+---------------+--------+--------+--------------+--------------------+-----+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|province|confirmed_date|            features|label|
+--------+------+-------+---------------+---------------+--------+--------+--------------+--------------------+-----+
|released|  male|     50|              0|              1|      13|   Seoul|       1/23/20|(85,[1,3,4,6,82],...|  1.0|
|released|  male|     30|              0|              0|      32|   Seoul|       1/30/20|(85,[1,4,6,75],[3...|  1.0|
|released|  male|     50|              1|              0|      20|   Seoul|       1/30/20|(85,[1,2,4,6,75],...|  1.0|
|released|  male|     20|              0|              1|      16|   Seoul|       1/30/20|(85,[1,3,4,6,75],...|  1.0|
|released|female|     20|              1|              0|      24|   Seoul|       1/31/20|(85,[1,2,4,6,72],...|  0.0|
|released|female|     50|          

## LOGISTIC REGRESSION Part 2

In [31]:
lr = LogisticRegression()
lrModel = lr.fit(trainingData)
displaySummary = lrModel.summary

#
# To get the parameters associated with the model, un-comment the "explainParams" below. 
# 
# print(lr.explainParams())

#
# Get the resulting coefficients and Y intercept 
# 

print ("Coefficients = " + str(lrModel.coefficients))
print ("Y intercept = " + str(lrModel.intercept)) 

trainingSummary = lrModel.summary

#
# Obtain the ROC using the "AREA UNDER THE CURVE"
# 
print("")
print("areaUnderROC: " + str(displaySummary.areaUnderROC))
print("")
print('ROC Values')
displaySummary.roc.show()

Coefficients = [0.48813715612913067,-0.01156125684120185,0.06827504191074643,0.040397943773924985,-0.004670340175240567,1.109856938984425,1.116841579428748,0.9934568922486144,0.6174768563200163,1.5665811476137084,1.586331687847308,0.9088678448524021,1.800289575739157,1.0580580311480192,1.1295938456376662,0.7036641221901972,1.3035907296482518,1.405249804956088,2.404436080994301,1.0243014046426577,-0.26611154623212396,7.19711170318372,7.799386756409984,8.179455558602875,8.553973396524936,7.633728696608021,8.462282851575846,8.336203490763504,8.39416535188146,7.6380992157770775,7.4264988350008965,8.188435434759453,7.8627815244609955,7.956624131911796,7.303031805819307,8.301209067916739,7.564513860144862,7.266512543278602,7.835665139507113,7.650338762059154,7.833607707649667,8.68919842510396,8.492291293559147,7.99358441447382,7.726064898079559,8.371567407155949,8.097855496290128,8.554228460346438,7.549061337619632,7.542781105409881,8.637033463527999,8.079443434842627,8.346302465946868,8.774

### LOGISTIC REGRESSION Cross Validation with TEST Data Part 2

Now we will run the Logistic Regression with the Test data set and review the evaluation data. 

In [32]:
# 
# RUN THE TEST DATA 
# 

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import confusion_matrix

testData.show(100)
predictions = lrModel.transform(testData)
testDF = predictions

#
# Create the testDF for output to a CSV file for later comparison. 
#
testDF = testDF.select("state", "sex", "Rev_Age", "Patient_Contact", "Have_Onset_Date", "Duration", "province", 'confirmed_date')

testDF.show()


#
# Create an Evaluator for binary classification, which expects two input columns: rawPrediction and label.
# Evaluates predictions and returns a scalar metric areaUnderROC(larger is better but < 1).
#

evaluator = BinaryClassificationEvaluator()\
  .setLabelCol("label")\
  .setRawPredictionCol("rawPrediction")\
  .setMetricName("areaUnderROC")





+--------+------+-------+---------------+---------------+--------+-----------------+--------------+--------------------+-----+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|         province|confirmed_date|            features|label|
+--------+------+-------+---------------+---------------+--------+-----------------+--------------+--------------------+-----+
|isolated|female|      0|              1|              1|     100|            Seoul|        3/8/20|(85,[0,2,3,4,6,55...|  0.0|
|isolated|female|     10|              0|              0|     100|          Daejeon|       3/30/20|(85,[0,1,4,14,43]...|  0.0|
|isolated|female|     10|              0|              0|     100|          Gwangju|       3/31/20|(85,[0,1,4,17,25]...|  0.0|
|isolated|female|     10|              0|              0|     100|      Gyeonggi-do|       3/26/20|(85,[0,1,4,5,44],...|  0.0|
|isolated|female|     10|              0|              0|     100|            Seoul|       3/23/20|(85,[0,1,4,6

### Logistic Regression Confusion Matrix Part 2

In [33]:
accuracy = evaluator.evaluate(predictions)
#print("Test Error = %g " % (1.0 - accuracy))
#print("Accuracy = %g " % accuracy)


#
# Obtain data to build Confusion Matrix 
#

truePos = predictions.select("state").where("state = 'released' AND prediction = 1").count()
falsePos = predictions.select("state").where("state = 'isolated' AND prediction = 1").count()
trueNeg = predictions.select("state").where("state = 'isolated' AND prediction = 0").count()
falseNeg = predictions.select("state").where("state = 'released' AND prediction = 0").count()

print()


print("truePos = ", truePos )
print("falsePos = ", falsePos )
print("trueNeg = ", trueNeg )
print("falseNeg = ", falseNeg)

print() 
print("CONFUSION MATRIX:")
print("                   Predicted")
print("                Y=1    |   Y=0")
print(" Actual   y=1   " + str(truePos) + "        " + str(falseNeg))
print("          y=0   " + str(falsePos) + "         " + str(trueNeg)) 

#
# To get the parameters associated with the model, un-comment the "explainParams" below. 
# 
# print(lr.explainParams())


print("Test Error = %g " % ((falsePos+falseNeg)/(falsePos+falseNeg+truePos+trueNeg)))
print("Accuracy = %g " % ((truePos+trueNeg)/(falsePos+falseNeg+truePos+trueNeg)))




truePos =  72
falsePos =  160
trueNeg =  316
falseNeg =  123

CONFUSION MATRIX:
                   Predicted
                Y=1    |   Y=0
 Actual   y=1   72        123
          y=0   160         316
Test Error = 0.421759 
Accuracy = 0.578241 


## DESCISION TREE - Test Data Part 2
The actual execution of the Decision Tree model is relatively straightforward.   There is a little prep required for the data so that we will have a label and vectors **indexed**.  
* Similar to Logistic Regresion, the data is read in and the "label" and "features" columns are created. 
* The data is then converted into Indexes needed for the Decision Tree model to run 

In [34]:
from pyspark import SparkContext, SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import RFormula




labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(preparedDF)
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(preparedDF)
(trainingDataDT, testDataDT) = preparedDF.randomSplit([0.7, 0.3])

# print("AFTER INDEXING:")
#
# Create a decision tree classifier that will process the columns created above and run the model. 
# 
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
modelDT = pipeline.fit(trainingDataDT)

# Make predictions.
predictionsDT = modelDT.transform(testDataDT)

print()
print("PREDICTIONS DATA:")
#predictions.select('sex', 'Rev_Age', 'Patient_Contact', 'Have_Onset_Date', 'Duration', 'rawPrediction', 'prediction', 'probability').show()

testDFdt = predictionsDT.select("state", "sex", "Rev_Age", "Patient_Contact", "Have_Onset_Date", "Duration", "prediction")
testDFdt.show()



testDFdt.write.format("csv").mode('overwrite')\
.option("header","true")\
.save("decision_tree_dataframe_part2.csv")





PREDICTIONS DATA:
+--------+------+-------+---------------+---------------+--------+----------+
|   state|   sex|Rev_Age|Patient_Contact|Have_Onset_Date|Duration|prediction|
+--------+------+-------+---------------+---------------+--------+----------+
|isolated|female|      0|              1|              0|     100|       1.0|
|isolated|female|      0|              1|              0|     100|       1.0|
|isolated|female|      0|              1|              0|     100|       1.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              0|              0|     100|       0.0|
|isolated|female|     10|              1|              0|     100|       1.0|
|isolated|female|     10|              1|              0|     100|       1.0|
|isolated|female|     10|              1|    

### Decision Tree Confusion Matrix Part 2

In [35]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionsDT)
#print("Test Error = %g " % (1.0 - accuracy))
#print("Accuracy = %g " % accuracy)

truePosDT = predictionsDT.select("state").where("state = 'released' AND prediction = 1").count()
falsePosDT = predictionsDT.select("state").where("state = 'isolated' AND prediction = 1").count()
trueNegDT = predictionsDT.select("state").where("state = 'isolated' AND prediction = 0").count()
falseNegDT = predictionsDT.select("state").where("state = 'released' AND prediction = 0").count()


print()


print("truePos = ", truePosDT )
print("falsePos = ", falsePosDT )
print("trueNeg = ", trueNegDT )
print("falseNeg = ", falseNegDT)

print() 
print("CONFUSION MATRIX:")
print("                   Predicted")
print("                Y=1    |   Y=0")
print(" Actual   y=1   " + str(truePosDT) + "        " + str(falseNegDT))
print("          y=0   " + str(falsePosDT) + "         " + str(trueNegDT)) 


print("Test Error = %g " % ((falsePosDT+falseNegDT)/(falsePosDT+falseNegDT+truePosDT+trueNegDT)))
print("Accuracy = %g " % ((truePosDT+trueNegDT)/(falsePosDT+falseNegDT+truePosDT+trueNegDT)))





truePos =  89
falsePos =  182
trueNeg =  271
falseNeg =  122

CONFUSION MATRIX:
                   Predicted
                Y=1    |   Y=0
 Actual   y=1   89        122
          y=0   182         271
Test Error = 0.457831 
Accuracy = 0.542169 


## Conclusion Part 2

I ran the Logistic Regression and the Decision Tree multiple times for part 2 of this project where I switched the response variable to 'sex' variable. I had both situations where Logistic Regression was better and where Decision Tree was better. This leads to the conclusion that there is a very different outcome depending on the test data. In addition the accuracy of these for all the times we tried was between 50% and 65%. This means that you relatively get the same results with random guessing as there is only possible responses for 'sex'. With these predictors, there is not a strong prediciton method to predict the 'sex' variable. To predict the 'sex' variable I believe that the predictors should be more quantitative predictors such as heart rate, blood pressure, breathing rate, oxygen saturation, transmisiblity, and servity of symptoms.