# Logistic Regression Model

In [11]:
import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LR').getOrCreate()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorSlicer, VectorAssembler, ChiSqSelector, VectorIndexer, UnivariateFeatureSelector, VarianceThresholdSelector
from pyspark.sql.functions import *
import numpy as np
from pyspark.sql.types import IntegerType
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import RegressionMetrics

In [7]:
df = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/medications.csv').select('PATIENT','Code', 'Description')
deathDf = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/patients.csv').select('Id', 'DEATHDATE')
deadSet = df.join(deathDf, (df.PATIENT == deathDf.Id)).na.drop().drop('Id', 'Code')
labels = spark.read.option("header",True).csv('../FeatureSelection/dfCovid_DeceasedCovid.csv').select('PATIENT', 'covid-19', 'deceased & covid-19')

merged = df.join(deathDf, (df.PATIENT == deathDf.Id), 'left').drop( 'Id')

merged = merged.withColumn('deceased', when(col('DEATHDATE').isNotNull(), 1)).na.fill(0)
merged = merged.join(labels, ('PATIENT'), 'left').dropDuplicates()

                                                                                

In [8]:
groupedDf = merged.groupBy("PATIENT", 'Code').pivot("Code").agg(count("Code").alias("count")).na.fill(0)
merged =merged.select('PATIENT', 'deceased', 'covid-19', 'deceased & covid-19')
finalDf = groupedDf.join(merged, ['PATIENT'], 'left')
cols = list(set(finalDf.columns) - {'PATIENT', 'deceased', 'Code', 'Description', 'covid-19', 'deceased & covid-19'})
assembler = VectorAssembler().setInputCols(cols).setOutputCol('features')
finalDf = finalDf.withColumn("covid-19", finalDf["covid-19"].cast(IntegerType())).withColumn("deceased & covid-19", finalDf["deceased & covid-19"].cast(IntegerType()))
df = assembler.transform(finalDf)
df.printSchema()
df = df.dropna()

root
 |-- PATIENT: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- 1000126: long (nullable = true)
 |-- 1014676: long (nullable = true)
 |-- 1014678: long (nullable = true)
 |-- 1043400: long (nullable = true)
 |-- 1049221: long (nullable = true)
 |-- 1049630: long (nullable = true)
 |-- 1049635: long (nullable = true)
 |-- 105078: long (nullable = true)
 |-- 105585: long (nullable = true)
 |-- 106258: long (nullable = true)
 |-- 106892: long (nullable = true)
 |-- 1091392: long (nullable = true)
 |-- 1094107: long (nullable = true)
 |-- 1100184: long (nullable = true)
 |-- 1114085: long (nullable = true)
 |-- 1116758: long (nullable = true)
 |-- 1190795: long (nullable = true)
 |-- 1234995: long (nullable = true)
 |-- 1359133: long (nullable = true)
 |-- 1363309: long (nullable = true)
 |-- 1367439: long (nullable = true)
 |-- 141918: long (nullable = true)
 |-- 1534809: long (nullable = true)
 |-- 1599803: long (nullable = true)
 |-- 1601380: long (nullable = true)


## Chi-Sq Selector

Deceased

In [4]:
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased")
chiResult = selector.fit(df).transform(df)

23/04/03 15:09:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
# stratified split
class0 = df.filter(df["deceased"]==0)
class1 = df.filter(df["deceased"]==1)
print("Class 0 (deceased= 0): ", class0.count())
print("Class 1 (deceased= 1): ", class1.count())
(train, test) = chiResult.randomSplit([0.8, 0.2])

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'features')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)


                                                                                

Class 0 (deceased= 0):  181515


                                                                                

Class 1 (deceased= 1):  119930


23/04/03 15:10:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (169,[],[])
Intercept:  -0.41285088322284175


                                                                                

In [6]:
(train, test) = chiResult.randomSplit([0.8, 0.2])

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'selectedFeatures')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)


23/04/03 15:10:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:10:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:11:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (10,[],[])
Intercept:  -0.4150127358695301


                                                                                

Covid-19

In [15]:
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="covid-19")
chiResult = selector.fit(df).transform(df)

                                                                                

In [16]:
(train, test) = chiResult.randomSplit([0.8, 0.2])

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'features')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)


23/04/03 15:17:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:17:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:18:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (169,[],[])
Intercept:  -0.41288964306645576


                                                                                

In [17]:
(train, test) = chiResult.randomSplit([0.8, 0.2])

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'selectedFeatures')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)


23/04/03 15:18:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:18:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:19:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (10,[],[])
Intercept:  -0.413241487572899


                                                                                

Deceased & Covid-19

In [18]:
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased & covid-19")
chiResult = selector.fit(df).transform(df)

                                                                                

In [19]:
(train, test) = chiResult.randomSplit([0.8, 0.2])

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'features')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)


23/04/03 15:20:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:20:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:21:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (169,[],[])
Intercept:  -0.415840901103074


                                                                                

In [20]:
(train, test) = chiResult.randomSplit([0.8, 0.2])

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'selectedFeatures')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)


23/04/03 15:21:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:21:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:22:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (10,[],[])
Intercept:  -0.4112559689445507


                                                                                

## Univariate Selector

Deceased

In [21]:
selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="deceased", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)

uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

                                                                                

UnivariateFeatureSelector output with top 10 features selected using f_classif


[Stage 366:>                                                        (0 + 1) / 1]

+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



                                                                                

In [22]:
(train, test) = uniResult.randomSplit([0.8, 0.2])
chiResult = selector.fit(df).transform(df)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'features')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)

23/04/03 15:24:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:24:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:25:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (169,[],[])
Intercept:  -0.4129213254738458




In [23]:
(train, test) = uniResult.randomSplit([0.8, 0.2])
chiResult = selector.fit(df).transform(df)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased', featuresCol = 'selectedFeatures')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)

23/04/03 15:26:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:26:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:26:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.60
Coefficients:  (10,[],[])
Intercept:  -0.41626153246248243


                                                                                

Covid-19

In [24]:
selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="covid-19", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)

uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

                                                                                

UnivariateFeatureSelector output with top 10 features selected using f_classif


[Stage 518:>                                                        (0 + 1) / 1]

+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



                                                                                

In [25]:
(train, test) = uniResult.randomSplit([0.8, 0.2])
chiResult = selector.fit(df).transform(df)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'covid-19', featuresCol = 'features')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)

23/04/03 15:27:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:27:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:28:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.61
Coefficients:  (169,[],[])
Intercept:  -0.44369677052768647


                                                                                

In [26]:
(train, test) = uniResult.randomSplit([0.8, 0.2])
chiResult = selector.fit(df).transform(df)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'covid-19', featuresCol = 'selectedFeatures')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)

23/04/03 15:28:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:28:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:29:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.61
Coefficients:  (10,[],[])
Intercept:  -0.4463054165039705


                                                                                

Deceased & Covid-19

In [27]:
selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="deceased & covid-19", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)

uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

                                                                                

UnivariateFeatureSelector output with top 10 features selected using f_classif


[Stage 670:>                                                        (0 + 1) / 1]

+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[141],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
|(169,[107],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[30],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
| (169,[31],[2.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



                                                                                

In [28]:
(train, test) = uniResult.randomSplit([0.8, 0.2])
chiResult = selector.fit(df).transform(df)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased & covid-19', featuresCol = 'features')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)

23/04/03 15:30:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:30:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:30:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Accuracy: 0.90
Coefficients:  (169,[],[])
Intercept:  -2.1436520563432877


                                                                                

In [None]:
(train, test) = uniResult.randomSplit([0.8, 0.2])
chiResult = selector.fit(df).transform(df)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'deceased & covid-19', featuresCol = 'selectedFeatures')
lrModel = lr.fit(train)
result = lrModel.transform(test)

print('Accuracy: {:0.2f}'.format(lrModel.evaluate(test).accuracy))
print("Coefficients: ", lrModel.coefficients)
print("Intercept: ", lrModel.intercept)

23/04/03 15:31:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/03 15:31:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
[Stage 791:>                                                      (0 + 12) / 12]