In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('diabetes').getOrCreate()

In [3]:
df = spark.read.csv('diabetes.csv', inferSchema = True, header = True)

In [4]:
df.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [5]:
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
df.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [8]:
assembler = VectorAssembler(inputCols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction','Age',],
                            outputCol = 'features')

In [9]:
output = assembler.transform(df)

In [10]:
final_data = output.select('features', 'outcome')

In [12]:
final_data.show(5)

+--------------------+-------+
|            features|outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
|[8.0,183.0,64.0,0...|      1|
|[1.0,89.0,66.0,23...|      0|
|[0.0,137.0,40.0,3...|      1|
+--------------------+-------+
only showing top 5 rows



### Model Building

In [13]:
from pyspark.ml.classification import LogisticRegression

In [14]:
db_log = LogisticRegression(featuresCol = 'features', labelCol = 'outcome')

In [15]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [16]:
model_fit = db_log.fit(train_data)

In [17]:
log_summary = model_fit.summary

In [18]:
log_summary.predictions.show(10)

+--------------------+-------+--------------------+--------------------+----------+
|            features|outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[2.0...|    0.0|[4.93139364815669...|[0.99283526478231...|       0.0|
|(8,[0,1,6,7],[2.0...|    0.0|[4.4736958752674,...|[0.98872352310470...|       0.0|
|(8,[0,1,6,7],[6.0...|    0.0|[2.92420493993301...|[0.94903008484216...|       0.0|
|(8,[1,5,6,7],[73....|    0.0|[3.17813276079927...|[0.96000303081928...|       0.0|
|(8,[1,5,6,7],[99....|    0.0|[2.02207462032959...|[0.88309535894919...|       0.0|
|(8,[1,5,6,7],[117...|    0.0|[-0.1294694532908...|[0.46767777374580...|       1.0|
|(8,[1,5,6,7],[119...|    1.0|[0.70799767837921...|[0.66995856906289...|       0.0|
|(8,[1,5,6,7],[141...|    1.0|[-1.0554447694163...|[0.25818092709584...|       1.0|
|(8,[1,5,6,7],[145...|    1.0|[-1.6389990733694...|[0.16260130485125...|    

### Model Evaluation

In [19]:
pred_and_labels = model_fit.evaluate(test_data)

In [21]:
pred_and_labels.predictions.show(5)

+--------------------+-------+--------------------+--------------------+----------+
|            features|outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[3.0...|      0|[4.55607953777347...|[0.98960601409427...|       0.0|
|(8,[0,1,6,7],[7.0...|      0|[3.08234771505387...|[0.95615870448872...|       0.0|
|(8,[0,1,6,7],[10....|      1|[2.30027095682968...|[0.90889947702644...|       0.0|
|(8,[1,5,6,7],[131...|      1|[-0.7648783878142...|[0.31758805373043...|       1.0|
|(8,[1,5,6,7],[138...|      1|[-0.7913085571019...|[0.31188776591560...|       1.0|
+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [23]:
model_eval = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'outcome')

In [24]:
final_eval = model_eval.evaluate(pred_and_labels.predictions)

In [25]:
final_eval

0.7188597565956056

In [29]:
print(f"DataFrame shape: {((df.count(), len(df.columns)))}")

DataFrame shape: (768, 9)
