Install PySpark using pip.

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=1cab592ed10ca41c2283cce55d7643f43e2f74aee143ee40f3300a236e80bca5
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession #PySpark SQL library
from pyspark.ml.feature import StringIndexer, VectorAssembler #PySpark MLlib library
from pyspark.ml.classification import RandomForestClassifier #
from pyspark.ml import Pipeline # imports the Pipeline class from the PySpark MLlib library.



In [None]:
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("RandomForestExample")
    .getOrCreate()
)


In [None]:

# Load the diabetes dataset: the first CSV row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    path='diabetes_Rf.csv',
    inferSchema=True,
    header=True,
)
# Preview the first rows of the loaded frame.
df.show()


+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
| false|80.0|           0|            1|          false|25.19|        6.6|                140|       0|
| false|54.0|           0|            0|          false|27.32|        6.6|                 80|       0|
| false|28.0|           0|            0|          false|27.32|        5.7|                158|       0|
| false|36.0|           0|            0|          false|23.45|        5.0|                155|       0|
| false|76.0|           1|            1|          false|20.14|        4.8|                155|       0|
| false|20.0|           0|            0|          false|27.32|        6.6|                 85|       0|
| false|44.0|           0|            0|          false|19.31|  

In [None]:
# Preview the label column on its own (show() prints the first 20 rows).
df.select("diabetes").show()

+--------+
|diabetes|
+--------+
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       1|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
|       0|
+--------+
only showing top 20 rows



In [None]:
# Total number of rows in the dataset.
df.count()

50889

In [None]:
# Number of columns in the dataset (the feature columns plus the diabetes label).
len(df.columns)

9

In [None]:
# Inspect the column types that Spark inferred when reading the CSV.
df.printSchema()

root
 |-- gender: boolean (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: boolean (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)



In [None]:
# Summary statistics (count, mean, stddev, min, ...) per column. The boolean
# gender and smoking_history columns do not appear in this summary.
df.describe().show()

+-------+------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+
|summary|               age|       hypertension|      heart_disease|               bmi|       HbA1c_level|blood_glucose_level|          diabetes|
+-------+------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+
|  count|             50889|              50889|              50889|             50889|             50889|              50889|             50889|
|   mean| 41.85940380042824|0.07516359134587042|0.03910471811196919|27.343307001512525| 5.526921338600055|  137.8510287095443|0.0865216451492464|
| stddev|22.483728941749536| 0.2636577173189958|0.19384601499697432| 6.660743065125988|1.0738875687972904| 40.993845815707076| 0.281135560182871|
|    min|              0.08|                  0|                  0|             10.01|               3.5|                 8

In [None]:
# First five records, returned to the driver as a list of Row objects.
df.head(5)

[Row(gender=False, age=80.0, hypertension=0, heart_disease=1, smoking_history=False, bmi=25.19, HbA1c_level=6.6, blood_glucose_level=140, diabetes=0),
 Row(gender=False, age=54.0, hypertension=0, heart_disease=0, smoking_history=False, bmi=27.32, HbA1c_level=6.6, blood_glucose_level=80, diabetes=0),
 Row(gender=False, age=28.0, hypertension=0, heart_disease=0, smoking_history=False, bmi=27.32, HbA1c_level=5.7, blood_glucose_level=158, diabetes=0),
 Row(gender=False, age=36.0, hypertension=0, heart_disease=0, smoking_history=False, bmi=23.45, HbA1c_level=5.0, blood_glucose_level=155, diabetes=0),
 Row(gender=False, age=76.0, hypertension=1, heart_disease=1, smoking_history=False, bmi=20.14, HbA1c_level=4.8, blood_glucose_level=155, diabetes=0)]

In [None]:
# Class balance of the target: number of rows for each diabetes value.
df.groupBy("diabetes").count().show()

+--------+-----+
|diabetes|count|
+--------+-----+
|       1| 4403|
|       0|46486|
+--------+-----+



In [None]:
# Row counts per distinct age value. Age is a double, so fractional ages
# (e.g. 0.16, 1.08) form their own groups.
df.groupBy("age").count().show()

+----+-----+
| age|count|
+----+-----+
|67.0|  533|
| 8.0|  431|
|70.0|  453|
|69.0|  487|
| 7.0|  467|
|0.16|   32|
|1.16|   54|
|1.08|   48|
|1.72|   50|
|49.0|  790|
| 1.4|   51|
|0.72|   56|
|29.0|  694|
|75.0|  378|
|64.0|  542|
|0.24|   44|
|47.0|  803|
|42.0|  724|
|44.0|  737|
|62.0|  726|
+----+-----+
only showing top 20 rows



In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names; all but the diabetes label are used as model inputs below.
df.columns

['gender',
 'age',
 'hypertension',
 'heart_disease',
 'smoking_history',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level',
 'diabetes']

In [None]:
# Combine the eight predictor columns into a single vector column named
# "features"; the diabetes label is deliberately left out of the inputs.
assembler = VectorAssembler(
    inputCols=[
        'gender',
        'age',
        'hypertension',
        'heart_disease',
        'smoking_history',
        'bmi',
        'HbA1c_level',
        'blood_glucose_level',
    ],
    outputCol="features",
)

In [None]:
# Display the assembler stage (its repr is the auto-generated stage uid).
assembler

VectorAssembler_2113a7d9e9df

In [None]:
# Apply the assembler: returns df with an extra "features" vector column.
output = assembler.transform(df)

In [None]:
# Inspect the assembled frame; long feature vectors are truncated in this view.
output.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+--------------------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|            features|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+--------------------+
| false|80.0|           0|            1|          false|25.19|        6.6|                140|       0|[0.0,80.0,0.0,1.0...|
| false|54.0|           0|            0|          false|27.32|        6.6|                 80|       0|(8,[1,5,6,7],[54....|
| false|28.0|           0|            0|          false|27.32|        5.7|                158|       0|(8,[1,5,6,7],[28....|
| false|36.0|           0|            0|          false|23.45|        5.0|                155|       0|(8,[1,5,6,7],[36....|
| false|76.0|           1|            1|          false|20.14|        4.8|                155|       0|[0.0,76.0,1.0,1.0...|


In [None]:
# Show only the feature vector and the label, without truncation. Rows with
# several zero entries print in sparse form: (size, [indices], [values]).
output.select(["features", "diabetes"]).show(truncate=False)

+--------------------------------------+--------+
|features                              |diabetes|
+--------------------------------------+--------+
|[0.0,80.0,0.0,1.0,0.0,25.19,6.6,140.0]|0       |
|(8,[1,5,6,7],[54.0,27.32,6.6,80.0])   |0       |
|(8,[1,5,6,7],[28.0,27.32,5.7,158.0])  |0       |
|(8,[1,5,6,7],[36.0,23.45,5.0,155.0])  |0       |
|[0.0,76.0,1.0,1.0,0.0,20.14,4.8,155.0]|0       |
|(8,[1,5,6,7],[20.0,27.32,6.6,85.0])   |0       |
|(8,[1,5,6,7],[44.0,19.31,6.5,140.0])  |1       |
|(8,[1,5,6,7],[79.0,23.86,5.7,85.0])   |0       |
|(8,[1,5,6,7],[42.0,33.64,4.8,145.0])  |0       |
|(8,[1,5,6,7],[32.0,27.32,5.0,100.0])  |0       |
|(8,[1,5,6,7],[53.0,27.32,6.1,85.0])   |0       |
|(8,[1,5,6,7],[54.0,54.7,6.0,100.0])   |0       |
|(8,[1,5,6,7],[78.0,36.05,5.0,130.0])  |0       |
|(8,[1,5,6,7],[67.0,25.69,5.8,200.0])  |0       |
|(8,[1,5,6,7],[76.0,27.32,5.0,160.0])  |0       |
|(8,[1,5,6,7],[78.0,27.32,6.6,126.0])  |0       |
|(8,[1,5,6,7],[15.0,30.36,6.1,200.0])  |0       |


In [None]:
# Keep only what the model needs: the feature vector and the diabetes label.
model_df = output.select(["features", "diabetes"])

In [None]:
# Sanity-check the modelling frame before splitting it.
model_df.show()

+--------------------+--------+
|            features|diabetes|
+--------------------+--------+
|[0.0,80.0,0.0,1.0...|       0|
|(8,[1,5,6,7],[54....|       0|
|(8,[1,5,6,7],[28....|       0|
|(8,[1,5,6,7],[36....|       0|
|[0.0,76.0,1.0,1.0...|       0|
|(8,[1,5,6,7],[20....|       0|
|(8,[1,5,6,7],[44....|       1|
|(8,[1,5,6,7],[79....|       0|
|(8,[1,5,6,7],[42....|       0|
|(8,[1,5,6,7],[32....|       0|
|(8,[1,5,6,7],[53....|       0|
|(8,[1,5,6,7],[54....|       0|
|(8,[1,5,6,7],[78....|       0|
|(8,[1,5,6,7],[67....|       0|
|(8,[1,5,6,7],[76....|       0|
|(8,[1,5,6,7],[78....|       0|
|(8,[1,5,6,7],[15....|       0|
|(8,[1,5,6,7],[42....|       0|
|(8,[1,5,6,7],[42....|       0|
|(8,[1,5,6,7],[37....|       0|
+--------------------+--------+
only showing top 20 rows



In [None]:
# Split roughly 70% / 30% into training and test sets. A fixed seed keeps the
# split the same on every run, so the row counts and evaluation metric below
# can be reproduced after a kernel restart.
training_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Report the size of each split: training rows first, then test rows.
print(training_df.count(), test_df.count(), sep="\n")

35540
15349


In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Fit a 500-tree random forest on the training split. Note that rf_classifier
# holds the fitted model returned by .fit(), not the unfitted estimator.
# featuresCol is spelled out to match the assembler's outputCol, and a fixed
# seed makes the forest's random sampling repeatable across runs.
rf_classifier = RandomForestClassifier(
    featuresCol="features",
    labelCol="diabetes",
    numTrees=500,
    seed=42,
).fit(training_df)

In [None]:
# Score the held-out test split; the result gains rawPrediction, probability
# and prediction columns next to the original features and label.
rf_predictions = rf_classifier.transform(test_df)

In [None]:
# Inspect the predictions alongside the true diabetes label.
rf_predictions.show()

+--------------------+--------+--------------------+--------------------+----------+
|            features|diabetes|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[0.0...|       0|[490.115565999900...|[0.98023113199980...|       0.0|
|(8,[1,5,6,7],[0.0...|       0|[489.591247574864...|[0.97918249514972...|       0.0|
|(8,[1,5,6,7],[0.0...|       0|[491.869296639797...|[0.98373859327959...|       0.0|
|(8,[1,5,6,7],[0.0...|       0|[491.213180011897...|[0.98242636002379...|       0.0|
|(8,[1,5,6,7],[0.1...|       0|[490.115565999900...|[0.98023113199980...|       0.0|
|(8,[1,5,6,7],[0.1...|       0|[490.115565999900...|[0.98023113199980...|       0.0|
|(8,[1,5,6,7],[0.1...|       0|[489.592859076552...|[0.97918571815310...|       0.0|
|(8,[1,5,6,7],[0.1...|       0|[490.115565999900...|[0.98023113199980...|       0.0|
|(8,[1,5,6,7],[0.1...|       0|[489.591247574864...|[0.9791824951

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate the test-set predictions with area under the ROC curve. The metric
# and raw-prediction column are the evaluator's defaults, written out here so
# the reader can see exactly what is being measured.
rf_auc = BinaryClassificationEvaluator(
    labelCol="diabetes",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
).evaluate(rf_predictions)

In [None]:
# Display the evaluation score computed on the test split.
rf_auc

0.9518381566754585

In [None]:
# Print the test-set score with a human-readable label.
print("Area Under ROC:", rf_auc)

Area Under ROC: 0.9528426166693317
