<a href="https://colab.research.google.com/github/Harikrishnan-M-Ariki/Data-Science/blob/main/diabetes_prediction_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [4]:
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz

In [None]:
!tar -xvf spark-3.1.1-bin-hadoop3.2.tgz

In [6]:
!pip install -q findspark

In [7]:
import os
os.environ["JAVA_HOME"]='/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['SPARK_HOME']='/content/spark-3.1.1-bin-hadoop3.2'

In [8]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [9]:
spark=SparkSession.builder.master('local[*]').getOrCreate()

In [10]:
!git clone https://github.com/education454/diabetes_dataset

fatal: destination path 'diabetes_dataset' already exists and is not an empty directory.


In [11]:
data=spark.read.csv('/content/diabetes_dataset/diabetes.csv',header=True,inferSchema=True)

In [12]:
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          2|    138|           62|           35|      0|33.6|                   0.127| 47|      1|
|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|
|          0|    145|            0|            0|      0|44.2|                    0.63| 31|      1|
|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|
|          0|    173|           78|           32|    265|46.5|                   1.159| 58|      0|
|          4|     99|           72|           17|      0|25.6|                   0.294| 28|      0|


In [13]:
data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [14]:
print(data.count())

2000


In [15]:
print(len(data.columns))

9


In [16]:
data.groupby('Outcome').count().show()

+-------+-----+
|Outcome|count|
+-------+-----+
|      1|  684|
|      0| 1316|
+-------+-----+



In [17]:
data.describe().show()

+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+
|summary|      Pregnancies|           Glucose|     BloodPressure|    SkinThickness|          Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+
|  count|             2000|              2000|              2000|             2000|             2000|              2000|                    2000|              2000|              2000|
|   mean|           3.7035|          121.1825|           69.1455|           20.935|           80.254|32.192999999999984|     0.47092999999999974|           33.0905|             0.342|
| stddev|3.306063032730656|32.068635649902916|19.188314815604098|16.103242909926

Cleaning Data

In [18]:
#Checking null values:
for col in data.columns:
  print(col+' : '+str(data[data[col].isNull()].count()))

Pregnancies : 0
Glucose : 0
BloodPressure : 0
SkinThickness : 0
Insulin : 0
BMI : 0
DiabetesPedigreeFunction : 0
Age : 0
Outcome : 0


In [19]:
#Printing zeros count
for column in data.columns:
  print(column+' : '+str(data[data[column]==0].count()))

Pregnancies : 301
Glucose : 13
BloodPressure : 90
SkinThickness : 573
Insulin : 956
BMI : 28
DiabetesPedigreeFunction : 0
Age : 0
Outcome : 1316


In [20]:
from pyspark.sql.functions import *
for column in data.columns:
  if column!='Outcome':
    data=data.withColumn(column,when(data[column]==0,int(data.agg({column:'mean'}).first()[0])).otherwise(data[column]))

In [21]:
for column in data.columns:
  print(column+' : '+str(data[data[column]==0].count()))

Pregnancies : 0
Glucose : 0
BloodPressure : 0
SkinThickness : 0
Insulin : 0
BMI : 0
DiabetesPedigreeFunction : 0
Age : 0
Outcome : 1316


In [22]:
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          2|    138|           62|           35|     80|33.6|                   0.127| 47|      1|
|          3|     84|           82|           31|    125|38.2|                   0.233| 23|      0|
|          3|    145|           69|           20|     80|44.2|                    0.63| 31|      1|
|          3|    135|           68|           42|    250|42.3|                   0.365| 24|      1|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|
|          3|    173|           78|           32|    265|46.5|                   1.159| 58|      0|
|          4|     99|           72|           17|     80|25.6|                   0.294| 28|      0|


In [23]:
for column in data.columns:
  print("Correlation to outcome for {} is {}".format(column,data.stat.corr('Outcome',column)))

Correlation to outcome for Pregnancies is 0.24790480177711924
Correlation to outcome for Glucose is 0.48796646527321064
Correlation to outcome for BloodPressure is 0.17171333286446713
Correlation to outcome for SkinThickness is 0.1659010662889893
Correlation to outcome for Insulin is 0.1711763270226193
Correlation to outcome for BMI is 0.2827927569760082
Correlation to outcome for DiabetesPedigreeFunction is 0.1554590791569403
Correlation to outcome for Age is 0.23650924717620253
Correlation to outcome for Outcome is 1.0


In [24]:
data.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [26]:
from pyspark.ml.feature import VectorAssembler

assembler=VectorAssembler(inputCols=['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age'],outputCol='features')

In [27]:
output_data=assembler.transform(data)

In [28]:
output_data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
from pyspark.ml.classification import LogisticRegression

final_data=output_data.select('features','Outcome')

In [31]:
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Outcome: integer (nullable = true)



In [32]:
train,test =final_data.randomSplit([0.7,0.3])
models=LogisticRegression(labelCol='Outcome')
model=models.fit(train)

In [34]:
summary=model.summary

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [37]:
predictions=model.evaluate(test)

In [38]:
predictions.predictions.show()

+--------------------+-------+--------------------+--------------------+----------+
|            features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[1.0,71.0,62.0,20...|      0|[3.98067794555235...|[0.98166931269542...|       0.0|
|[1.0,71.0,62.0,20...|      0|[3.98067794555235...|[0.98166931269542...|       0.0|
|[1.0,71.0,78.0,50...|      0|[3.12168541185001...|[0.95777843697371...|       0.0|
|[1.0,73.0,50.0,10...|      0|[3.93969144639124...|[0.98091702778460...|       0.0|
|[1.0,79.0,75.0,30...|      0|[3.02696368767885...|[0.95377749843346...|       0.0|
|[1.0,81.0,72.0,18...|      0|[3.36104883554068...|[0.96646478674666...|       0.0|
|[1.0,81.0,72.0,18...|      0|[3.36104883554068...|[0.96646478674666...|       0.0|
|[1.0,81.0,74.0,41...|      0|[1.36710287347378...|[0.79691167590513...|       0.0|
|[1.0,82.0,64.0,13...|      0|[3.68092387662958...|[0.97541973226758...|    

In [39]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='Outcome')

In [40]:
evaluator.evaluate(model.transform(test))

0.8609303352688253

In [41]:
model.save('model')

In [42]:
from pyspark.ml.classification import LogisticRegressionModel

In [43]:
model=LogisticRegressionModel.load('model')