<a href="https://colab.research.google.com/github/Git-krishn/DiabetesPredicitonMLIB-/blob/main/Diabetes_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK 1 : Install Dependencies & Run Spark Session

In [None]:
#install pyspark
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=7b337958467816987a5a562b1c87890f7c21b5b5d4866fa67755a34c12c76537
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
#create a sparksession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark").getOrCreate()

# TASK 2: Clone & Explore dataset

In [None]:
#clone the diabetes dataset from the github repository
! git clone https://github.com/education454/diabetes_dataset

Cloning into 'diabetes_dataset'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (6/6), 13.02 KiB | 13.02 MiB/s, done.


In [None]:
#check if the dataset exists
! ls diabetes_dataset

diabetes.csv  new_test.csv


In [29]:
#create spark dataframe
df = spark.read.csv("/content/diabetes_dataset",header = True, inferSchema= True)

In [None]:
#display the dataframe
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          2|    138|           62|           35|      0|33.6|                   0.127| 47|      1|
|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|
|          0|    145|            0|            0|      0|44.2|                    0.63| 31|      1|
|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|
|          0|    173|           78|           32|    265|46.5|                   1.159| 58|      0|
|          4|     99|           72|           17|      0|25.6|                   0.294| 28|      0|


In [30]:
#print the schema
df.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: integer (nullable = true)



In [46]:
#casting the column values as integers

df = df\
  .select(
    col('Pregnancies').cast('integer'),
    col('Glucose').cast('integer'),
    col('BloodPressure').cast('integer'),
    col('SkinThickness').cast('integer'),
    col('Insulin').cast('integer'),
    col('BMI').cast('double'),
    col('DiabetesPedigreeFunction').cast('double'),
    col('Age').cast('integer'),
    col('Outcome').cast('integer'),

  )

In [48]:
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [None]:
#count the total no. of diabetic and non-diabetic class
print((df.count(),len(df.columns)))
df.groupBy('Outcome').count().show()

(2004, 9)
+-------+-----+
|Outcome|count|
+-------+-----+
|      1|  684|
|      0| 1316|
|   NULL|    4|
+-------+-----+



In [49]:
#get the summary statistics
df.describe().show()


+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|summary|       Pregnancies|           Glucose|    BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|  count|              2004|              2004|             2004|              2004|              2004|              2004|                    2004|              2004|              2000|
|   mean|3.6981037924151696|121.19910179640719|69.16067864271457|  20.9750499001996| 80.59530938123753|31.786427145708583|      0.0718562874251497| 33.09231536926148|             0.342|
| stddev|3.3051158392920073| 32.08984484192234|19.17567654301146|16.11

# TASK 3: Data Cleaning & Preparation

In [None]:
#check for null values
for col in df.columns:
  print(col+":",df[df[col].isNull()].count())

Pregnancies: 0
Glucose: 0
BloodPressure: 0
SkinThickness: 0
Insulin: 0
BMI: 0
DiabetesPedigreeFunction: 0
Age: 0
Outcome: 4


In [51]:
#look for the unnecessary values present
def count_zeros():
  columns_list =['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
  for i in columns_list:
    print(i+":",df[df[i]==0].count())

In [50]:
count_zeros()

Glucose: 13
BloodPressure: 90
SkinThickness: 573
Insulin: 956
BMI: 28


In [52]:
#calculate and replace the unnecessary values by the mean value
from pyspark.sql.functions import *
for i in df.columns[1:6]:
  data = df.agg({i:'mean'}).first()[0]
  print("mean value for {} is {}".format(i,int(data)))
  df =  df.withColumn(i,when(df[i]==0, int(data)).otherwise(df[i]))


mean value for Glucose is 121
mean value for BloodPressure is 69
mean value for SkinThickness is 20
mean value for Insulin is 80
mean value for BMI is 31


In [53]:
#display the dataframe
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          2|    138|           62|           35|     80|33.0|                     0.0| 47|      1|
|          0|     84|           82|           31|    125|38.0|                     0.0| 23|      0|
|          0|    145|           69|           20|     80|44.0|                     0.0| 31|      1|
|          0|    135|           68|           42|    250|42.0|                     0.0| 24|      1|
|          1|    139|           62|           41|    480|40.0|                     0.0| 21|      0|
|          0|    173|           78|           32|    265|46.0|                     1.0| 58|      0|
|          4|     99|           72|           17|     80|25.0|                     0.0| 28|      0|


In [54]:
from pyspark.ml.stat import Correlation


# TASK 4: Correlation Analysis & Feature Selection

In [55]:
#find the correlation among the set of input & output variables
for i in df.columns:
  print("Correlation to outcome for {} is {}".format(i,df.stat.corr('Outcome',i)))

Correlation to outcome for Pregnancies is 0.22533636714375882
Correlation to outcome for Glucose is 0.48647337338253194
Correlation to outcome for BloodPressure is 0.17097867568222685
Correlation to outcome for SkinThickness is 0.16340516177484238
Correlation to outcome for Insulin is 0.1681217628900413
Correlation to outcome for BMI is 0.275793932194371
Correlation to outcome for DiabetesPedigreeFunction is 0.12266060171003573
Correlation to outcome for Age is 0.23615522082850302
Correlation to outcome for Outcome is 1.0


In [56]:
#feature selection
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols= ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'],outputCol ='features')
output_data = assembler.transform(df)


In [57]:
#print the schema
output_data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)
 |-- features: vector (nullable = true)



In [58]:
#display dataframe
output_data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          2|    138|           62|           35|     80|33.0|                     0.0| 47|      1|[2.0,138.0,62.0,3...|
|          0|     84|           82|           31|    125|38.0|                     0.0| 23|      0|[0.0,84.0,82.0,31...|
|          0|    145|           69|           20|     80|44.0|                     0.0| 31|      1|[0.0,145.0,69.0,2...|
|          0|    135|           68|           42|    250|42.0|                     0.0| 24|      1|[0.0,135.0,68.0,4...|
|          1|    139|           62|           41|    480|40.0|                     0.0| 21|      0|[1.0,139.0,62.0,4...|
|          0|    173|           

# TASK 5: Split Dataset & Build the Model

In [60]:
#create final data
from pyspark.ml.classification import LogisticRegression
final_data = output_data.select('features','Outcome')


In [66]:
final_data = final_data.dropna(subset=['Outcome'])


In [68]:
#print schema of final data
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Outcome: integer (nullable = true)



In [69]:
#split the dataset ; build the model
train, test = final_data.randomSplit([0.7,0.3])
models = LogisticRegression(labelCol= 'Outcome')
model = models.fit(train)

In [71]:
#summary of the model
summary  = model.summary
summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|            Outcome|         prediction|
+-------+-------------------+-------------------+
|  count|               1382|               1382|
|   mean|  0.341534008683068| 0.2727930535455861|
| stddev|0.47439579919634867|0.44555656301121926|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



# TASK 6: Evaluate and Save the Model

In [73]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = model.evaluate(test)
predictions.predictions.show(20)

+--------------------+-------+--------------------+--------------------+----------+
|            features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[0.0,73.0,69.0,20...|      0|[4.04764980549776...|[0.98283636610618...|       0.0|
|[0.0,73.0,69.0,20...|      0|[4.04764980549776...|[0.98283636610618...|       0.0|
|[0.0,84.0,64.0,22...|      0|[2.55069412226439...|[0.92762013255215...|       0.0|
|[0.0,84.0,64.0,22...|      0|[2.55069412226439...|[0.92762013255215...|       0.0|
|[0.0,84.0,82.0,31...|      0|[2.45423936287827...|[0.92087091501365...|       0.0|
|[0.0,84.0,82.0,31...|      0|[2.45423936287827...|[0.92087091501365...|       0.0|
|[0.0,91.0,68.0,32...|      0|[2.01000181448480...|[0.88184321125248...|       0.0|
|[0.0,93.0,60.0,20...|      0|[2.14517728524377...|[0.89521725169756...|       0.0|
|[0.0,93.0,60.0,25...|      0|[2.76524552600138...|[0.94076860735234...|    

In [74]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol= 'Outcome')
evaluator.evaluate(model.transform(test))

0.8392160052049452

In [75]:
# save model
model.save("model")

In [77]:
# load saved model back to the environment
from pyspark.ml.classification import LogisticRegressionModel
model = LogisticRegressionModel.load('model')

# TASK 7: Prediction on New Data with the saved model

In [78]:
#create a new spark dataframe
test_df =  spark.read.csv("/content/diabetes_dataset/new_test.csv",header = True, inferSchema=True)

In [79]:
#print the schema
test_df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)



In [80]:
#create an additional feature merged column
test_data = assembler.transform(test_df)

In [81]:
#print the schema
test_data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- features: vector (nullable = true)



In [82]:
#use model to make predictions
results = model.transform(test_data)
results.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [83]:
#display the predictions
results.select('features','prediction').show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[1.0,190.0,78.0,3...|       1.0|
|[0.0,80.0,84.0,36...|       0.0|
|[2.0,138.0,82.0,4...|       1.0|
|[1.0,110.0,63.0,4...|       1.0|
+--------------------+----------+

