<a href="https://colab.research.google.com/github/JeevithaR3/Big_Data_Analytics_Disease_Prediction/blob/main/BDA_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pandas as pd

In [None]:
# Start a Spark session named "DiabetesProbability", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("DiabetesProbability")
    .getOrCreate()
)

In [None]:
# Columns used as model features.
input_cols = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# Load the CSV into a Spark DataFrame (first row is the header, column types are
# inferred), then replace missing values with 0.
data = (
    spark.read
    .csv("/content/diabetes.csv", header=True, inferSchema=True)
    .na.fill(0)
)

# Pack the feature columns into a single vector column named "features_raw".
assembler = VectorAssembler(inputCols=input_cols, outputCol="features_raw")
assembled = assembler.transform(data)

In [None]:
# Split BEFORE fitting the scaler: the scaler's statistics must come from the
# training rows only, otherwise information about the held-out test rows leaks
# into preprocessing and the evaluation is no longer a clean hold-out estimate.
train_assembled, test_assembled = assembled.randomSplit([0.7, 0.3], seed=42)

# Rescale the raw feature vector so large-valued features do not overshadow
# small-valued ones. The fitted model is reused later to scale new user input.
scaler = StandardScaler(inputCol="features_raw", outputCol="features")
scaler_model = scaler.fit(train_assembled)

# Apply the training-derived scaling to both splits.
train_data = scaler_model.transform(train_assembled)
test_data = scaler_model.transform(test_assembled)

# Full dataset scaled with the training-split statistics; kept so any code that
# still reads `scaled_data` continues to work.
scaled_data = scaler_model.transform(assembled)

In [None]:
# Both classifiers read the scaled "features" vector and learn the "Outcome" label.
lr = LogisticRegression(featuresCol="features", labelCol="Outcome")
lr_model = lr.fit(train_data)

# A random forest is a stochastic learner; fixing the seed makes the fitted
# model (and therefore the reported metric and the model comparison) repeatable
# across runs, in line with the seeded train/test split.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="Outcome",
    numTrees=50,
    seed=42,
)
rf_model = rf.fit(train_data)

In [None]:
# Score both models on the held-out split using area under the ROC curve.
# NOTE: despite their names, `lr_acc` and `rf_acc` hold AUC values, not accuracy.
evaluator = BinaryClassificationEvaluator(labelCol="Outcome", metricName="areaUnderROC")

# Logistic regression: predict on the test rows, then score.
lr_pred = lr_model.transform(test_data)
lr_acc = evaluator.evaluate(lr_pred)

# Random forest: same procedure.
rf_pred = rf_model.transform(test_data)
rf_acc = evaluator.evaluate(rf_pred)

print(f"Logistic Regression AUC: {lr_acc:.3f}")
print(f"Random Forest AUC: {rf_acc:.3f}")

Logistic Regression AUC: 0.851
Random Forest AUC: 0.840


In [None]:
# Column names used to label a single user's measurements when building the
# prediction DataFrame. Values supplied for a user must follow this order.
feature_order = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

In [None]:
# A single row of measurements, listed in the same order as `feature_order`.
user_features = [
    (2, 120, 70, 22, 80, 25.0, 0.4, 33),  # Replace with desired values
]

# Build a one-row DataFrame whose columns are named after `feature_order`.
user_df = spark.createDataFrame(data=user_features, schema=feature_order)

# Run the row through the same assembler and the already-fitted scaler that the
# training data went through, so the models see it in the same representation.
user_assembled = assembler.transform(user_df)
user_scaled = scaler_model.transform(user_assembled)

In [None]:
# Use whichever model scored the higher AUC on the test split; ties go to the
# random forest because the comparison is strict.
if lr_acc > rf_acc:
    chosen_model = lr_model
else:
    chosen_model = rf_model

user_pred = chosen_model.transform(user_scaled)

# Pull the single prediction row back to the driver and read its probability
# vector; index 1 is the probability assigned to label 1
# (presumably Outcome == 1 means diabetic - confirm against the dataset).
probability_row = user_pred.select("probability").collect()[0]
probability_vector = probability_row[0]
user_prob = probability_vector[1]

print(f"Predicted Probability of Diabetes: {user_prob:.2%}")

Predicted Probability of Diabetes: 14.47%
