In [None]:
# Install PySpark

!pip install pyspark



In [None]:

# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, when, stddev, min, max

# Start a Spark session for this analysis, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("DiabetesAnalysis")
    .getOrCreate()
)

# Read the CSV into a DataFrame: the first row supplies the column names and
# the column types are inferred from the data
file_path = "/content/diabetes.csv"  # Change this path if needed
diabetes_df = spark.read.csv(file_path, inferSchema=True, header=True)

# Print the inferred schema, then preview the first five rows
diabetes_df.printSchema()
diabetes_df.show(n=5)

# Make the DataFrame queryable from Spark SQL under the name "diabetes"
diabetes_df.createOrReplaceTempView("diabetes")


# ------------------------------
# 🔹 2. Distribution of Diabetic vs Non-Diabetic Patients
# ------------------------------
# Count the rows for each Outcome value. The explicit ORDER BY keeps the
# printed rows in the same order on every run; without it Spark gives no
# guarantee about the row order of a GROUP BY result.
spark.sql("""
    SELECT Outcome, COUNT(*) AS Count
    FROM diabetes
    GROUP BY Outcome
    ORDER BY Outcome
""").show()

# ------------------------------
# 🔹 3. Average Glucose & BMI by Diabetes Outcome
# ------------------------------
# Mean Glucose and BMI per Outcome value.
# A stored 0 in Glucose or BMI is handled as a missing reading by the cleaning
# step of this script, so NULLIF turns those zeros into NULL here as well:
# AVG ignores NULLs, which stops the placeholders from pulling the means down.
# ORDER BY keeps the printed row order stable between runs.
spark.sql("""
    SELECT Outcome,
           AVG(NULLIF(Glucose, 0)) AS AvgGlucose,
           AVG(NULLIF(BMI, 0)) AS AvgBMI
    FROM diabetes
    GROUP BY Outcome
    ORDER BY Outcome
""").show()

# ------------------------------
# 🔹 4. High-Risk Patients: Glucose > 140 & BMI > 30
# ------------------------------
# Count the rows that exceed both thresholds at the same time.
high_risk_df = spark.sql("""
    SELECT COUNT(*) AS HighRiskPatients
    FROM diabetes
    WHERE Glucose > 140 AND BMI > 30
""")
high_risk_df.show()

# ------------------------------
# 🔹 5. Correlation Analysis (Glucose & BMI)
# ------------------------------
# Pearson correlation between Glucose and BMI over the whole table.
# A stored 0 in either column is handled as a missing reading by the cleaning
# step of this script; NULLIF maps those zeros to NULL so that rows without a
# real measurement in both columns do not distort the coefficient.
spark.sql("""
    SELECT
        CORR(NULLIF(Glucose, 0), NULLIF(BMI, 0)) AS Correlation_Glucose_BMI
    FROM diabetes
""").show()

# ------------------------------
# 🔹 6. Age Group Analysis (Diabetic vs Non-Diabetic)
# ------------------------------
# Bucket patients into three age bands and count each Outcome per band.
# Sorting on the label text alone would order the bands alphabetically
# ('30-50', 'Above 50', 'Below 30'), so the ORDER BY maps every label to its
# position on the age axis and sorts on that instead.
spark.sql("""
    SELECT
        CASE
            WHEN Age < 30 THEN 'Below 30'
            WHEN Age BETWEEN 30 AND 50 THEN '30-50'
            ELSE 'Above 50'
        END AS AgeGroup,
        Outcome,
        COUNT(*) AS Count
    FROM diabetes
    GROUP BY AgeGroup, Outcome
    ORDER BY
        CASE AgeGroup
            WHEN 'Below 30' THEN 1
            WHEN '30-50' THEN 2
            ELSE 3
        END,
        Outcome
""").show()

# ------------------------------
# 🔹 7. Standard Deviation of Features
# ------------------------------
# Sample standard deviation of Glucose, BMI and Age over the whole table.
# A stored 0 in Glucose or BMI is handled as a missing reading by the cleaning
# step of this script; NULLIF maps those zeros to NULL so that STDDEV skips
# them instead of counting them as extreme low values. Age is used as stored.
spark.sql("""
    SELECT
        STDDEV(NULLIF(Glucose, 0)) AS StdGlucose,
        STDDEV(NULLIF(BMI, 0)) AS StdBMI,
        STDDEV(Age) AS StdAge
    FROM diabetes
""").show()

# ------------------------------
# 🔹 8. Handling Missing or Zero Values (Replacing with Mean)
# ------------------------------
# A reading of 0 in these columns is treated as "not recorded": it is first
# turned into NULL and then replaced by the mean of the remaining values.
zero_as_missing = ["Glucose", "BMI"]

for column_name in zero_as_missing:
    # Cast to double before imputing. na.fill() casts the replacement value to
    # the column's own type, so filling an integer column (Glucose is inferred
    # as integer) would silently truncate the mean to a whole number.
    # NOTE: Glucose is therefore stored as double in the cleaned data.
    as_double = col(column_name).cast("double")
    diabetes_df = diabetes_df.withColumn(
        column_name, when(as_double == 0, None).otherwise(as_double)
    )

# avg() skips NULLs, so each mean covers real measurements only. Both means
# are computed in one query instead of one Spark job per column.
mean_row = diabetes_df.select(
    *[avg(column_name).alias(column_name) for column_name in zero_as_missing]
).collect()[0]

# A mean is None when a column has no usable value at all; such a column is
# left untouched because na.fill() rejects None as a replacement.
fill_values = {
    column_name: mean_row[column_name]
    for column_name in zero_as_missing
    if mean_row[column_name] is not None
}
if fill_values:
    diabetes_df = diabetes_df.na.fill(fill_values)

# Expose the imputed data to Spark SQL under its own name
diabetes_df.createOrReplaceTempView("diabetes_cleaned")

# ------------------------------
# 🔹 9. Min & Max Values for Glucose and BMI
# ------------------------------
# Smallest and largest Glucose and BMI values in the cleaned view.
value_range_df = spark.sql("""
    SELECT
        MIN(Glucose) AS MinGlucose,
        MAX(Glucose) AS MaxGlucose,
        MIN(BMI) AS MinBMI,
        MAX(BMI) AS MaxBMI
    FROM diabetes_cleaned
""")
value_range_df.show()

# ------------------------------
# 🔹 10. Save Processed Data
# ------------------------------
# Write the processed DataFrame as Parquet, replacing any earlier output at
# the same location.
output_path = "/content/cleaned_diabetes_data"
parquet_writer = diabetes_df.write.mode("overwrite")
parquet_writer.parquet(output_path)

# Shut the Spark session down now that all work is done
spark.stop()


root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          