In [1]:
import os
spark_version = 'spark-3.5.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("sparkDates").getOrCreate()

from pyspark import SparkFiles
url= 'https://raw.githubusercontent.com/Nathanhans/project-4/main/heart_disease_dataset.csv'
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("heart_disease_dataset.csv"), sep=",", header=True, inferSchema=True)
df.show()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.157.173.52)] [Connecting to ppa.laun                                                                                                    Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.157.173.52)] [Connecting to ppa.laun                                                                                                    Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.157.173.52)] [Connecting to ppa.laun                                                                                                    Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:6 https://developer.download.nvidia.c

In [2]:
# Show the column names and the data types Spark inferred when reading the CSV
# (the frame was loaded with inferSchema=True).
df.printSchema()

root
 |-- State: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- GeneralHealth: string (nullable = true)
 |-- PhysicalHealthDays: integer (nullable = true)
 |-- MentalHealthDays: integer (nullable = true)
 |-- LastCheckupTime: string (nullable = true)
 |-- PhysicalActivities: string (nullable = true)
 |-- SleepHours: integer (nullable = true)
 |-- RemovedTeeth: string (nullable = true)
 |-- HadHeartAttack: string (nullable = true)
 |-- HadAngina: string (nullable = true)
 |-- HadStroke: string (nullable = true)
 |-- HadAsthma: string (nullable = true)
 |-- HadSkinCancer: string (nullable = true)
 |-- HadCOPD: string (nullable = true)
 |-- HadDepressiveDisorder: string (nullable = true)
 |-- HadKidneyDisease: string (nullable = true)
 |-- HadArthritis: string (nullable = true)
 |-- HadDiabetes: string (nullable = true)
 |-- DeafOrHardOfHearing: string (nullable = true)
 |-- BlindOrVisionDifficulty: string (nullable = true)
 |-- DifficultyConcentrating: string (nullable 

In [3]:
# Find average BMI by state
# "State" matches the schema's casing, so the query does not rely on Spark's
# default case-insensitive column resolution.
# cache(): the result is evaluated twice below (count() then show()); caching
# the small per-state frame avoids running the full aggregation a second time.
averages = df.groupBy("State").avg().cache()
# One row per state; passed to show() so every row is printed instead of the default 20.
state_count = averages.count()
averages.orderBy("avg(BMI)").select("State", "avg(BMI)").show(state_count)

+--------------------+------------------+
|               State|          avg(BMI)|
+--------------------+------------------+
|              Hawaii|   27.037469621158|
|District of Columbia| 27.11413913043479|
|            Colorado| 27.46224849777092|
|             Vermont| 27.63499071207437|
|       Massachusetts| 27.76315279048493|
|          California|27.785286499215108|
|          New Jersey|27.966909503403116|
|       New Hampshire|28.033181576144834|
|        Rhode Island|28.142843830334275|
|            New York|28.163643393477628|
|                Guam| 28.19103292446739|
|         Connecticut|28.219651692064165|
|              Nevada|28.224104013566976|
|                Utah|28.260245672808576|
|          New Mexico|28.283763477089014|
|          Washington|28.299338666667143|
|             Montana|28.299645872420278|
|             Arizona| 28.37178689124863|
|              Alaska|28.397978159126318|
|              Oregon|28.454247294194804|
|             Wyoming|28.476531120

In [4]:
# Find average BMI and hours of Sleep by HadHeartAttack value
# cache(): the result is evaluated twice below (count() then show()); caching
# the tiny grouped frame avoids running the full aggregation a second time.
HA_averages = df.groupBy("HadHeartAttack").avg().cache()
# One row per distinct HadHeartAttack value; passed to show() so every row is printed.
group_count = HA_averages.count()
HA_averages.orderBy("avg(BMI)").select("HadHeartAttack", "avg(BMI)", "avg(SleepHours)").show(group_count)

+--------------+-----------------+-----------------+
|HadHeartAttack|         avg(BMI)|  avg(SleepHours)|
+--------------+-----------------+-----------------+
|            No|28.62052148228253|7.020074208790689|
|           Yes|29.49243542984752|7.043096390026052|
+--------------+-----------------+-----------------+

