In [1]:
##importing the package
import pyspark

In [2]:
##importing the sparksession package
from pyspark.sql import SparkSession

In [3]:

## Build (or reuse, if one already exists) a SparkSession whose application name is 'Hands-on'.
spark = (
    SparkSession.builder
    .appName('Hands-on')
    .getOrCreate()
)

In [4]:
##evaluate the session object as the cell's last expression so the notebook displays it
spark

In [5]:
## Load the CSV with the reader's default options: no header handling and no type inference.
df_pyspark = spark.read.csv(path='Heart_Disease_Prediction.csv')

In [6]:
##inspect the DataFrame: with the default options the columns are auto-named _c0.._c13 and all typed as string
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string]

In [7]:
##print the leading rows; note that the file's header line (Age, Sex, ...) shows up as the first data row
df_pyspark.show()

+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+--------------------+--------+-------------+
|_c0|_c1|            _c2|_c3|        _c4|         _c5|        _c6|   _c7|            _c8|          _c9|       _c10|                _c11|    _c12|         _c13|
+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+--------------------+--------+-------------+
|Age|Sex|Chest pain type| BP|Cholesterol|FBS over 120|EKG results|Max HR|Exercise angina|ST depression|Slope of ST|Number of vessels...|Thallium|Heart Disease|
| 70|  1|              4|130|        322|           0|          2|   109|              0|          2.4|          2|                   3|       3|     Presence|
| 67|  0|              3|115|        564|           0|          2|   160|              0|          1.6|          2|                   0|       7|      Absence|
| 57|  1|              2|124|        261

In [8]:
## The previous read generated its own column names (_c0, _c1, ...).
## Enabling the header option makes Spark use the first row of the file as the column names.
df_pyspark = spark.read.csv('Heart_Disease_Prediction.csv', header=True)

In [9]:
##show the rows again: the names from the file's first row are now the column names instead of _c0, _c1, ...
df_pyspark.show()

+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
|Age|Sex|Chest pain type| BP|Cholesterol|FBS over 120|EKG results|Max HR|Exercise angina|ST depression|Slope of ST|Number of vessels fluro|Thallium|Heart Disease|
+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
| 70|  1|              4|130|        322|           0|          2|   109|              0|          2.4|          2|                      3|       3|     Presence|
| 67|  0|              3|115|        564|           0|          2|   160|              0|          1.6|          2|                      0|       7|      Absence|
| 57|  1|              2|124|        261|           0|          0|   141|              0|          0.3|          1|                      0|       7|     Presence|
| 64|  1|             

In [10]:
df_pyspark.printSchema()

##As you can see, every column still has string as its datatype

root
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Chest pain type: string (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholesterol: string (nullable = true)
 |-- FBS over 120: string (nullable = true)
 |-- EKG results: string (nullable = true)
 |-- Max HR: string (nullable = true)
 |-- Exercise angina: string (nullable = true)
 |-- ST depression: string (nullable = true)
 |-- Slope of ST: string (nullable = true)
 |-- Number of vessels fluro: string (nullable = true)
 |-- Thallium: string (nullable = true)
 |-- Heart Disease: string (nullable = true)



In [11]:
## To fix that, also pass the inferSchema parameter as True so Spark derives each column's type from the data.
df_pyspark = spark.read.csv('Heart_Disease_Prediction.csv', header=True, inferSchema=True)

In [12]:
##check the schema again: the numeric columns are now integer/double and only Heart Disease stays string
df_pyspark.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: integer (nullable = true)
 |-- Chest pain type: integer (nullable = true)
 |-- BP: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- FBS over 120: integer (nullable = true)
 |-- EKG results: integer (nullable = true)
 |-- Max HR: integer (nullable = true)
 |-- Exercise angina: integer (nullable = true)
 |-- ST depression: double (nullable = true)
 |-- Slope of ST: integer (nullable = true)
 |-- Number of vessels fluro: integer (nullable = true)
 |-- Thallium: integer (nullable = true)
 |-- Heart Disease: string (nullable = true)



In [13]:
## Project a subset of columns by name and print the result.
df_pyspark.select('Chest pain type', 'Heart Disease').show()

+---------------+-------------+
|Chest pain type|Heart Disease|
+---------------+-------------+
|              4|     Presence|
|              3|      Absence|
|              2|     Presence|
|              4|      Absence|
|              2|      Absence|
|              4|      Absence|
|              3|     Presence|
|              4|     Presence|
|              4|     Presence|
|              4|     Presence|
|              4|      Absence|
|              4|      Absence|
|              3|      Absence|
|              1|     Presence|
|              4|      Absence|
|              4|      Absence|
|              4|     Presence|
|              4|     Presence|
|              1|      Absence|
|              1|      Absence|
+---------------+-------------+
only showing top 20 rows



In [14]:
## withColumn returns a new DataFrame with an extra column computed from an existing one.
## The result is only displayed here, not assigned, so df_pyspark itself is left unchanged.
(
    df_pyspark
    .withColumn('Test Columns', df_pyspark['Chest pain type'] * 2)
    .show()
)

+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+------------+
|Age|Sex|Chest pain type| BP|Cholesterol|FBS over 120|EKG results|Max HR|Exercise angina|ST depression|Slope of ST|Number of vessels fluro|Thallium|Heart Disease|Test Columns|
+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+------------+
| 70|  1|              4|130|        322|           0|          2|   109|              0|          2.4|          2|                      3|       3|     Presence|           8|
| 67|  0|              3|115|        564|           0|          2|   160|              0|          1.6|          2|                      0|       7|      Absence|           6|
| 57|  1|              2|124|        261|           0|          0|   141|              0|          0.3|          1|     

In [15]:
##use drop function to drop columns
##NOTE: the earlier withColumn result was never assigned, so df_pyspark has no 'Test Columns'
##column, and drop() on a column that is not in the schema silently does nothing.
##Create the column here and drop it by its exact name so the call actually removes something.
df_pyspark.withColumn('Test Columns', df_pyspark['Chest pain type'] * 2).drop('Test Columns')

DataFrame[Age: int, Sex: int, Chest pain type: int, BP: int, Cholesterol: int, FBS over 120: int, EKG results: int, Max HR: int, Exercise angina: int, ST depression: double, Slope of ST: int, Number of vessels fluro: int, Thallium: int, Heart Disease: string]

In [16]:
## Rename a column; the renamed DataFrame is displayed but not assigned back to df_pyspark.
df_pyspark.withColumnRenamed(existing='BP', new='Blood Pressure')

DataFrame[Age: int, Sex: int, Chest pain type: int, Blood Pressure: int, Cholesterol: int, FBS over 120: int, EKG results: int, Max HR: int, Exercise angina: int, ST depression: double, Slope of ST: int, Number of vessels fluro: int, Thallium: int, Heart Disease: string]

In [None]:
##filter function keeps only the records that satisfy the given condition.
##Combine conditions with & (AND) or | (OR), and use ~ as the NOT operator; wrap each comparison in parentheses.
##The column names are spelled exactly as in the schema so the lookup does not rely on
##case-insensitive name resolution (it also works when spark.sql.caseSensitive is enabled).
df_pyspark.filter((df_pyspark['Chest pain type'] < 2) & (df_pyspark['Heart Disease'] == 'Presence')).show()

+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
|Age|Sex|Chest pain type| BP|Cholesterol|FBS over 120|EKG results|Max HR|Exercise angina|ST depression|Slope of ST|Number of vessels fluro|Thallium|Heart Disease|
+---+---+---------------+---+-----------+------------+-----------+------+---------------+-------------+-----------+-----------------------+--------+-------------+
| 61|  1|              1|134|        234|           0|          0|   145|              0|          2.6|          2|                      2|       3|     Presence|
| 59|  1|              1|160|        273|           0|          2|   125|              0|          0.0|          1|                      0|       3|     Presence|
| 38|  1|              1|120|        231|           0|          0|   182|              1|          3.8|          2|                      0|       7|     Presence|
| 65|  1|             