In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, evaluation, Pipeline
from pyspark.sql import functions as fn, Row
import matplotlib.pyplot as plt
spark = SparkSession.builder.getOrCreate()
import pyspark.sql.functions as f
from pyspark.sql.functions import *
sc = spark.sparkContext

In [2]:
# Functionality for computing features
from pyspark.ml import feature
# Functionality for regression
from pyspark.ml import regression
# Funcionality for classification
from pyspark.ml import classification
# Object for creating sequences of transformations
from pyspark.ml import Pipeline

In [3]:
diabetic_df = spark.read.csv('Data/diabetic_data.csv', header = True, inferSchema = True)

In [4]:
ID_df = spark.read.csv('Data/IDs_mapping.csv', header = True, inferSchema = True)

In [5]:
diabetic_df.join(ID_df, on = 'admission_type_id').toPandas().head(5)

Unnamed: 0,admission_type_id,encounter_id,patient_nbr,race,gender,age,weight,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,description
0,6,2278392,8222157,Caucasian,Female,[0-10),?,25,1,1,...,No,No,No,No,No,No,No,No,NO,Transfer from another health care facility
1,6,2278392,8222157,Caucasian,Female,[0-10),?,25,1,1,...,No,No,No,No,No,No,No,No,NO,Discharged/transferred to home with home healt...
2,6,2278392,8222157,Caucasian,Female,[0-10),?,25,1,1,...,No,No,No,No,No,No,No,No,NO,
3,1,149190,55629189,Caucasian,Female,[10-20),?,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,Physician Referral
4,1,149190,55629189,Caucasian,Female,[10-20),?,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,Discharged to home


In [6]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql import functions as F

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
ID_df.createOrReplaceTempView("ID_Df")
diabetic_df.createOrReplaceTempView("diabetes_df")
# Read Table from hive 
ID = sqlContext.sql(' select * from ID_df')
ID.show(5)

diabetes = sqlContext.sql(' select * from diabetes_df')
diabetes.show(5)

+-----------------+-------------+
|admission_type_id|  description|
+-----------------+-------------+
|                1|    Emergency|
|                2|       Urgent|
|                3|     Elective|
|                4|      Newborn|
|                5|Not Available|
+-----------------+-------------+
only showing top 5 rows

+------------+-----------+---------------+------+-------+------+-----------------+------------------------+-------------------+----------------+----------+--------------------+------------------+--------------+---------------+-----------------+----------------+----------------+------+------+------+----------------+-------------+---------+---------+-----------+-----------+--------------+-----------+-------------+---------+---------+-----------+------------+-------------+--------+--------+------------+----------+-------+-----------+-------+-------------------+-------------------+------------------------+-----------------------+----------------------+------+------

In [7]:
df = spark.read.csv('Data/data.csv', header = True, inferSchema = True)

In [8]:
print("Shape of the Data:")
print((df.count(), len(df.columns)))

Shape of the Data:
(101766, 53)


In [9]:
df.show(1)

+------------+-----------+---------+------+------+------+-----------------+-------------------+------------------------+-------------------------+-------------------+---------------------+----------------+----------+--------------------+------------------+--------------+---------------+-----------------+----------------+----------------+------+------+------+----------------+-------------+---------+---------+-----------+-----------+--------------+-----------+-------------+---------+---------+-----------+------------+-------------+--------+--------+------------+----------+-------+-----------+-------+-------------------+-------------------+------------------------+-----------------------+----------------------+------+-----------+----------+
|encounter_id|patient_nbr|     race|gender|   age|weight|admission_type_id|admission_type_name|discharge_disposition_id|dischage_disposition_name|admission_source_id|admission_source_name|time_in_hospital|payer_code|   medical_specialty|num_lab_procedu

In [None]:
# Check for NAN
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas()

In [None]:
# Check for NULL
df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).toPandas()

In [None]:
# Check for ?


In [None]:
# Checking for missing, max, std for numerical variables
c = ['num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient',\
     'diag_1', 'diag_2', 'diag_3']
df.select(['num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient',\
     'diag_1', 'diag_2', 'diag_3']).describe().toPandas()

In [None]:
# Frequency Table for Categorical Variables
freq_table = df.select(col("readmitted").cast("string")).groupBy("readmitted").count().toPandas()
freq_table

In [None]:
# Importing plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Show histogram of the 'admission_type_name' column
d = df.groupBy("admission_type_name").count().toPandas()


height = d['count']
bars = d['admission_type_name']
y_pos = np.arange(len(bars))
 
# Create bars
plt.bar(y_pos, height)

# Create names on the x-axis
plt.xticks(y_pos, bars, rotation = 90)

# Adding Title
plt.title('Admission Type')
 
# Show graphic
plt.show()



In [None]:
# Show histogram of the 'dischage_disposition_name' column
d = df.groupBy("dischage_disposition_name").count().toPandas()


height = d['count']
bars = d['dischage_disposition_name']
y_pos = np.arange(len(bars))
 
# Create bars
plt.bar(y_pos, height)

# Create names on the x-axis
plt.xticks(y_pos, bars, rotation = 90)

# Adding Title
plt.title('Discharge Name')
 
# Show graphic
plt.show()




In [None]:
# Show histogram of the 'admission_source_name' column
d = df.groupBy("admission_source_name").count().toPandas()


height = d['count']
bars = d['admission_source_name']
y_pos = np.arange(len(bars))
 
# Create bars
plt.bar(y_pos, height)

# Create names on the x-axis
plt.xticks(y_pos, bars, rotation = 90)

# Adding Title
plt.title('Admission Source')
 
# Show graphic
plt.show()



In [None]:
# Number of Females and Male Patients
d = df.groupBy('race').count().toPandas()

height = d['count'][:-1]
values = d['race'][:-1]

# Create bars
plt.bar(values, height)

# Create names on the x-axis
plt.xticks(rotation = 90)

# Adding Title
plt.title('Race of Admitted People')
 
# Show graphic
plt.show()
