In [None]:
<center><h2>This notebook explains a basic data-cleaning process using the data collected in the heart rate sensor lab</h2></center>

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the lab results file; the path is relative to the notebook's
# working directory.
csv_path = "results.csv"
hrO2 = pd.read_csv(csv_path)
hrO2

# Summarise each column's dtype and non-null count.
hrO2.info()

## Step 1: Find the non-valid data in the data set
### Although `hrO2.info()` reports every column as "non-null", the HRvalue and SpO2value columns contain some non-numeric entries.
### The data type of both columns is "object"!

# How many valid readings are there in each column?
# The HRvalid and SpO2valid flag columns can simply be summed
# (assumes the flags are stored as 0/1 or booleans -- TODO confirm).

hr_valid_count = hrO2['HRvalid'].sum()
spo2_valid_count = hrO2['SpO2valid'].sum()

# Name the column in each message so the two counts can be told apart.
print('There are', hr_valid_count, 'valid HR data')
print('There are', spo2_valid_count, 'valid SpO2 data')

### Use the string method `str.isnumeric()` to find non-numeric values. However, it does not always work...

# If the HRvalid and SpO2valid flag columns were not available, we would
# have to inspect the data directly with pd.Series.str.isnumeric().
# Summing the resulting boolean Series counts the entries that are
# numeric strings.

hrO2['HRvalue'].str.isnumeric().sum()

# However, the same check does not give a usable count for SpO2value.
# str.isnumeric() is True only when every character of a string is a
# numeric character, so entries containing a decimal point or a sign are
# reported as non-numeric -- presumably what happens here; confirm
# against the raw column.
hrO2['SpO2value'].str.isnumeric().sum()

### Use the `pd.to_numeric()` function with the option `errors='coerce'` to cast the column to float64 and set the non-numeric values to NaN

# Convert HRvalue to a numeric dtype; with errors='coerce', entries that
# cannot be parsed become NaN instead of raising an error.
raw_hr = hrO2['HRvalue']
hr_1 = pd.to_numeric(raw_hr, errors='coerce')
hr_1

### Remove the rows whose value is NaN

# Drop the NaN entries. inplace=True modifies hr_1 itself rather than
# returning a new Series.
hr_1.dropna(inplace=True)
hr_1

# Bar chart of the remaining readings, positioned by their row index.
bar_positions = hr_1.index
plt.bar(bar_positions, hr_1)
plt.show()

### It looks good, but did you notice there is a missing data point at index 14? We need to reset the index to close the gap left by the dropped row

# Renumber the rows from 0. drop=True discards the old index instead of
# keeping it as a column; inplace=True modifies hr_1 itself.
hr_1.reset_index(drop=True, inplace=True)
hr_1
plt.bar(hr_1.index, hr_1)
plt.show()

# Apply the same cleaning steps to the SpO2 column: first coerce it to
# numeric so that unparseable entries become NaN ...
raw_spo2 = hrO2['SpO2value']
O2_1 = pd.to_numeric(raw_spo2, errors='coerce')
O2_1

# ... then drop the NaN rows and renumber the index in one chained step.
O2_1 = O2_1.dropna().reset_index(drop=True)
O2_1

plt.bar(O2_1.index, O2_1)
plt.show()

## Step 2: Find the outliers: data points that fall outside the normal range of the variable

# Let's examine the values of HRvalue (hr_1):
# summary statistics (count, mean, std, min, quartiles, max).
hr_1.describe()

# How often each distinct heart-rate value occurs, most frequent first.
hr_1.value_counts()

# Summary statistics for the cleaned SpO2 readings.
O2_1.describe()

### A powerful visual tool is the boxplot

![boxplot.png](attachment:boxplot.png)

# Box plot of the cleaned heart-rate readings drawn with matplotlib;
# the text 'Heart Rate' is placed under the x axis.
plt.boxplot(hr_1)
plt.xlabel('Heart Rate')
plt.show()

### We can use `seaborn.boxplot` as well

import seaborn as sns

# The same box plot drawn with seaborn; passing the data as y= puts the
# values on the vertical axis.
sns.boxplot(y=hr_1)  ## you can try x=hr_1 and see the difference
plt.show()

### A histogram is another good chart for visualizing the data distribution

# Histogram of the heart-rate readings using matplotlib's default binning.
plt.hist(hr_1)
plt.show()

### A heart rate over 150 while sitting is obviously unrealistic. Dropping from 100 to 50 in a short period of time does not make sense, either.
### There are different ways to deal with outliers:
### a. Remove them, or b. Replace them with some other reasonable value

# For discussion purposes, treat HR > 150 and HR < 50 as outliers.
# a. Remove the outliers: keep only the readings from 50 to 150, both
#    ends included (Series.between is inclusive on both sides by default).

in_range = hr_1.between(50, 150)
hr_2 = hr_1[in_range]
sns.boxplot(y=hr_2)
plt.show()

# b. Replace the outliers with, say, the median value of the dataset.
#    Note: this overwrites hr_1 in place, and the median is computed
#    before any value is replaced.

median = hr_1.median()
is_outlier = (hr_1 > 150) | (hr_1 < 50)
hr_1[is_outlier] = median
hr_1.describe()

sns.boxplot(y=hr_1)
# Render this figure now, as every other plotting cell in this notebook
# does, so a later plot is not drawn on top of it.
plt.show()

# Summary statistics for the cleaned SpO2 readings.
O2_1.describe()

# Box plot of the SpO2 readings, values on the vertical axis.
sns.boxplot(y=O2_1)
plt.show()

# Histogram of the SpO2 readings using matplotlib's default binning.
plt.hist(O2_1)
plt.show()

# Keep only the SpO2 readings strictly above 80 (same scale as the
# column, presumably percent -- confirm); readings of exactly 80 are
# dropped along with the lower ones.
above_80 = O2_1 > 80
O2_2 = O2_1[above_80]
sns.boxplot(y=O2_2)
plt.show()

# Histogram of the filtered readings, split into 10 bins.
plt.hist(O2_2, bins=10)
plt.show()

