In [65]:
# import necessary libraries

# loading and reading data
import numpy as np
import pandas as pd

# libraries for visualizations
import seaborn as sns
import matplotlib.pyplot as plt

In [66]:
# Read the maternal health risk CSV into the working DataFrame `df`.
# The file name is a relative path, so it is resolved against the current working directory.

df = pd.read_csv(filepath_or_buffer='Maternal Health Risk Data Set.csv')


In [67]:
# Preview the first 5 rows of the data set (head() defaults to n=5)
df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


### Data Features explanation
|Feature	|Description	                                                                    |Unit      |
|-----------|-----------------------------------------------------------------------------------|----------|
|Age        |Age of the pregnant woman, in years                                                |years     |
|SystolicBP	|Upper value of Blood Pressure                                                      |mmHg      |
|DiastolicBP|Lower value of Blood Pressure	                                                    |mmHg      |
|BS         |Blood glucose level, expressed as a molar concentration                            |mmol/L    |
|BodyTemp   |Body temperature                                                                   |Fahrenheit|
|HeartRate	|A normal resting heart rate	                                                    |bpm       |
|RiskLevel  |Predicted risk intensity level during pregnancy, based on the preceding attributes |          |


## 1. Data Overview

- Data Structure
- Data Quality check
- Data Descriptive analysis

### 1.1 Data Structure

| Task           | Objective                                             |
|----------------|-------------------------------------------------------|
| Data Dimension | To know the number of columns and rows of the dataset |
| Data Types     | To verify that the data types align with the features |
| Data Columns   | To list and understand the columns                    |


### 1.1.1 Code for Data Structure

In [68]:
# Dimensionality of the data set, taken from the frame's shape attribute
dimensions = df.shape
print(" Data set Dimensions: {}".format(dimensions))

print('---------------------------------------')
# dtype of every column
column_types = df.dtypes
print(" Data types:\n{}".format(column_types))

print('----------------------------------------')
# Column labels of the frame
column_labels = df.columns
print(" Data columns: {}".format(column_labels))


 Data set Dimensions: (1014, 7)
---------------------------------------
 Data types:
Age              int64
SystolicBP       int64
DiastolicBP      int64
BS             float64
BodyTemp       float64
HeartRate        int64
RiskLevel       object
dtype: object
----------------------------------------
 Data columns: Index(['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate',
       'RiskLevel'],
      dtype='object')


### 1.2 Data Quality Check

| Task        |Objective                                              |
|-------------|-------------------------------------------------------|
|missing value| identify missing data and understand its extent       |
|Data Range   | verify numerical data is within expected bounds       |

### 1.2.1 Code for Data Quality Check

In [69]:
# Count null entries per column
null_counts = df.isnull().sum()
print("missing value: {}".format(null_counts))

print('----------------------------------------')
# Report the observed minimum and maximum of every numeric column so that
# values outside the expected range can be spotted by eye
num_columns = df.select_dtypes(include=['number']).columns
for col in num_columns:
    column_values = df[col]
    lowest = column_values.min()
    highest = column_values.max()
    print("{}: min = {}, max = {}".format(col, lowest, highest))




missing value: Age            0
SystolicBP     0
DiastolicBP    0
BS             0
BodyTemp       0
HeartRate      0
RiskLevel      0
dtype: int64
----------------------------------------
Age: min = 10, max = 70
SystolicBP: min = 70, max = 160
DiastolicBP: min = 49, max = 100
BS: min = 6.0, max = 19.0
BodyTemp: min = 98.0, max = 103.0
HeartRate: min = 7, max = 90


### 1.3 Data descriptive analysis

| Task                             | Objective                                                                          |
|----------------------------------|------------------------------------------------------------------------------------|
|statistics of numerical features  | quick overview of the distribution and central tendencies of the numerical features|
|statistics of categorical features| understand class distribution of categorical variable                              |

### 1.3.1 Code for Data Statistics distribution

In [70]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical features
print('The summary Statistics of numerical features:')
numeric_summary = df.describe()
numeric_summary

The summary Statistics of numerical features:


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
count,1014.0,1014.0,1014.0,1014.0,1014.0,1014.0
mean,29.871795,113.198225,76.460552,8.725986,98.665089,74.301775
std,13.474386,18.403913,13.885796,3.293532,1.371384,8.088702
min,10.0,70.0,49.0,6.0,98.0,7.0
25%,19.0,100.0,65.0,6.9,98.0,70.0
50%,26.0,120.0,80.0,7.5,98.0,76.0
75%,39.0,120.0,90.0,8.0,98.0,80.0
max,70.0,160.0,100.0,19.0,103.0,90.0


In [71]:
# Summary (count / unique / top / freq) of the object-typed columns,
# which here is the class distribution of the target variable
object_summary = df.describe(include='object')
print(object_summary)
# Frequency table for each categorical feature
print("\nCategorical Data Insights:")
categorical_frame = df.select_dtypes(include=['object'])
for col in categorical_frame.columns:
    print("{} frequency:".format(col))
    print(categorical_frame[col].value_counts())



       RiskLevel
count       1014
unique         3
top     low risk
freq         406

Categorical Data Insights:
RiskLevel frequency:
low risk     406
mid risk     336
high risk    272
Name: RiskLevel, dtype: int64


### 1.3.2 Further analysis on the numerical features

In [72]:
# Skewness and kurtosis of every numeric column, rounded to two decimals.
# pandas' kurt() reports excess kurtosis (a normal distribution scores 0).
print("\nSkewness and Kurtosis:")
numeric_frame = df.select_dtypes(include=['number'])
for col in numeric_frame.columns:
    column_values = numeric_frame[col]
    skewness_value = round(column_values.skew(), 2)
    kurtosis_value = round(column_values.kurt(), 2)
    print("{}: Skewness = {}, Kurtosis = {}".format(col, skewness_value, kurtosis_value))


Skewness and Kurtosis:
Age: Skewness = 0.78, Kurtosis = -0.39
SystolicBP: Skewness = -0.25, Kurtosis = -0.61
DiastolicBP: Skewness = -0.05, Kurtosis = -0.95
BS: Skewness = 1.87, Kurtosis = 2.3
BodyTemp: Skewness = 1.75, Kurtosis = 1.45
HeartRate: Skewness = -1.04, Kurtosis = 8.4


 - Explanation of Skewness
 1) Age, BS, BodyTemp: positively skewed, suggesting that a few high values pull the tail of the distribution to the right.
 2) HeartRate: Negatively skewed which indicates some low values extending the tail to the left.
 
 - Explanation of Kurtosis
 1) BS, BodyTemp, HeartRate: High positive kurtosis, which indicates heavy tails, i.e. the presence of extreme values or outliers
 2) Age, SystolicBP, DiastolicBP: Negative kurtosis, which suggests a flatter distribution with few outliers

## 2. Enhancement of Data
- Additional columns will be created using existing features:
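1) Mean Arterial Pressure (MAP): helps in monitoring hypertension and ensuring adequate organ perfusion during pregnancy. Computed as $\text{MAP} = (\text{SystolicBP} + 2 \times \text{DiastolicBP}) / 3$.
2) Pulse Pressure (PP): provides insight into arterial stiffness and cardiovascular risk. Computed as $\text{PP} = \text{SystolicBP} - \text{DiastolicBP}$ and stored in the column `Pulse`.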

- The Target variable(RiskLevel) will be changed from categorical data type to numerical data type

### 2.1 Code for Enhancement of Data

In [73]:
# Enhance the data set with two derived blood-pressure features and a
# numerically encoded target. The cell is safe to re-run: the derived columns
# are recomputed from the unchanged source columns, and the target is only
# encoded while it still holds the text labels.

# Mean Arterial Pressure (MAP): (SystolicBP + 2 * DiastolicBP) / 3, rounded to 2 decimals
df['MAP'] = round((df['SystolicBP'] + 2 * df['DiastolicBP']) / 3, 2)

# Pulse Pressure (PP): SystolicBP - DiastolicBP.
# Stored under the column name 'Pulse'; this is not the heart rate (see 'HeartRate').
df['Pulse'] = df['SystolicBP'] - df['DiastolicBP']

# Ordinal encoding of the target variable
Risk = {'low risk': 0, 'mid risk': 1, 'high risk': 2}

# Guard against re-execution: once RiskLevel is numeric, mapping it through
# `Risk` again would find no matching keys and silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['RiskLevel']):
    risk_encoded = df['RiskLevel'].map(Risk)
    # Series.map yields NaN for labels that are not keys of the dict; report
    # such labels instead of letting NaN leak into the target unnoticed.
    # Values that were already missing in the input are left as NaN.
    unmapped = df.loc[risk_encoded.isna() & df['RiskLevel'].notna(), 'RiskLevel'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped RiskLevel labels: {list(unmapped)}")
    df['RiskLevel'] = risk_encoded.astype(float)

# checking the enhanced data set
print('\nEnhanced Data set')
print(df.head(5))


Enhanced Data set
   Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate  RiskLevel     MAP  \
0   25         130           80  15.0      98.0         86        2.0   96.67   
1   35         140           90  13.0      98.0         70        2.0  106.67   
2   29          90           70   8.0     100.0         80        2.0   76.67   
3   30         140           85   7.0      98.0         70        2.0  103.33   
4   35         120           60   6.1      98.0         76        0.0   80.00   

   Pulse  
0     50  
1     50  
2     20  
3     55  
4     60  
