In [1]:
# ignoring some warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# importing the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

In [3]:
# Read the patient records into the notebook's working frame `df`,
# then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Patients_Dataset.csv')
df.head(n=5)

Unnamed: 0,id,name,gender,age,weight_kg,height_cm,education,marital,income,insurance,...,insulin,iron,u_acid,s_cotinine,cpk,ldh,fvc,fev1,fev1_fvc_ratio,memory
0,62632,Samual Sipes,male,29.0,168.0,186.0,preparatory,never married,5.0,no,...,106.48,63.0,7.2,159.0,251.0,170.0,5426.0,4434.0,0.817177,
1,63150,Beau Dach,male,19.0,90.0,200.0,,,15.0,yes,...,,83.0,7.3,14.7,99.0,102.0,7175.0,5786.0,0.806411,
2,71366,Youlanda Metz,female,20.0,75.0,174.0,college or equivalent,never married,3.0,yes,...,,118.0,4.1,0.011,211.0,142.0,5120.0,4570.0,0.892578,
3,63734,Deandre Walsh,male,43.0,102.0,181.0,college or equivalent,divorced,2.0,no,...,,55.0,4.7,0.766,312.0,109.0,,,,
4,65619,Rhett Douglas,male,20.0,91.0,170.0,secondary or equivalent,never married,8.0,no,...,,62.0,5.7,0.376,163.0,123.0,3870.0,3297.0,0.851938,


In [4]:
# Drop the `name` column (presumably because it identifies the patient and is
# not an analysis feature -- confirm).
# `columns=` states the intent directly; the previous `axis=True` only worked
# because True == 1. `errors='ignore'` keeps the cell re-runnable once the
# column is already gone.
df = df.drop(columns=['name'], errors='ignore')

# Data wrangling

## 1- Replacing null cells in categorical features with 'unknown' (and labelling the coded values):

In [5]:
# NOTE: `df[col][mask] = value` is chained assignment. pandas does not
# guarantee that it writes back into `df` (under Copy-on-Write it never does),
# so every column below is rebuilt and assigned back in one step instead.


def _label_codes(codes, labels):
    """Translate a coded column into text labels.

    Missing values become 'unknown', codes listed in `labels` become their
    label, and any other value is passed through unchanged.
    """
    return codes.map(
        lambda code: 'unknown' if pd.isna(code) else labels.get(code, code)
    )


# Categorical features that only need their missing cells labelled.
UNKNOWN_FILL_COLUMNS = [
    'gender', 'education', 'marital', 'insurance', 'gen_health', 'smoker',
    'asthma', 'chf', 'cad', 'mi', 'cva', 'copd', 'cancer',
    'hypertension', 'diabetes',
]
for col in UNKNOWN_FILL_COLUMNS:
    df[col] = df[col].fillna('unknown')

# income: numeric bracket code -> readable bracket label.
# Codes without an entry here are left untouched.
INCOME_LABELS = {
    1: '$0 to $4,999',
    2: '$5k to $9,999',
    3: '$10k to $14,999',
    4: '$15k to $19,999',
    5: '$20k to $24,999',
    6: '$25k to $34,999',
    7: '$35k to $44,999',
    8: '$45k to $54,999',
    9: '$55k to $64,999',
    10: '$65k to $74,999',
    14: '$75k to $99,999',
    15: '$100k and Over',
}
df['income'] = _label_codes(df['income'], INCOME_LABELS)

# days_active: 0.0 .. 7.0 -> '0' .. '7' so the column can be treated as labels.
DAYS_ACTIVE_LABELS = {day: str(day) for day in range(8)}
df['days_active'] = _label_codes(df['days_active'], DAYS_ACTIVE_LABELS)


## 2- Converting the categorical features to ordered `CategoricalDtype`:

In [6]:
def _ordered_dtype(levels):
    """Return an ordered CategoricalDtype whose category order follows `levels`."""
    return pd.api.types.CategoricalDtype(categories=levels, ordered=True)


# ---- category orderings (one list per feature) ----
Gend_levels = ['female', 'male', 'unknown']
Edu_levels = [
    'postgraduate education',
    'college or equivalent',
    'secondary or equivalent',
    'preparatory',
    'less than preparatory',
    'unknown',
]
Mari_levels = ['married', 'widowed', 'divorced', 'separated', 'never married', 'unknown']
Incm_levels = [
    'unknown',
    '$0 to $4,999',
    '$5k to $9,999',
    '$10k to $14,999',
    '$15k to $19,999',
    '$20k to $24,999',
    '$25k to $34,999',
    '$35k to $44,999',
    '$45k to $54,999',
    '$55k to $64,999',
    '$65k to $74,999',
    '$75k to $99,999',
    '$100k and Over',
]
Insur_levels = ['yes', 'no', 'unknown']
Genh_levels = ['excellent', 'very good', 'good', 'fair', 'poor', 'unknown']
Smok_levels = ['yes', 'no', 'unknown']
Dact_levels = ['0', '1', '2', '3', '4', '5', '6', '7', 'unknown']
Asthma_levels = ['yes', 'no', 'unknown']
Chf_levels = ['yes', 'no', 'unknown']
Cad_levels = ['yes', 'no', 'unknown']
Mi_levels = ['yes', 'no', 'unknown']
Cva_levels = ['yes', 'no', 'unknown']
Copd_levels = ['yes', 'no', 'unknown']
Cancer_levels = ['yes', 'no', 'unknown']
Hypertension_levels = ['yes', 'no', 'hypotension', 'unknown']
Diabetes_levels = ['yes', 'borderline', 'no', 'unknown']

# ---- ordered dtypes built from the lists above ----
gend_levels = _ordered_dtype(Gend_levels)
edu_levels = _ordered_dtype(Edu_levels)
mari_levels = _ordered_dtype(Mari_levels)
incm_levels = _ordered_dtype(Incm_levels)
insur_levels = _ordered_dtype(Insur_levels)
genh_levels = _ordered_dtype(Genh_levels)
smok_levels = _ordered_dtype(Smok_levels)
dact_levels = _ordered_dtype(Dact_levels)
asthma_levels = _ordered_dtype(Asthma_levels)
chf_levels = _ordered_dtype(Chf_levels)
cad_levels = _ordered_dtype(Cad_levels)
mi_levels = _ordered_dtype(Mi_levels)
cva_levels = _ordered_dtype(Cva_levels)
copd_levels = _ordered_dtype(Copd_levels)
cancer_levels = _ordered_dtype(Cancer_levels)
hypertension_levels = _ordered_dtype(Hypertension_levels)
diabetes_levels = _ordered_dtype(Diabetes_levels)

# ---- cast each column to its dtype ----
# Values that are not among a dtype's categories become NaN on conversion.
_column_dtypes = {
    'gender': gend_levels,
    'education': edu_levels,
    'marital': mari_levels,
    'income': incm_levels,
    'insurance': insur_levels,
    'gen_health': genh_levels,
    'smoker': smok_levels,
    'days_active': dact_levels,
    'asthma': asthma_levels,
    'chf': chf_levels,
    'cad': cad_levels,
    'mi': mi_levels,
    'cva': cva_levels,
    'copd': copd_levels,
    'cancer': cancer_levels,
    'hypertension': hypertension_levels,
    'diabetes': diabetes_levels,
}
for _column, _dtype in _column_dtypes.items():
    df[_column] = df[_column].astype(_dtype)

## 3- Nulling all out-of-range (implausible) entries in the numerical features of the data

### 1- personal data

In [7]:
# Accepted (lowest, highest) value for each personal measurement; both ends
# are inclusive. Anything outside the range is replaced with NaN.
PERSONAL_LIMITS = {
    'bmi': (12, 110),
    'waist_cm': (30, 180),
    'drinks_day': (0, 100),
    'weight_kg': (0, 250),
    'height_cm': (40, 210),
}

# Each column is rebuilt and assigned back as a whole: the chained form
# `df[col][mask] = np.nan` is not guaranteed to modify `df`.
for col, (low, high) in PERSONAL_LIMITS.items():
    df[col] = df[col].where(df[col].between(low, high))

### 2- heart rate and blood pressures

In [8]:
# Accepted (lowest, highest) value for heart rate and blood pressures; both
# ends are inclusive. Anything outside the range is replaced with NaN.
VITAL_LIMITS = {
    'pulse': (30, 140),
    'sys_bp': (65, 250),
    'dia_bp': (40, 150),
}
# Whole-column assignment instead of chained `df[col][mask] = ...`, which is
# not guaranteed to modify `df`.
for col, (low, high) in VITAL_LIMITS.items():
    df[col] = df[col].where(df[col].between(low, high))

# hypertension and pressure values
# The hypertension label is derived from the (now range-checked) readings
# whenever systolic and diastolic pressure agree on a class.
high_bp = (df['sys_bp'] >= 120) & (df['dia_bp'] >= 80)
normal_bp = (
    (df['sys_bp'] >= 90) & (df['sys_bp'] < 120)
    & (df['dia_bp'] >= 60) & (df['dia_bp'] < 80)
)
low_bp = (df['sys_bp'] < 90) & (df['dia_bp'] < 60)

df.loc[high_bp, 'hypertension'] = 'yes'
df.loc[normal_bp, 'hypertension'] = 'no'
df.loc[low_bp, 'hypertension'] = 'hypotension'

# Rows whose two readings fall into the same class.
consistent_bp = high_bp | normal_bp | low_bp
right_hyper_ids = df.loc[consistent_bp, 'id']

# Readings that fit none of the three classes are nulled. The row mask is used
# directly rather than looking rows up again by `id`, so the result does not
# depend on `id` being unique.
for col in ('sys_bp', 'dia_bp'):
    df[col] = df[col].where(consistent_bp)

### 3- Medical Tests

#### a- CBC Test

In [9]:
# Accepted (lowest, highest) value for each CBC result; both ends are
# inclusive. Anything outside the range is replaced with NaN.
CBC_LIMITS = {
    'wbc': (1, 100),        # white blood count
    'hgb': (5, 25),         # hemoglobin
    'hct': (15, 80),        # hematocrit
    'platelets': (5, 750),
}

# Whole-column assignment instead of chained `df[col][mask] = np.nan`, which
# is not guaranteed to modify `df`.
for col, (low, high) in CBC_LIMITS.items():
    df[col] = df[col].where(df[col].between(low, high))

#### b- Liver Function Test (LFT)

In [10]:
# Accepted (lowest, highest) value for each liver function result; both ends
# are inclusive. Anything outside the range is replaced with NaN.
LFT_LIMITS = {
    'alt': (1, 2000),
    'ast': (1, 2000),
    'alk_phos': (5, 3500),
}

# Whole-column assignment instead of chained `df[col][mask] = np.nan`, which
# is not guaranteed to modify `df`.
for col, (low, high) in LFT_LIMITS.items():
    df[col] = df[col].where(df[col].between(low, high))

#### c- Kidney Function Test (KFT)

In [11]:
# Accepted (lowest, highest) value for each kidney function result; both ends
# are inclusive. Anything outside the range is replaced with NaN.
KFT_LIMITS = {
    'bun': (1, 110),
    'cr': (.1, 10),
}

# Whole-column assignment instead of chained `df[col][mask] = np.nan`, which
# is not guaranteed to modify `df`.
for col, (low, high) in KFT_LIMITS.items():
    df[col] = df[col].where(df[col].between(low, high))

#### d- Comprehensive metabolic panel (CMP)

In [12]:
# Accepted (lowest, highest) value for each metabolic panel result; both ends
# are inclusive. Anything outside the range is replaced with NaN.
CMP_LIMITS = {
    'sodium': (90, 160),
    'potassium': (1.5, 10),
    'bicarb': (10, 45),
    'ca': (5, 15),           # calcium
    'phos': (1, 10),         # phosphorus
    't_bilirubin': (0, 5),
    'alb': (1, 7),
    't_protein': (1, 15),
    'glob': (.1, 10),
    'glucose': (1, 800),
    'glucose.1': (1, 500),
}

# Whole-column assignment instead of chained `df[col][mask] = np.nan`, which
# is not guaranteed to modify `df`.
for col, (low, high) in CMP_LIMITS.items():
    df[col] = df[col].where(df[col].between(low, high))