# Dataset 2: Anemia Records

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

import sys
# Put the sibling ../src/ directory at the front of the module search path so
# the project-local helper_functions module can be imported below. The path is
# relative, so it is resolved against the kernel's current working directory.
sys.path.insert(0, '../src/')
import helper_functions as helpers

In [2]:
# Load the complete lab-record table; every subset below is sliced from it.
full = pd.read_csv('../data/Anemia.csv')

# NOTE(review): these subsets are taken before the MINIMUM_EXPECTED /
# MAXIMUM_EXPECTED columns are added to `full` in later cells, so the subsets
# do not carry those two columns.

# Split by diagnosis label.
anemia = full.loc[full['IDENTIFICATION'].eq('Anemia')]
no_anemia = full.loc[full['IDENTIFICATION'].eq('Not Anemia')]

# Split the anemia records by gender.
male_anemia = anemia.loc[anemia['GENDER'].eq('Male')]
female_anemia = anemia.loc[anemia['GENDER'].eq('Female')]

# Split the non-anemia records by gender.
male_no_anemia = no_anemia.loc[no_anemia['GENDER'].eq('Male')]
female_no_anemia = no_anemia.loc[no_anemia['GENDER'].eq('Female')]

In [3]:
# Lower bound of each row's reference interval, produced element-wise by the
# project helper (helpers.find_min lives in ../src/ and is not shown here).
# In the recorded preview, '13.5 - 17.5' yields 13.5.
full['MINIMUM_EXPECTED'] = full['REFERENCE_INTERVAL'].map(helpers.find_min)

In [4]:
# Upper bound of each row's reference interval, produced element-wise by the
# project helper (helpers.find_max lives in ../src/ and is not shown here).
# In the recorded preview, '13.5 - 17.5' yields 17.5.
full['MAXIMUM_EXPECTED'] = full['REFERENCE_INTERVAL'].map(helpers.find_max)

In [5]:
# Preview the first rows to confirm the two derived bound columns were added
# alongside the original REFERENCE_INTERVAL strings.
full.head()

Unnamed: 0,NO,LAB_TEST,RESULT,REFERENCE_INTERVAL,GENDER,IDENTIFICATION,MINIMUM_EXPECTED,MAXIMUM_EXPECTED
0,1,Hemoglobin,14.9,13.5 - 17.5,Male,Not Anemia,13.5,17.5
1,1,MCH,22.7,27.0 - 31.0,Male,Not Anemia,27.0,31.0
2,1,MCHC,29.1,32.0 - 36.0,Male,Not Anemia,32.0,36.0
3,1,MCV,83.7,82.0 - 92.0,Male,Not Anemia,82.0,92.0
4,2,Hemoglobin,15.9,12.0 - 16.0,Female,Not Anemia,12.0,16.0


In [6]:
# Check row count, dtypes and null counts for the full table, including the
# derived MINIMUM_EXPECTED / MAXIMUM_EXPECTED columns.
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5684 entries, 0 to 5683
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   NO                  5684 non-null   int64  
 1   LAB_TEST            5684 non-null   object 
 2   RESULT              5684 non-null   float64
 3   REFERENCE_INTERVAL  5684 non-null   object 
 4   GENDER              5684 non-null   object 
 5   IDENTIFICATION      5684 non-null   object 
 6   MINIMUM_EXPECTED    5684 non-null   float64
 7   MAXIMUM_EXPECTED    5684 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 355.4+ KB


In [7]:
# Row count and schema of the 'Anemia' subset. It was sliced from `full`
# before the expected-bound columns were added, so only the six original
# columns are reported here.
anemia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2480 entries, 8 to 5683
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   NO                  2480 non-null   int64  
 1   LAB_TEST            2480 non-null   object 
 2   RESULT              2480 non-null   float64
 3   REFERENCE_INTERVAL  2480 non-null   object 
 4   GENDER              2480 non-null   object 
 5   IDENTIFICATION      2480 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 135.6+ KB


In [8]:
# Row count and schema of the 'Not Anemia' subset. Like the anemia subset, it
# was sliced before the expected-bound columns were added, so only the six
# original columns are reported here.
no_anemia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3204 entries, 0 to 5679
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   NO                  3204 non-null   int64  
 1   LAB_TEST            3204 non-null   object 
 2   RESULT              3204 non-null   float64
 3   REFERENCE_INTERVAL  3204 non-null   object 
 4   GENDER              3204 non-null   object 
 5   IDENTIFICATION      3204 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 175.2+ KB


In [9]:
# Group each diagnosis subset by lab test so per-test statistics can be
# computed in the cells that follow.
anemia_group = anemia.groupby(by='LAB_TEST')
no_anemia_group = no_anemia.groupby(by='LAB_TEST')

In [10]:
# Mean RESULT per lab test among records labelled 'Anemia' (both genders).
anemia_group['RESULT'].mean()

LAB_TEST
Hemoglobin    11.626290
MCH           22.776290
MCHC          30.327742
MCV           85.298548
Name: RESULT, dtype: float64

In [11]:
# Mean RESULT per lab test among records labelled 'Not Anemia' (both genders).
# In the recorded outputs, Hemoglobin is the metric that clearly separates the
# two labels (about 11.6 vs 14.8); the MCH, MCHC and MCV means are close.
no_anemia_group['RESULT'].mean()

LAB_TEST
Hemoglobin    14.795506
MCH           23.005743
MCHC          30.192010
MCV           85.698127
Name: RESULT, dtype: float64

In [12]:
# Same per-lab-test grouping, restricted to male records.
male_anemia_group = male_anemia.groupby(by='LAB_TEST')
male_no_anemia_group = male_no_anemia.groupby(by='LAB_TEST')

In [13]:
# Mean RESULT per lab test for male records labelled 'Anemia'.
male_anemia_group['RESULT'].mean()

LAB_TEST
Hemoglobin    11.965777
MCH           23.138350
MCHC          30.303883
MCV           85.230825
Name: RESULT, dtype: float64

In [14]:
# Mean RESULT per lab test for male records labelled 'Not Anemia'; compare
# with the male anemia means in the previous cell.
male_no_anemia_group['RESULT'].mean()

LAB_TEST
Hemoglobin    15.277134
MCH           22.622256
MCHC          30.229268
MCV           85.048476
Name: RESULT, dtype: float64

In [15]:
# Same per-lab-test grouping, restricted to female records.
female_anemia_group = female_anemia.groupby(by='LAB_TEST')
female_no_anemia_group = female_no_anemia.groupby(by='LAB_TEST')

In [16]:
# Mean RESULT per lab test for female records labelled 'Anemia'.
female_anemia_group['RESULT'].mean()

LAB_TEST
Hemoglobin    10.953846
MCH           22.059135
MCHC          30.375000
MCV           85.432692
Name: RESULT, dtype: float64

In [17]:
# Mean RESULT per lab test for female records labelled 'Not Anemia'; compare
# with the female anemia means in the previous cell.
female_no_anemia_group['RESULT'].mean()

LAB_TEST
Hemoglobin    14.461522
MCH           23.271670
MCHC          30.166173
MCV           86.148626
Name: RESULT, dtype: float64