In [None]:
import pandas as pd
import numpy as np

# Deliberately messy sample data, one tuple per patient visit.
# Some ages and visit costs are implausible on purpose so that the
# validation and outlier cells have something to catch.

data = pd.DataFrame(
    [
        (101, 34, 22.1, 120),
        (102, -5, 18.5, 250),
        (103, 45, 55.0, -30),
        (104, 130, 27.3, 5000),
        (105, 29, 16.0, 90),
        (106, 60, 45.2, 200),
        (107, 50, 24.5, 150),
    ],
    columns=["patient_id", "age", "bmi", "visit_cost"],
)

data

Unnamed: 0,patient_id,age,bmi,visit_cost
0,101,34,22.1,120
1,102,-5,18.5,250
2,103,45,55.0,-30
3,104,130,27.3,5000
4,105,29,16.0,90
5,106,60,45.2,200
6,107,50,24.5,150


In [2]:
# Basic validation: add one boolean column per range check.
# Both ends of the age and BMI ranges are inclusive.

data["valid_age"] = (data["age"] >= 0) & (data["age"] <= 120)
data["valid_bmi"] = (data["bmi"] >= 10) & (data["bmi"] <= 60)
data["valid_visit_cost"] = data["visit_cost"].ge(0)  # a cost may be zero, never negative

data

Unnamed: 0,patient_id,age,bmi,visit_cost,valid_age,valid_bmi,valid_visit_cost
0,101,34,22.1,120,True,True,True
1,102,-5,18.5,250,False,True,True
2,103,45,55.0,-30,True,True,False
3,104,130,27.3,5000,False,True,True
4,105,29,16.0,90,True,True,True
5,106,60,45.2,200,True,True,True
6,107,50,24.5,150,True,True,True


In [4]:
# Outlier detection on visit_cost using the interquartile range (IQR).
# The quartiles are computed over every row of `data`, including rows whose
# visit_cost is negative.

Q1 = data["visit_cost"].quantile(q=0.25)
Q3 = data["visit_cost"].quantile(q=0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below Q1 and above Q3; cast to plain Python floats.
lower_bound, upper_bound = float(Q1 - 1.5 * IQR), float(Q3 + 1.5 * IQR)

lower_bound, upper_bound

(-75.0, 405.0)

In [None]:
# Flag outliers: True where visit_cost lies strictly outside the IQR fences.

data["outlier_visit_cost"] = (
    data["visit_cost"].lt(lower_bound) | data["visit_cost"].gt(upper_bound)
)

data

Unnamed: 0,patient_id,age,bmi,visit_cost,valid_age,valid_bmi,valid_visit_cost,outlier_visit_cost
0,101,34,22.1,120,True,True,True,False
1,102,-5,18.5,250,False,True,True,False
2,103,45,55.0,-30,True,True,False,False
3,104,130,27.3,5000,False,True,True,True
4,105,29,16.0,90,True,True,True,False
5,106,60,45.2,200,True,True,True,False
6,107,50,24.5,150,True,True,True,False


In [6]:
# Flag problematic rows: a row needs review when at least one validity
# check failed, or when its visit cost was flagged as an outlier.

data["needs_review"] = (
    ~data[["valid_age", "valid_bmi", "valid_visit_cost"]].all(axis=1)
    | data["outlier_visit_cost"]
)

data

Unnamed: 0,patient_id,age,bmi,visit_cost,valid_age,valid_bmi,valid_visit_cost,outlier_visit_cost,needs_review
0,101,34,22.1,120,True,True,True,False,False
1,102,-5,18.5,250,False,True,True,False,True
2,103,45,55.0,-30,True,True,False,False,True
3,104,130,27.3,5000,False,True,True,True,True
4,105,29,16.0,90,True,True,True,False,False
5,106,60,45.2,200,True,True,True,False,False
6,107,50,24.5,150,True,True,True,False,False


In [8]:
# Separate the data

# Rows that did not get flagged for review.
# .copy() makes clean_data an independent frame, so assigning to it later
# does not raise SettingWithCopyWarning and cannot affect `data`.

clean_data = data[~data["needs_review"]].copy()
clean_data

Unnamed: 0,patient_id,age,bmi,visit_cost,valid_age,valid_bmi,valid_visit_cost,outlier_visit_cost,needs_review
0,101,34,22.1,120,True,True,True,False,False
4,105,29,16.0,90,True,True,True,False,False
5,106,60,45.2,200,True,True,True,False,False
6,107,50,24.5,150,True,True,True,False,False


In [9]:
# Rows that need to be reviewed.
# .copy() makes review_data an independent frame, so corrections applied to
# it later do not raise SettingWithCopyWarning and cannot affect `data`.
review_data = data[data["needs_review"]].copy()
review_data

Unnamed: 0,patient_id,age,bmi,visit_cost,valid_age,valid_bmi,valid_visit_cost,outlier_visit_cost,needs_review
1,102,-5,18.5,250,False,True,True,False,True
2,103,45,55.0,-30,True,True,False,False,True
3,104,130,27.3,5000,False,True,True,True,True
