Purpose: Analyze and clean csv and produce a new cleaned one + potential dashboard or visualizations

In [3]:
import pandas as pd
df= pd.read_csv('../data_raw/dental_appointments_messy.csv')
df.head()

Unnamed: 0,Patient ID,Patient Name,DOB,Appointment Date,Treatment Code,Treatment Description,Provider,Cost,Insurance Paid,Patient Paid,Status
0,PT001,Jessica Williams,05/12/1985,1/15/2024,D0120,Periodic Oral Evaluation,Dr. Smith,$75.00,$60.00,$15.00,Completed
1,PT001,Jessica Williams,5/12/1985,2/20/2024,D1110,Prophylaxis - Adult,Dr. Smith,125,100.00,$25,Completed
2,PT002,Michael Johnson,11-23-1978,01/08/2024,D0150,Comprehensive Oral Evaluation,Dr. Chen,$125.00,$100.00,$25.00,Completed
3,PT002,Mike Johnson,11/23/1978,1/22/2024,D2391,Resin-based composite - one surface,Dr. Chen,$185.00,148,$37.00,Completed
4,PT003,Sarah Martinez,03/30/1992,1/10/2024,D0120,Periodic Oral Eval,Dr. Smith,75.00,0,$75.00,Completed


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Patient ID             33 non-null     object
 1   Patient Name           33 non-null     object
 2   DOB                    32 non-null     object
 3   Appointment Date       32 non-null     object
 4   Treatment Code         33 non-null     object
 5   Treatment Description  33 non-null     object
 6   Provider               33 non-null     object
 7   Cost                   33 non-null     object
 8   Insurance Paid         29 non-null     object
 9   Patient Paid           33 non-null     object
 10  Status                 33 non-null     object
dtypes: object(11)
memory usage: 3.0+ KB


In [6]:
df.describe(include='all')

Unnamed: 0,Patient ID,Patient Name,DOB,Appointment Date,Treatment Code,Treatment Description,Provider,Cost,Insurance Paid,Patient Paid,Status
count,33,33,32,32,33,33,33,33,29,33,33
unique,16,24,29,32,13,18,3,18,18,22,5
top,PT001,Jessica Williams,05/12/1985,1/15/2024,D0120,Periodic Oral Evaluation,Dr. Smith,$125.00,$100.00,$25.00,Completed
freq,3,3,2,1,10,8,13,7,5,5,28


In [7]:
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
)


In [9]:
print(df.columns)

Index(['patient_id', 'patient_name', 'dob', 'appointment_date',
       'treatment_code', 'treatment_description', 'provider', 'cost',
       'insurance_paid', 'patient_paid', 'status'],
      dtype='object')


In [12]:
df['dob'] = pd.to_datetime(df['dob'], errors='coerce').dt.date

In [13]:
df.head()

Unnamed: 0,patient_id,patient_name,dob,appointment_date,treatment_code,treatment_description,provider,cost,insurance_paid,patient_paid,status
0,PT001,Jessica Williams,1985-05-12,1/15/2024,D0120,Periodic Oral Evaluation,Dr. Smith,$75.00,$60.00,$15.00,Completed
1,PT001,Jessica Williams,1985-05-12,2/20/2024,D1110,Prophylaxis - Adult,Dr. Smith,125,100.00,$25,Completed
2,PT002,Michael Johnson,NaT,01/08/2024,D0150,Comprehensive Oral Evaluation,Dr. Chen,$125.00,$100.00,$25.00,Completed
3,PT002,Mike Johnson,1978-11-23,1/22/2024,D2391,Resin-based composite - one surface,Dr. Chen,$185.00,148,$37.00,Completed
4,PT003,Sarah Martinez,1992-03-30,1/10/2024,D0120,Periodic Oral Eval,Dr. Smith,75.00,0,$75.00,Completed


In [14]:
df['appointment_date'] = pd.to_datetime(df['appointment_date'], errors='coerce').dt.date

In [15]:
df.head()

Unnamed: 0,patient_id,patient_name,dob,appointment_date,treatment_code,treatment_description,provider,cost,insurance_paid,patient_paid,status
0,PT001,Jessica Williams,1985-05-12,2024-01-15,D0120,Periodic Oral Evaluation,Dr. Smith,$75.00,$60.00,$15.00,Completed
1,PT001,Jessica Williams,1985-05-12,2024-02-20,D1110,Prophylaxis - Adult,Dr. Smith,125,100.00,$25,Completed
2,PT002,Michael Johnson,NaT,2024-01-08,D0150,Comprehensive Oral Evaluation,Dr. Chen,$125.00,$100.00,$25.00,Completed
3,PT002,Mike Johnson,1978-11-23,2024-01-22,D2391,Resin-based composite - one surface,Dr. Chen,$185.00,148,$37.00,Completed
4,PT003,Sarah Martinez,1992-03-30,2024-01-10,D0120,Periodic Oral Eval,Dr. Smith,75.00,0,$75.00,Completed


Fixing names

In [16]:
# Create a master patient table with the correct names
# Then always reference by ID and join when needed

# First, identify the "correct" name per patient
patient_master = df.groupby('patient_id').agg({
    'patient_name': lambda x: max(x, key=len),  # Pick longest
    'dob': 'first'  # Assuming DOB is consistent
}).reset_index()

# Then merge back
df = df.drop(['patient_name', 'dob'], axis=1)
df = df.merge(patient_master, on='patient_id', how='left')

In [18]:
df.head()

Unnamed: 0,patient_id,appointment_date,treatment_code,treatment_description,provider,cost,insurance_paid,patient_paid,status,patient_name,dob
0,PT001,2024-01-15,D0120,Periodic Oral Evaluation,Dr. Smith,$75.00,$60.00,$15.00,Completed,Jessica Williams,1985-05-12
1,PT001,2024-02-20,D1110,Prophylaxis - Adult,Dr. Smith,125,100.00,$25,Completed,Jessica Williams,1985-05-12
2,PT002,2024-01-08,D0150,Comprehensive Oral Evaluation,Dr. Chen,$125.00,$100.00,$25.00,Completed,Michael Johnson,1978-11-23
3,PT002,2024-01-22,D2391,Resin-based composite - one surface,Dr. Chen,$185.00,148,$37.00,Completed,Michael Johnson,1978-11-23
4,PT003,2024-01-10,D0120,Periodic Oral Eval,Dr. Smith,75.00,0,$75.00,Completed,Sarah Martinez,1992-03-30


ModuleNotFoundError: No module named 'clean_currency'