<a href="https://colab.research.google.com/github/Jagadeshwarimurugan/medical-appointment-no-shows/blob/main/cleaning_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Locations of the raw input and the cleaned output. Kept as named constants so
# the notebook can be pointed at a different environment by editing one place.
RAW_DATA_PATH = "/content/noshowappointments.csv.zip"
CLEAN_DATA_PATH = "noshowappointments_cleaned.csv"


def clean_appointments(raw_df: pd.DataFrame) -> tuple:
    """Clean a raw no-show appointments frame without mutating the input.

    Steps, in order:
      1. Lower-case the column names and replace spaces with underscores.
      2. Parse ``scheduledday`` and ``appointmentday`` as datetimes.
      3. Rename the misspelled ``handcap`` column to ``handicap``.
      4. Encode ``no-show`` ("No"/"Yes") as the integer column ``no_show``.
      5. Store ``patientid`` as a string so large ids are not rendered in
         scientific notation.
      6. Drop fully duplicated rows.
      7. Keep only rows whose ``age`` is >= 0.
      8. Count the missing values that remain.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame as read from the raw CSV. After the column names are
        lower-cased it must contain ``scheduledday``, ``appointmentday``,
        ``no-show``, ``patientid`` and ``age``.

    Returns
    -------
    tuple
        ``(cleaned, stats)`` where ``cleaned`` is the cleaned DataFrame and
        ``stats`` is a dict with the integer keys ``dupes_removed``,
        ``invalid_ages`` and ``missing_values``.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # 1. Rename columns (lowercase, underscores). Work on a copy so the
    #    caller's raw frame stays intact and the cell can be re-run safely.
    cleaned = raw_df.copy()
    cleaned.columns = cleaned.columns.str.lower().str.replace(" ", "_")

    # 2. Convert datetime columns
    for date_col in ("scheduledday", "appointmentday"):
        cleaned[date_col] = pd.to_datetime(cleaned[date_col])

    # 3. Fix spelling mistake in column
    cleaned = cleaned.rename(columns={"handcap": "handicap"})

    # 4. Encode target variable 'no-show' -> 'no_show' (No=0, Yes=1).
    #    Series.map leaves any label outside the mapping as NaN; such rows
    #    show up in the missing-value count computed in step 8.
    cleaned["no_show"] = cleaned["no-show"].map({"No": 0, "Yes": 1})
    cleaned = cleaned.drop(columns=["no-show"])

    # 5. Convert PatientId to string (avoid scientific notation issue)
    cleaned["patientid"] = cleaned["patientid"].astype("int64").astype("str")

    # 6. Remove duplicate rows
    rows_before_dedupe = len(cleaned)
    cleaned = cleaned.drop_duplicates()
    dupes_removed = rows_before_dedupe - len(cleaned)

    # 7. Remove invalid ages. A single mask drives both the count and the
    #    filter: a missing age fails `age >= 0` and is therefore dropped, so
    #    it has to be counted too (counting `age < 0` separately would miss it).
    has_valid_age = cleaned["age"] >= 0
    invalid_ages = int((~has_valid_age).sum())
    cleaned = cleaned[has_valid_age]

    # 8. Check for missing values
    missing_values = int(cleaned.isnull().sum().sum())

    stats = {
        "dupes_removed": dupes_removed,
        "invalid_ages": invalid_ages,
        "missing_values": missing_values,
    }
    return cleaned, stats


# Inside a notebook kernel (or when run as a script) __name__ is "__main__",
# so the load / save / report steps below always execute there. The guard only
# keeps them from running when this code is imported as a module, which lets
# clean_appointments() be reused without touching the filesystem.
if __name__ == "__main__":
    # Load dataset
    df, cleaning_stats = clean_appointments(pd.read_csv(RAW_DATA_PATH))

    # Keep the summary counters available as top-level names.
    dupes_removed = cleaning_stats["dupes_removed"]
    invalid_ages = cleaning_stats["invalid_ages"]
    missing_values = cleaning_stats["missing_values"]

    df.to_csv(CLEAN_DATA_PATH, index=False)

    print("✅ Cleaning Completed")
    print(f"Rows removed (duplicates): {dupes_removed}")
    print(f"Rows removed (invalid ages): {invalid_ages}")
    print(f"Missing values in dataset: {missing_values}")
    print("Cleaned dataset saved as 'noshowappointments_cleaned.csv'")


✅ Cleaning Completed
Rows removed (duplicates): 0
Rows removed (invalid ages): 1
Missing values in dataset: 0
Cleaned dataset saved as 'noshowappointments_cleaned.csv'
