"""
# Data Cleaning Notebook

### Objective:
- Load raw dataset (`insurance.csv`)
- Check for missing values, duplicates, and inconsistencies
- Clean and preprocess the data
- Save cleaned dataset as `insurance_cleaned.csv`
"""

In [1]:
import pandas as pd
import os

# Input and output locations, relative to the notebook's working directory.
file_path = os.path.join("..", "data", "insurance.csv")
cleaned_file_path = os.path.join("..", "data", "insurance_cleaned.csv")

# Fail early with an explicit message if the raw dataset is missing.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")
df = pd.read_csv(file_path)

# Display basic dataset information.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("Dataset Info:")
df.info()

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Standardize categorical columns BEFORE looking for duplicates: rows that
# differ only by letter case or surrounding whitespace in these columns are
# the same record, and would otherwise survive de-duplication and end up as
# exact duplicates in the saved file.
for column in ("sex", "smoker", "region"):
    df[column] = df[column].str.lower().str.strip()

# Check for duplicate records (counted on the standardized data, so the
# number reported matches the number of rows removed below)
print("\nDuplicate Records:", df.duplicated().sum())

# Remove duplicate records; assignment instead of inplace=True keeps the
# step explicit about which frame it produces.
df = df.drop_duplicates()

# Save cleaned dataset (index is not written, so the gaps left by the
# dropped rows do not appear in the file)
df.to_csv(cleaned_file_path, index=False)

print("\nData Cleaning Completed. Saved as insurance_cleaned.csv")
print(df.head())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None

Missing Values:
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

Duplicate Records: 1

Data Cleaning Completed. Saved as insurance_cleaned.csv
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.4620