## Python code below (for VS Code only)

# %% In [12]
# ==============================
# 1. Import Required Libraries
# ==============================
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# ==============================
# 2. Define the DataCleaner Class
# ==============================
class DataCleaner:
    """
    A reusable class for cleaning datasets in Python.

    Handles missing values, duplicates, outliers, datatype fixes,
    text normalization, categorical encoding, and numeric scaling.

    The cleaner works on its own copy of the DataFrame passed to the
    constructor; every method replaces ``self.df`` with the cleaned result.
    Use ``get_clean_data()`` to retrieve the current state.
    """

    def __init__(self, dataframe: pd.DataFrame):
        """Initialize with a pandas DataFrame (copied, so the input is not modified)."""
        self.df = dataframe.copy()

    def inspect(self):
        """Display dataset summary, datatypes, missing-value counts, and a preview."""
        print("---- Data Info ----")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        self.df.info()
        print("\n---- Missing Values ----")
        print(self.df.isnull().sum())
        print("\n---- Preview ----")
        preview = self.df.head()
        try:
            # `display` only exists inside IPython/Jupyter sessions.
            display(preview)  # noqa: F821
        except NameError:
            print(preview)

    def handle_missing(self, strategy="mean", fill_values=None):
        """
        Handle missing values.

        Args:
            strategy: One of
                "mean"   - fill numeric columns with their column mean,
                "median" - fill numeric columns with their column median,
                "mode"   - fill every column with its most frequent value,
                "custom" - fill using ``fill_values``.
                Any other value (or "custom" with ``fill_values=None``)
                drops every row that contains a missing value.
            fill_values: Scalar or per-column mapping forwarded to
                ``DataFrame.fillna`` when ``strategy="custom"``.
        """
        if strategy == "mean":
            self.df = self.df.fillna(self.df.mean(numeric_only=True))
        elif strategy == "median":
            self.df = self.df.fillna(self.df.median(numeric_only=True))
        elif strategy == "mode":
            for col in self.df.columns:
                modes = self.df[col].mode()
                if modes.empty:
                    # Column has no non-missing value, so there is no mode.
                    continue
                # Assign back instead of a chained in-place fillna, which is
                # not guaranteed to update the parent DataFrame.
                self.df[col] = self.df[col].fillna(modes.iloc[0])
        elif strategy == "custom" and fill_values is not None:
            # `is not None` so that falsy fill values such as 0 or "" work.
            self.df = self.df.fillna(fill_values)
        else:
            self.df = self.df.dropna()

    def remove_duplicates(self):
        """Remove fully duplicated rows and report how many were dropped."""
        before = self.df.shape[0]
        self.df = self.df.drop_duplicates()
        after = self.df.shape[0]
        print(f"Removed {before - after} duplicate rows.")

    def fix_dtypes(self, dtype_dict=None):
        """
        Convert columns to the requested datatypes.

        Args:
            dtype_dict: Mapping of column name -> target dtype. Any dtype whose
                string form contains "datetime" is parsed with
                ``pd.to_datetime(errors="coerce")`` (unparseable values become
                NaT); everything else goes through ``Series.astype``.

        A column that cannot be converted is reported and left unchanged.
        """
        if not dtype_dict:
            return
        for col, dtype in dtype_dict.items():
            try:
                if "datetime" in str(dtype):
                    self.df[col] = pd.to_datetime(self.df[col], errors="coerce")
                else:
                    self.df[col] = self.df[col].astype(dtype)
            except Exception as e:
                # Deliberately best-effort: one bad column must not abort the rest.
                print(f"Could not convert {col}: {e}")

    def handle_outliers(self, cols, method="IQR", z_thresh=3):
        """
        Remove rows whose value in any of ``cols`` is an outlier.

        Args:
            cols: Numeric column names, processed one after another (each
                column is evaluated on the rows that survived the previous one).
            method: "IQR" (keep values within 1.5 * IQR of the quartiles) or
                "zscore" (keep values with |z| < ``z_thresh``). Matched
                case-insensitively.
            z_thresh: Z-score cut-off, used only by the "zscore" method.

        Raises:
            ValueError: If ``method`` is not a supported method name.

        Missing values are not treated as outliers and are kept; use
        ``handle_missing`` to deal with them.
        """
        key = str(method).lower()
        if key not in ("iqr", "zscore"):
            raise ValueError(
                f"Unknown outlier method {method!r}; expected 'IQR' or 'zscore'."
            )

        for col in cols:
            values = self.df[col]
            before = self.df.shape[0]
            if key == "iqr":
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                keep = values.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
                label = "IQR"
            else:
                mean = values.mean()
                std = values.std()
                if pd.isna(std) or std == 0:
                    # Constant (or single-value) column: z-scores are
                    # undefined, so nothing can be called an outlier.
                    keep = pd.Series(True, index=values.index)
                else:
                    keep = ((values - mean) / std).abs() < z_thresh
                label = "Z-score"
            # Comparisons with NaN are False; keep those rows explicitly.
            self.df = self.df[keep | values.isna()]
            after = self.df.shape[0]
            print(f"{col}: Removed {before - after} outliers ({label}).")

    def clean_text(self, cols):
        """
        Standardize text columns (strip surrounding spaces, lowercase).

        Missing values stay missing rather than being turned into the
        literal strings "none" / "nan".
        """
        for col in cols:
            original = self.df[col]
            cleaned = original.astype(str).str.strip().str.lower()
            self.df[col] = cleaned.where(original.notna(), original)

    def encode_categoricals(self, cols, method="onehot"):
        """
        Encode categorical columns.

        Args:
            cols: Column names to encode.
            method: "onehot" (``pd.get_dummies`` with ``drop_first=True``) or
                "label" (scikit-learn ``LabelEncoder``, fitted per column).

        Raises:
            ValueError: If ``method`` is not "onehot" or "label".
        """
        if method == "onehot":
            self.df = pd.get_dummies(self.df, columns=cols, drop_first=True)
        elif method == "label":
            from sklearn.preprocessing import LabelEncoder
            for col in cols:
                self.df[col] = LabelEncoder().fit_transform(self.df[col])
        else:
            raise ValueError(
                f"Unknown encoding method {method!r}; expected 'onehot' or 'label'."
            )

    def scale_numeric(self, cols, method="standard"):
        """
        Scale numeric features in place.

        Args:
            cols: List of numeric column names to scale.
            method: "standard" selects ``StandardScaler``; any other value
                selects ``MinMaxScaler``.
        """
        scaler = StandardScaler() if method == "standard" else MinMaxScaler()
        self.df[cols] = scaler.fit_transform(self.df[cols])

    def save(self, filename="cleaned_data.csv"):
        """Save the cleaned dataset as CSV (without the index column)."""
        self.df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")

    def get_clean_data(self):
        """Return the cleaned DataFrame."""
        return self.df

# ============ SAMPLE CASE ================

# %%
# ==============================
# 3. Create a Sample Dataset (Raw Data)
# ==============================
# Deliberately messy: a duplicated record that differs only by letter case
# ("Bob" / "BOB"), missing values, extreme age/salary values, stray
# whitespace, and an unparseable date.
raw_data = {
    "id": [1, 2, 2, 3, 4, 5, 6, None],
    "name": ["Alice ", "Bob", "BOB", "Charlie", None, "Eve", "Frank", "Grace"],
    "age": [25, 30, 30, 35, None, 45, 200, 28],
    "salary": [50000, 60000, 60000, None, 80000, 1200000, 70000, 65000],
    "gender": ["Female", "Male", "Male", "Male", "Female", None, "Male", "Female"],
    "date_joined": ["2021-01-01", "2021-03-15", "2021-03-15", "2021-06-20",
                    "2021-09-10", "invalid_date", "2022-01-01", "2022-05-01"],
}

df = pd.DataFrame(raw_data)
df.head()

# %%
# ==============================
# 4. Use DataCleaner Step by Step
# ==============================

# Initialize cleaner
cleaner = DataCleaner(df)

# Step 1: Inspect
cleaner.inspect()

# %%
# Step 2: Clean text columns (name, gender)
# Runs before de-duplication so that rows differing only by case or
# whitespace ("Bob" vs "BOB") are recognised as duplicates in Step 4.
cleaner.clean_text(cols=["name", "gender"])
cleaner.inspect()

# %%
# Step 3: Handle missing values (fill with median for numerics)
cleaner.handle_missing(strategy="median")
cleaner.inspect()

# %%
# Step 4: Remove duplicates
cleaner.remove_duplicates()
cleaner.inspect()

# %%
# Step 5: Fix datatypes (date column to datetime, id to string)
# NOTE(review): "id" contains a missing value, so pandas stores it as float
# and the string form will look like "1.0" - confirm that is acceptable.
cleaner.fix_dtypes({'date_joined': 'datetime64[ns]', 'id': str})
cleaner.inspect()

# %%
# Step 6: Handle outliers (age, salary columns)
cleaner.handle_outliers(cols=["age", "salary"], method="IQR")
cleaner.inspect()

# %%
# Step 7: Encode categoricals (gender)
cleaner.encode_categoricals(cols=["gender"], method="onehot")
cleaner.inspect()

# %%
# Step 8: Scale numeric features (age, salary)
cleaner.scale_numeric(cols=["age", "salary"], method="standard")
cleaner.inspect()

# %%
# Step 9: Save cleaned dataset
cleaner.save("client_cleaned_data.csv")

# Get final cleaned DataFrame
final_df = cleaner.get_clean_data()
final_df.head()

# %% [markdown]
# ---
#
# ✅ This notebook shows:
#
# * Raw dataset creation
# * Cleaning **step-by-step with visible outputs**
# * Final clean dataset saved
#
# ---
#
# Would you like me to also include **visualizations (like missing data heatmap, outlier boxplots)** in this notebook so you can **show clients before/after cleaning**?