# TP-1: CleaningData - Applying OOP and Design Patterns

Each exercise builds upon the previous one, so by the end we will have a complete CSV data-cleaning pipeline built with OOP, decorators, and design patterns.


## Setup
Import libraries and load the sample dataset. Run this cell first.

In [1]:
import pandas as pd
from pathlib import Path
# NOTE(review): absolute, machine-specific path; point it at your local copy of sample_data.csv.
DATA_PATH = Path(r'/Users/visal/Documents/ITC-AMS/Semester I 2025-2026/Advance Programming for Data Science/TP/Week-1/Code/sample_data.csv')
# Report whether the file is present before trying to read it.
print('Dataset exists:', DATA_PATH.exists())
# Load the CSV into a DataFrame; later cells reuse both `df` and `DATA_PATH`.
df = pd.read_csv(DATA_PATH)
# Last expression of the cell: the notebook renders the first rows.
df.head()

Dataset exists: True


Unnamed: 0,id,name,age,height_cm,weight_kg,city,score
0,1,Alice,29.0,165.0,68.0,New York,85.0
1,2,Bob,,172.0,,Los Angeles,90.0
2,3,Charlie,35.0,168.0,72.0,Chicago,
3,4,David,,,80.0,Houston,75.0
4,5,Eva,27.0,160.0,55.0,New York,88.0


----
## Exercise 1 - `CSVReader` class (OOP foundation)

**Goal:** Build a `CSVReader` that encapsulates reading and previewing a CSV file.

### Starter code
Fill in the `TODO` parts.

In [None]:
import pandas as pd

class CSVReader:
    """Read a CSV file with pandas and keep the loaded frame for previewing.

    Attributes are plain and public: ``file_path`` is the location given at
    construction time and ``data`` holds the last frame loaded by ``read()``
    (``None`` until then).
    """

    def __init__(self, file_path: str):
        self.data = None  # filled in by read()
        self.file_path = file_path

    def read(self) -> pd.DataFrame:
        """Load the CSV at ``self.file_path``, store it on ``self.data`` and return it."""
        frame = pd.read_csv(self.file_path)
        self.data = frame
        return frame

    def preview(self, n=5):
        """Display the first ``n`` rows, or print a hint when nothing is loaded yet."""
        if self.data is not None:
            # `display` is the rich-output helper available inside IPython/Jupyter.
            display(self.data.head(n))
            return
        print('No data loaded. Call .read() first.')

# Your turn: instantiate and call
# CSVReader is annotated to take a str path, so the Path object is converted first.
reader = CSVReader(str(DATA_PATH))
# read() returns the loaded frame and also keeps it on reader.data.
df = reader.read()
# Show the first 5 rows of the frame that was just loaded.
reader.preview(5)

----
## Exercise 2 - Strategy Pattern for Missing Value Handling

**Goal:** Implement interchangeable missing-value strategies and a `DataCleaner` that uses them.

### Starter code
Fill in the `TODO` parts. Instructor solutions are commented below each class.

In [None]:
from abc import ABC, abstractmethod
import pandas as pd

class MissingValueStrategy(ABC):
    """Common interface for interchangeable missing-value handlers (Strategy pattern)."""

    @abstractmethod
    def handle(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a DataFrame derived from ``df`` with its missing values dealt with.

        Concrete strategies must override this method; the base version does
        nothing and returns ``None``.
        """
        return None

class DropMissing(MissingValueStrategy):
    """Strategy that discards incomplete rows."""

    def handle(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` without the rows that contain at least one missing value."""
        complete_rows = df.dropna()
        return complete_rows

class FillMean(MissingValueStrategy):
    """Strategy that fills gaps in numeric columns with the column mean."""

    def handle(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` with NaNs in numeric columns replaced by that column's mean.

        Only numeric columns contribute a mean, so missing values in other
        columns are left as they are.
        """
        column_means = df.mean(numeric_only=True)
        filled = df.fillna(column_means)
        return filled

class FillMode(MissingValueStrategy):
    """Strategy that fills missing values with each column's most frequent value."""

    def handle(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with NaNs replaced by the per-column mode.

        Args:
            df: Frame to clean. It is not modified; the work happens on a copy.

        Returns:
            The copy, where every column that has a mode has its missing
            values filled with the first mode. Columns without a mode (for
            example, columns that are entirely NaN) are left unchanged.
        """
        df_copy = df.copy()
        for col in df_copy.columns:
            try:
                modes = df_copy[col].mode(dropna=True)
                if modes.empty:
                    continue
                # Assign the filled column back instead of calling
                # fillna(inplace=True) on df_copy[col]: that chained call acts
                # on a temporary Series, which pandas warns about and which no
                # longer updates df_copy under Copy-on-Write.
                df_copy[col] = df_copy[col].fillna(modes.iloc[0])
            except (TypeError, ValueError):
                # Best effort: a column whose mode cannot be computed or
                # applied is left unchanged rather than aborting the cleaning.
                continue
        return df_copy

class DataCleaner:
    """Cleaner that delegates the actual work to a pluggable missing-value strategy."""

    def __init__(self, strategy: MissingValueStrategy):
        # The strategy stays a public attribute, so it can be swapped later.
        self.strategy = strategy

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the configured strategy on ``df`` and return whatever it produces."""
        handler = self.strategy
        cleaned = handler.handle(df)
        return cleaned

# Your turn: test different strategies
# Build a cleaner around the mean-filling strategy; swap in DropMissing() or FillMode() to compare.
cleaner_mean = DataCleaner(FillMean())
# Clean the frame loaded earlier; the result is kept separately from `df`.
df_mean = cleaner_mean.clean(df)
# Last expression of the cell: the notebook renders the first rows of the cleaned frame.
df_mean.head()

----
## Exercise 3 - Decorators for Logging and Timing

**Goal:** Implement simple decorators to log actions and execution time and apply them to methods.


In [None]:
import time
from functools import wraps

def log_action(func):
    """Decorator that prints a ``[LOG]`` line before and after each call to ``func``.

    The wrapped call's arguments and return value pass through untouched. The
    "Finished" line is printed only when ``func`` returns normally.
    """
    def logged(*call_args, **call_kwargs):
        print(f"[LOG] Starting {func.__name__}()")
        outcome = func(*call_args, **call_kwargs)
        print(f"[LOG] Finished {func.__name__}()")
        return outcome

    # wraps() copies func's name, docstring, etc. onto the wrapper.
    return wraps(func)(logged)

def log_time(func):
    """Decorator that prints how long each call to ``func`` took.

    The elapsed time is measured with ``time.perf_counter()``, a monotonic
    high-resolution clock, so the reported duration cannot be distorted by
    system clock adjustments the way a ``time.time()`` difference can.

    The ``[TIME]`` line is printed only when ``func`` returns normally; an
    exception raised by ``func`` propagates unchanged.

    Returns:
        A wrapper that forwards all arguments to ``func`` and returns its result.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        duration = time.perf_counter() - start
        print(f"[TIME] {func.__name__} executed in {duration:.4f}s")
        return result
    return wrapper

# Apply to CSVReader.read (example)
class CSVReaderWithLogging(CSVReader):
    """CSVReader whose ``read()`` is wrapped by the logging and timing decorators."""

    # Decorators apply bottom-up: log_action wraps the read itself, and
    # log_time wraps that, so the [TIME] line is printed after the
    # "[LOG] Finished" line and the measured time includes the logging.
    @log_time
    @log_action
    def read(self):
        """Read the CSV through the parent implementation and return its result."""
        return super().read()

# Same file as before, read through the decorated subclass.
reader_logged = CSVReaderWithLogging(str(DATA_PATH))
# Prints the [LOG] start/finish lines and then the [TIME] line while loading.
df_logged = reader_logged.read()


----
## Exercise 4 - Factory Pattern for Transformations

### Goal:
Provide a way to obtain transformation objects via a factory.

In [None]:
from abc import ABC, abstractmethod

class DataTransform(ABC):
    """Common interface for DataFrame transformations handed out by the factory."""

    @abstractmethod
    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the transformed version of ``df``.

        Concrete transforms must override this method; the base version does
        nothing and returns ``None``.
        """
        return None

class NormalizeColumns(DataTransform):
    """Transform that tidies column labels: trimmed, lower-cased, spaces turned into underscores."""

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with normalized column labels.

        Args:
            df: Frame whose column labels should be normalized. It is not modified.

        Returns:
            A copy of ``df`` in which every string label has been stripped of
            surrounding whitespace, lower-cased, and had its spaces replaced
            by underscores. Non-string labels (for example the integer labels
            of a header-less CSV) are kept as they are instead of raising
            ``AttributeError``.
        """
        df2 = df.copy()
        df2.columns = [
            c.strip().lower().replace(' ', '_') if isinstance(c, str) else c
            for c in df2.columns
        ]
        return df2

class RemoveDuplicates(DataTransform):
    """Transform that drops repeated rows."""

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` without duplicated rows, using pandas' default settings."""
        deduplicated = df.drop_duplicates()
        return deduplicated

class StandardizeText(DataTransform):
    """Transform that trims and lower-cases the text in object-dtype columns."""

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with string cells stripped and lower-cased.

        Args:
            df: Frame to standardize. It is not modified.

        Returns:
            A copy of ``df`` in which every ``str`` value found in an
            object-dtype column has surrounding whitespace removed and is
            lower-cased. Values that are not strings (numbers, ``None``, NaN,
            ...) are passed through untouched.
        """
        df2 = df.copy()
        for col in df2.select_dtypes(include='object').columns:
            # Handle each cell individually rather than through the .str
            # accessor: .str turns non-string cells of a mixed column into NaN
            # and raises on an object column that holds no strings at all.
            df2[col] = [
                value.strip().lower() if isinstance(value, str) else value
                for value in df2[col]
            ]
        return df2

class TransformFactory:
    """Factory that turns a transform name into a fresh transform object."""

    def get_transform(self, name: str) -> DataTransform:
        """Return a new transform for ``name`` (matched case-insensitively).

        Recognized names are ``'normalize'``, ``'remove_duplicates'`` and
        ``'standardize'``.

        Raises:
            ValueError: If ``name`` is not one of the recognized names.
        """
        key = name.lower()
        registry = {
            'normalize': NormalizeColumns,
            'remove_duplicates': RemoveDuplicates,
            'standardize': StandardizeText,
        }
        transform_cls = registry.get(key)
        if transform_cls is None:
            raise ValueError(f'Unknown transform: {key}')
        return transform_cls()

# Test factory
factory = TransformFactory()
# Ask the factory for the column-normalizing transform by name.
t_norm = factory.get_transform('normalize')
# Apply it to the frame loaded earlier; the result is stored separately from `df`.
df_norm = t_norm.apply(df)
# Last expression of the cell: the notebook renders the first rows with the new column labels.
df_norm.head()

----
## Exercise 5 - Template Method: Full Pipeline

### Goal: 
Combine previous components into a pipeline class that defines the workflow skeleton and allows specialization.


In [None]:
class DataPipeline(ABC):
    """Template for a load -> clean -> transform -> save workflow.

    ``run()`` fixes the order of the steps; subclasses supply the four steps.
    """

    def run(self):
        """Execute the four steps in order, feeding each result into the next step."""
        loaded = self.load()
        cleaned = self.clean(loaded)
        transformed = self.transform(cleaned)
        self.save(transformed)

    @abstractmethod
    def load(self):
        """Produce the initial data set."""

    @abstractmethod
    def clean(self, df):
        """Return the cleaned version of ``df``."""

    @abstractmethod
    def transform(self, df):
        """Return the transformed version of ``df``."""

    @abstractmethod
    def save(self, df):
        """Persist ``df``."""

class CSVDataPipeline(DataPipeline):
    """Concrete pipeline: read a CSV, fill gaps with means, tidy it, write a CSV.

    Args:
        input_path: Location of the CSV file to read.
        output_path: Location the cleaned CSV is written to. Missing parent
            directories are created when the file is saved.
    """

    def __init__(self, input_path: str, output_path: str):
        self.input_path = input_path
        self.output_path = output_path
        self.reader = CSVReaderWithLogging(input_path)
        self.cleaner = DataCleaner(FillMean())
        self.factory = TransformFactory()

    def load(self):
        """Read the input CSV through the logging/timing reader."""
        return self.reader.read()

    def clean(self, df):
        """Handle missing values with the configured cleaner (mean filling)."""
        return self.cleaner.clean(df)

    def transform(self, df):
        """Normalize column labels, standardize text, then drop duplicate rows."""
        # Duplicates are removed last so that rows differing only in case or
        # surrounding whitespace are recognized as duplicates.
        for step in ('normalize', 'standardize', 'remove_duplicates'):
            df = self.factory.get_transform(step).apply(df)
        return df

    def save(self, df):
        """Write ``df`` to ``self.output_path`` without the index and report the location."""
        # to_csv does not create missing directories, so make sure the target
        # folder (e.g. "data/") exists before writing.
        Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(self.output_path, index=False)
        print(f'Saved cleaned data to {self.output_path}')

# Run the pipeline
# The output location is relative to the notebook's working directory.
pipeline = CSVDataPipeline(str(DATA_PATH), 'data/cleaned_sample.csv')
# Executes load -> clean -> transform -> save in that order.
pipeline.run()
# Read the written file back to check the result; the notebook renders its first rows.
pd.read_csv('data/cleaned_sample.csv').head()