In [8]:
import sys
from pathlib import Path
from datetime import datetime

# Add parent directory to path to import core modules
# We're in tools/01_csv_cleaner/, need to go up 2 levels to reach core
sys.path.insert(0, str(Path.cwd().parent.parent / "core"))

# Import from core
from base_tool import BaseTool
from utils import check_file_exists, get_file_size, create_directory, clean_string, bytes_to_human_readable
from config import MAX_CSV_SIZE, DEFAULT_ENCODING, OUTPUT_DIR, DEFAULT_MISSING_VALUE

# Import Pandas
import pandas as pd

In [9]:
class CSVCleaner(BaseTool):
    
    def __init__(self):
        super().__init__(name="CSV Cleaner", version="1.0.0")
        self.stats = {
            "original_rows": 0,
            "duplicates_removed": 0,
            "empty_rows_removed": 0,
            "missing_values_filled": 0,
            "final_rows": 0
        }

    def validate_input(self, filepath):
        return True #Stub for now

    def process(self, filepath):
        return {"success": True} #Stub for now

In [10]:
# TEST CELL - Verify class instantiation works

# Create an instance of CSVCleaner
cleaner = CSVCleaner()

# Test that is inherited from BaseTool correctly
info = cleaner.get_info()
print(f"Tool Name: {info['name']}")
print(f"Tool Version: {info['version']}")

# Check that stats dictionary exists
print(f"\nStats initialized: {cleaner.stats}")

Tool Name: CSV Cleaner
Tool Version: 1.0.0

Stats initialized: {'original_rows': 0, 'duplicates_removed': 0, 'empty_rows_removed': 0, 'missing_values_filled': 0, 'final_rows': 0}


In [11]:
def validate_input_full(self, filepath):

    # Check if filepath provided
    if not filepath:
        self.logger.error("No filepath provided")
        return False

    # Convert to Path object
    filepath = Path(filepath)

    # Check if file exists (using utils.py)
    if not check_file_exists(filepath):
        self.logger.error(f"File not found: {filepath}")
        return False

    # Check if CSV file
    if filepath.suffix.lower() != '.csv':
        self.logger.error(f"File must be a CSV file, got: {filepath.suffix}")
        return False

    # Check file size (using utils.py and config.py)
    size = get_file_size(filepath)
    if size is None:
        self.logger.error("Could not determine file size")
        return False

    if size > MAX_CSV_SIZE:
        self.logger.error(f"File too large: {bytes_to_human_readable(size)} (max: {bytes_to_human_readable(MAX_CSV_SIZE)})")
        return False

    self.logger.info(f"Input file validated: {filepath} ({bytes_to_human_readable(size)})")
    return True

# Replace the stub method with the full implementation
CSVCleaner.validate_input = validate_input_full    

In [12]:
# TEST CELL: Test if Validate_input works

# Create a fresh instance
cleaner = CSVCleaner()

# Test validation with the messy test file
result = cleaner.validate_input("ultimate_messy_test.csv")

print(f"\nValidation result: {result}")

[CSV Cleaner] Input file validated: ultimate_messy_test.csv (2.25 KB)

Validation result: True


In [16]:
def process_parsing_test(self, filepath):
    filepath = Path(filepath)

    try:
        # Read CSV file
        self.logger.info(f"Reading CSV file with encoding: {DEFAULT_ENCODING}")

        # Try different parsing approaches
        self.logger.info("Attempting to read CSV...")
        df = pd.read_csv(
            filepath,
            encoding=DEFAULT_ENCODING, 
            dtype=str,
            keep_default_na=False,
            na_filter=False
        )

        # Log what we got
        self.stats["original_rows"] = len(df)
        self.logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
        self.logger.info(f"Column names: {list(df.columns)}")

        # Show first 3 rows to verify data alignment
        print("\nFirst 3 rows of data:")
        print(df.head(3))

        return {
            "success": True,
            "rows": len(df),
            "columns": list(df.columns)
        }
        
    except Exception as e:
        import traceback
        self.logger.error(f"Error type: {type(e).__name__}")
        self.logger.error(f"Error message: {e}")
        self.logger.error(f"Full traceback:")
        traceback.print_exc()
        return None

# Replace the stub process method
CSVCleaner.process = process_parsing_test

In [17]:
# TEST CELL: Test if CSV Parsing works

# Create fresh instance
cleaner = CSVCleaner()

# Run the tool (will validate then process)
result = cleaner.run("ultimate_messy_test.csv")

# Show the result
print("\n" + "="*50)
print("RESULT:")
print(result)

[CSV Cleaner] Starting CSV Cleaner v1.0.0
[CSV Cleaner] Input file validated: ultimate_messy_test.csv (2.25 KB)
[CSV Cleaner] Reading CSV file with encoding: utf-8
[CSV Cleaner] Attempting to read CSV...
[CSV Cleaner] Loaded 22 rows, 11 columns
[CSV Cleaner] Column names: ['Order_ID', 'Customer_Name', 'Product_Name', 'Quantity', 'Price', 'Order_Date', 'Ship_Date', 'Status', 'Email', 'Phone', 'Notes']

First 3 rows of data:
         Order_ID        Customer_Name Product_Name Quantity       Price  \
1001   john smith       Laptop Pro 15"            2       $1      299.99   
1002  SARAH JONES     Wireless Mouse              1   €25.50  01/16/2024   
1003  Mike Wilson  Mechanical Keyboard            1   £79.99  2024-01-17   

       Order_Date   Ship_Date           Status           Email  \
1001   2024-01-15  01/20/2024        completed  JOHN@EMAIL.COM   
1002   2024/01/21   Completed  sarah@email.com                   
1003  17-Jan-2024     pending   Mike@Email.COM        555-5678   

   