In [1]:
# ========================================================
# 02 - Data Loading & Profiling
# Automatic Data Cleaning & Analysis Agent
# ========================================================

import pandas as pd
import numpy as np
import os

print("Pandas version:", pd.__version__)


Pandas version: 2.2.3


In [2]:
def detect_file_type(path: str) -> str:
    """
    Detect the dataset file type (.csv or .xlsx).
    Returns: "csv" or "excel"
    Raises: ValueError if unsupported.
    """
    ext = os.path.splitext(path)[1].lower()
    
    if ext == ".csv":
        return "csv"
    if ext in [".xlsx", ".xls"]:
        return "excel"
    
    raise ValueError(f"Unsupported file type: {ext}")


In [3]:
def load_dataset(path: str) -> pd.DataFrame:
    """
    Load a CSV or Excel dataset.
    Returns a pandas DataFrame.
    """
    file_type = detect_file_type(path)

    try:
        if file_type == "csv":
            df = pd.read_csv(path)
        else:
            df = pd.read_excel(path)
    except Exception as e:
        raise RuntimeError(f"Failed to load dataset: {e}")

    return df


In [4]:
def profile_dataset(df: pd.DataFrame) -> dict:
    """
    Analyze dataset structure and return a profiling dictionary.
    """
    
    profile = {}

    # Shape
    profile["shape"] = df.shape

    # Columns & data types
    profile["columns"] = list(df.columns)
    profile["dtypes"] = df.dtypes.astype(str).to_dict()

    # Missing values
    profile["missing_values"] = df.isna().sum().to_dict()

    # Duplicate rows
    profile["duplicate_rows"] = int(df.duplicated().sum())

    # Basic stats (for numeric columns only)
    numeric_df = df.select_dtypes(include=np.number)
    if numeric_df.shape[1] > 0:
        profile["numeric_stats"] = numeric_df.describe().to_dict()
    else:
        profile["numeric_stats"] = {}

    return profile


In [5]:
def display_profile(profile: dict):
    """Nicely print profiling information."""
    
    print("=== Dataset Profile ===")
    print(f"Shape: {profile['shape']}")
    print("\nColumns:")
    for col in profile["columns"]:
        print(f"  - {col}  ({profile['dtypes'][col]})")

    print("\nMissing Values:")
    for col, mv in profile["missing_values"].items():
        print(f"  {col}: {mv}")

    print("\nDuplicate Rows:", profile["duplicate_rows"])

    if profile["numeric_stats"]:
        print("\nNumeric Stats:")
        display(pd.DataFrame(profile["numeric_stats"]))
    else:
        print("\n(No numeric columns detected)")


In [9]:
sample_path = "../data/test_clean.csv"   # update with your actual file name

df_raw = load_dataset(sample_path)
profile = profile_dataset(df_raw)

display_profile(profile)


=== Dataset Profile ===
Shape: (20, 4)

Columns:
  - id  (int64)
  - age  (int64)
  - salary  (int64)
  - department  (object)

Missing Values:
  id: 0
  age: 0
  salary: 0
  department: 0

Duplicate Rows: 0

Numeric Stats:


Unnamed: 0,id,age,salary
count,20.0,20.0,20.0
mean,10.5,38.85,4750.45
std,5.91608,10.873798,1411.988947
min,1.0,24.0,2284.0
25%,5.75,30.25,3762.0
50%,10.5,36.5,4918.5
75%,15.25,50.25,5924.5
max,20.0,59.0,6703.0


In [6]:
import os

os.makedirs("../data", exist_ok=True)
print("Folder 'data' ready.")

Folder 'data' ready.


In [7]:
df1 = pd.DataFrame({
    "id": range(1, 21),
    "age": np.random.randint(18, 60, 20),
    "salary": np.random.randint(2000, 7000, 20),
    "department": np.random.choice(["IT", "HR", "Finance"], 20)
})

df1.to_csv("../data/test_clean.csv", index=False)
df1.head()

Unnamed: 0,id,age,salary,department
0,1,26,3995,HR
1,2,26,6268,HR
2,3,34,2284,HR
3,4,38,4957,IT
4,5,34,5661,HR


In [10]:
df2 = pd.DataFrame({
    "id": range(1, 21),
    "age": [25, "26", 27, np.nan, 29, "30", None, 32, "33", 34, 
            35, None, "36", 37, np.nan, 40, "41", 42, None, 44],
    "income": [3000, 3200, None, 4000, 3500, "3600", "", 3800, 3900, None,
               4100, "", "4200", 4300, 4400, None, "4500", 4600, "", 4700],
    "city": ["Paris", "Lyon", None, "Paris", "Marseille", "Paris", "",
             "Lyon", "Paris", None, "Paris", "Tunis", "",
             "Lyon", "Paris", "Dakar", "Paris", None, "Lyon", ""]
})

df2.to_csv("../data/test_messy.csv", index=False)

df3 = pd.DataFrame({
    "id": list(range(1, 21)) + [5, 5],   # duplicate rows
    "temperature": list(np.random.normal(22, 2, 20)) + [80, -15],  # extreme outliers
    "humidity": list(np.random.normal(50, 10, 20)) + [5, 95]  # extreme outliers
})

df3.to_csv("../data/test_outliers_duplicates.csv", index=False)

In [11]:
df_raw = load_dataset("../data/test_messy.csv")
profile = profile_dataset(df_raw)
display_profile(profile)


=== Dataset Profile ===
Shape: (20, 4)

Columns:
  - id  (int64)
  - age  (float64)
  - income  (float64)
  - city  (object)

Missing Values:
  id: 0
  age: 5
  income: 6
  city: 6

Duplicate Rows: 0

Numeric Stats:


Unnamed: 0,id,age,income
count,20.0,15.0,14.0
mean,10.5,34.066667,3985.714286
std,5.91608,5.993647,518.662693
min,1.0,25.0,3000.0
25%,5.75,29.5,3650.0
50%,10.5,34.0,4050.0
75%,15.25,38.5,4375.0
max,20.0,44.0,4700.0


In [12]:
df_raw = load_dataset("../data/test_outliers_duplicates.csv")
profile = profile_dataset(df_raw)
display_profile(profile)


=== Dataset Profile ===
Shape: (22, 3)

Columns:
  - id  (int64)
  - temperature  (float64)
  - humidity  (float64)

Missing Values:
  id: 0
  temperature: 0
  humidity: 0

Duplicate Rows: 0

Numeric Stats:


Unnamed: 0,id,temperature,humidity
count,22.0,22.0,22.0
mean,10.0,22.82964,47.637886
std,5.8554,15.087962,17.729813
min,1.0,-15.0,5.0
25%,5.0,20.879463,38.584947
50%,9.5,22.124403,47.626517
75%,14.75,23.283364,52.822845
max,20.0,80.0,95.0
