In [1]:
import sys, os

# Add the project root directory to Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


In [2]:
# tests/test_01_titanic.py
from src.TextPreprocessingToolkit.categorical_preprocessor import CategoricalPreprocessor
import pandas as pd
import os

INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\categorical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_categorical_data"
os.makedirs(INPUT_DIR, exist_ok=True); os.makedirs(OUTPUT_DIR, exist_ok=True)

df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
df = df[['Pclass', 'Sex', 'Embarked', 'Survived']].dropna().head(500)
df.to_csv(f"{INPUT_DIR}/titanic_raw.csv", index=False)

# Label
label_enc = CategoricalPreprocessor("label", save_dir="../encoders")
label_enc.fit(df, ['Pclass', 'Sex', 'Embarked'])
df_label = label_enc.transform(df, ['Pclass', 'Sex', 'Embarked'])
df_label.to_csv(f"{OUTPUT_DIR}/titanic_label.csv", index=False)

# One-Hot
ohe_enc = CategoricalPreprocessor("onehot", save_dir="../encoders")
ohe_enc.fit(df, ['Pclass', 'Sex', 'Embarked'])
df_ohe = ohe_enc.transform(df, ['Pclass', 'Sex', 'Embarked'])
df_ohe.to_csv(f"{OUTPUT_DIR}/titanic_ohe.csv", index=False)

print("Titanic: Label →", df_label['Sex'].iloc[0], "| OHE →", df_ohe.filter(like='Sex_').columns)

2025-11-07 22:48:44,970 - INFO - Fitted label encoders for 3 columns
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
2025-11-07 22:48:45,001 - INFO - Fitted onehot encoders for 3 columns


Titanic: Label → 1 | OHE → Index(['Sex_female', 'Sex_male'], dtype='object')


In [5]:
# test_03_mushroom.py
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
cols = ["class", "cap-shape", "cap-surface", "cap-color", "bruises", "odor"]
df = pd.read_csv(url, header=None, names=cols[:6]).head(1000)
df.to_csv(f"{INPUT_DIR}/mushroom_raw.csv", index=False)

cp = CategoricalPreprocessor("onehot", save_dir="../encoders")
cp.fit(df, ['cap-shape', 'odor'])
df_clean = cp.transform(df, ['cap-shape', 'odor'])
df_clean.to_csv(f"{OUTPUT_DIR}/mushroom_ohe.csv", index=False)
print("Mushroom: OHE cols →", [c for c in df_clean.columns if "odor" in c])

2025-11-07 22:49:48,257 - INFO - Fitted onehot encoders for 2 columns


Mushroom: OHE cols → ['odor_d', 'odor_g', 'odor_m', 'odor_p', 'odor_u']


In [6]:
# test_04_adult.py
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
cols = ["age", "workclass", "education", "occupation", "income"]
df = pd.read_csv(url, header=None, names=cols).sample(1000)
df.to_csv(f"{INPUT_DIR}/adult_raw.csv", index=False)

cp = CategoricalPreprocessor("label", save_dir="../encoders")
cp.fit(df, ['workclass', 'education'])
df_clean = cp.transform(df, ['workclass', 'education'])
df_clean.to_csv(f"{OUTPUT_DIR}/adult_label.csv", index=False)
print("Adult: workclass →", df_clean['workclass'].iloc[0])

2025-11-07 22:50:06,638 - INFO - Fitted label encoders for 2 columns


Adult: workclass → 0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [7]:
# test_05_car.py
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
cols = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
df = pd.read_csv(url, names=cols).head(800)
df.to_csv(f"{INPUT_DIR}/car_raw.csv", index=False)

cp = CategoricalPreprocessor("onehot", save_dir="../encoders")
cp.fit(df, ['buying', 'safety'])
df_clean = cp.transform(df, ['buying', 'safety'])
df_clean.to_csv(f"{OUTPUT_DIR}/car_ohe.csv", index=False)
print("Car: OHE shape →", df_clean.shape)

2025-11-07 22:50:10,722 - INFO - Fitted onehot encoders for 2 columns


Car: OHE shape → (800, 10)


In [8]:
# test_06_bank.py
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
# Skip download for simplicity — use small sample
data = [
    {"job": "admin", "marital": "married", "y": "yes"},
    {"job": "blue-collar", "marital": "single", "y": "no"}
] * 500
df = pd.DataFrame(data)
df.to_csv(f"{INPUT_DIR}/bank_raw.csv", index=False)

cp = CategoricalPreprocessor("label", save_dir="../encoders")
cp.fit(df, ['job', 'marital'])
df_clean = cp.transform(df, ['job', 'marital'])
df_clean.to_csv(f"{OUTPUT_DIR}/bank_label.csv", index=False)
print("Bank: Done")

2025-11-07 22:50:13,945 - INFO - Fitted label encoders for 2 columns


Bank: Done


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [10]:
# test_08_zoo.py
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data"
cols = ["name", "hair", "feathers", "eggs", "milk", "airborne", "aquatic", "predator", "class"]
df = pd.read_csv(url, names=cols).drop("name", axis=1).head(100)
df.to_csv(f"{INPUT_DIR}/zoo_raw.csv", index=False)

cp = CategoricalPreprocessor("label", save_dir="../encoders")
cp.fit(df, ['class'])
df_clean = cp.transform(df, ['class'])
df_clean.to_csv(f"{OUTPUT_DIR}/zoo_label.csv", index=False)
print("Zoo: class →", df_clean['class'].iloc[0])

2025-11-07 22:50:25,116 - INFO - Fitted label encoders for 1 columns


Zoo: class → 0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [12]:
# test_10_synth.py
import numpy as np
df = pd.DataFrame({
    'color': np.random.choice(['red', 'blue', 'green', np.nan], 1000),
    'size': np.random.choice(['S', 'M', 'L'], 1000),
    'label': np.random.choice(['A', 'B'], 1000)
})
df.to_csv(f"{INPUT_DIR}/synth_cat_raw.csv", index=False)

# Test load/save
cp = CategoricalPreprocessor("label", save_dir="../encoders")
cp.fit(df, ['color', 'size'])
df1 = cp.transform(df, ['color', 'size'])

cp2 = CategoricalPreprocessor("label", save_dir="../encoders")
cp2.load_encoders(['color', 'size'])
df2 = cp2.transform(df, ['color', 'size'])

assert df1.equals(df2)
print("Synthetic: Load/Save passed!")

2025-11-07 22:50:38,395 - INFO - Fitted label encoders for 2 columns
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
2025-11-07 22:50:38,424 - INFO - Loaded 2 label encoders


Synthetic: Load/Save passed!


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
