In [1]:
import sys, os

# Add the project root directory to Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


In [None]:
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Download
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)
df = data.frame.sample(1000, random_state=42)
df.to_csv(f"{INPUT_DIR}/housing_raw.csv", index=False)

# Process
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df, columns=['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup'],
    impute="median", scale="standard", remove_outliers="iqr"
)
df_clean.to_csv(f"{OUTPUT_DIR}/housing_clean.csv", index=False)

# Plot
plt.figure(figsize=(10,4))
plt.subplot(1,2,1); sns.boxplot(data=df[['MedInc']]); plt.title("Before")
plt.subplot(1,2,2); sns.boxplot(data=df_clean[['MedInc']]); plt.title("After")
plt.savefig(f"{OUTPUT_DIR}/housing_plot.png")
plt.close()

print("Housing: Done")
print(np_prep.report)

2025-11-07 20:14:42,873 - INFO - Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to C:\Users\Gaurav\scikit_learn_data
2025-11-07 20:14:46,124 - INFO - Fitted NumericalPreprocessor on 5 columns


Housing: Done
{'numerical': {'MedInc': {'outliers_removed': 44}, 'AveRooms': {'outliers_removed': 25}, 'Population': {'outliers_removed': 58}, 'AveOccup': {'outliers_removed': 28}}}


In [5]:
# test_02_boston.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
from sklearn.datasets import fetch_california_housing
import pandas as pd
import matplotlib.pyplot as plt
import os

# Define directories (ensure they exist)
INPUT_DIR =  r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load California housing dataset as replacement for Boston
california = fetch_california_housing(as_frame=True)
df = california.frame.sample(300, random_state=42)
df.to_csv(f"{INPUT_DIR}/california_raw.csv", index=False)

# Numerical preprocessing
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df,
    columns=['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population'],
    remove_outliers="zscore"
)

# Save cleaned data
df_clean.to_csv(f"{OUTPUT_DIR}/california_clean.csv", index=False)

# Plot example comparison
plt.boxplot([df['MedInc'], df_clean['MedInc']], labels=['Raw', 'Clean'])
plt.title("California Housing - MedInc Before vs After Cleaning")
plt.ylabel("Median Income")
plt.savefig(f"{OUTPUT_DIR}/california_plot.png")
plt.close()

print("California Housing: Done ✅")


2025-11-07 20:18:09,938 - INFO - Fitted NumericalPreprocessor on 5 columns


California Housing: Done ✅


  plt.boxplot([df['MedInc'], df_clean['MedInc']], labels=['Raw', 'Clean'])


In [8]:
# test_03_diabetes.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
from sklearn.datasets import load_diabetes
import pandas as pd
import os

# Define directories (ensure they exist)
INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load diabetes dataset
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=[f"f{i}" for i in range(10)]).sample(200, random_state=42)

# Save raw dataset
df.to_csv(f"{INPUT_DIR}/diabetes_raw.csv", index=False)

# Apply preprocessing (convert Index → list)
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(df, columns=list(df.columns), scale="minmax")

# Save cleaned dataset
df_clean.to_csv(f"{OUTPUT_DIR}/diabetes_clean.csv", index=False)

print("Diabetes: Done ✅")


2025-11-07 20:21:53,262 - INFO - Fitted NumericalPreprocessor on 10 columns


Diabetes: Done ✅


In [None]:
# test_04_wine.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
import pandas as pd
import os

# Define directories (ensure they exist)
INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load Wine Quality (Red) dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, sep=";").sample(500, random_state=42)

# Save raw dataset
df.to_csv(f"{INPUT_DIR}/wine_raw.csv", index=False)

# Apply preprocessing
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df,
    columns=list(df.columns[:-1]),   # Exclude target column "quality"
    impute="mean",
    remove_outliers="iqr"
)

# Save cleaned dataset
df_clean.to_csv(f"{OUTPUT_DIR}/wine_clean.csv", index=False)

print("Wine: Done ✅")



2025-11-07 20:24:03,582 - INFO - Fitted NumericalPreprocessor on 11 columns


Wine: Done ✅
Housing: Done
{'numerical': {'fixed acidity': {'outliers_removed': 9}, 'volatile acidity': {'outliers_removed': 7}, 'residual sugar': {'outliers_removed': 44}, 'chlorides': {'outliers_removed': 28}, 'free sulfur dioxide': {'outliers_removed': 14}, 'total sulfur dioxide': {'outliers_removed': 18}, 'density': {'outliers_removed': 15}, 'pH': {'outliers_removed': 9}, 'sulphates': {'outliers_removed': 16}, 'alcohol': {'outliers_removed': 3}}}


In [11]:
# test_05_mpg.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
import pandas as pd
import matplotlib.pyplot as plt
import os

# Define directories (ensure they exist)
INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load Auto MPG dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
cols = [
    "mpg", "cylinders", "displacement", "horsepower",
    "weight", "acceleration", "year", "origin", "name"
]
df = (
    pd.read_fwf(url, names=cols, na_values="?")
    .drop("name", axis=1)
    .dropna()
    .sample(200, random_state=42)
)

# Save raw dataset
df.to_csv(f"{INPUT_DIR}/mpg_raw.csv", index=False)

# Apply preprocessing
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df,
    columns=["displacement", "horsepower", "weight"],
    scale="standard"
)

# Save cleaned dataset
df_clean.to_csv(f"{OUTPUT_DIR}/mpg_clean.csv", index=False)

# --- Plot before vs after scaling ---
plt.figure(figsize=(6, 4))
plt.boxplot(
    [df["horsepower"], df_clean["horsepower"]],
    labels=["Raw", "Scaled"],
    patch_artist=True
)
plt.title("Horsepower: Before vs After Standard Scaling")
plt.ylabel("Value")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/mpg_plot.png")
plt.close()

print("MPG: Done ✅ (plot saved)")


2025-11-07 20:26:14,286 - INFO - Fitted NumericalPreprocessor on 3 columns


MPG: Done ✅ (plot saved)


  plt.boxplot(


In [12]:
# test_06_heart.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
import pandas as pd
import matplotlib.pyplot as plt
import os

# Define directories (ensure they exist)
INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load Heart Disease dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
cols = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]
df = pd.read_csv(url, names=cols, na_values="?").dropna().sample(200, random_state=42)

# Save raw dataset
df.to_csv(f"{INPUT_DIR}/heart_raw.csv", index=False)

# Apply preprocessing
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df,
    columns=["age", "trestbps", "chol", "thalach"],
    remove_outliers="iqr"
)

# Save cleaned dataset
df_clean.to_csv(f"{OUTPUT_DIR}/heart_clean.csv", index=False)

# --- Plot before vs after cleaning (example: "chol") ---
plt.figure(figsize=(6, 4))
plt.boxplot(
    [df["chol"], df_clean["chol"]],
    labels=["Raw", "Cleaned"],
    patch_artist=True
)
plt.title("Cholesterol: Before vs After Outlier Removal (IQR)")
plt.ylabel("Cholesterol Level")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/heart_plot.png")
plt.close()

print("Heart: Done ✅ (plot saved)")


2025-11-07 20:27:01,942 - INFO - Fitted NumericalPreprocessor on 4 columns


Heart: Done ✅ (plot saved)


  plt.boxplot(


In [13]:
# test_07_iris.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import os

# Define directories (ensure they exist)
INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load Iris dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Save raw dataset
df.to_csv(f"{INPUT_DIR}/iris_raw.csv", index=False)

# Apply preprocessing
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df,
    columns=list(df.columns),
    scale="minmax"
)

# Save cleaned dataset
df_clean.to_csv(f"{OUTPUT_DIR}/iris_clean.csv", index=False)

# --- Plot before vs after scaling (example: sepal length) ---
plt.figure(figsize=(6, 4))
plt.boxplot(
    [df["sepal length (cm)"], df_clean["sepal length (cm)"]],
    labels=["Raw", "Scaled"],
    patch_artist=True
)
plt.title("Sepal Length: Before vs After Min-Max Scaling")
plt.ylabel("Value")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/iris_plot.png")
plt.close()

print("Iris: Done ✅ (plot saved)")


2025-11-07 20:31:29,217 - INFO - Fitted NumericalPreprocessor on 4 columns


Iris: Done ✅ (plot saved)


  plt.boxplot(


In [2]:
    # test_08_tips.py
    from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
    import seaborn as sns
    import pandas as pd
    import matplotlib.pyplot as plt
    import os

    # Define directories (ensure they exist)
    INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
    OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
    os.makedirs(INPUT_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load Tips dataset
    df = sns.load_dataset("tips").sample(100, random_state=42)

    # Save raw dataset
    df.to_csv(f"{INPUT_DIR}/tips_raw.csv", index=False)

    # Apply numerical preprocessing
    np_prep = NumericalPreprocessor()
    df_clean = np_prep.fit_transform(
        df,
        columns=["total_bill", "tip"],
        impute="median"
    )

    # Save cleaned dataset
    df_clean.to_csv(f"{OUTPUT_DIR}/tips_clean.csv", index=False)

    # --- Plot before vs after cleaning (example: total_bill) ---
    plt.figure(figsize=(6, 4))
    plt.boxplot(
        [df["total_bill"], df_clean["total_bill"]],
        labels=["Raw", "Cleaned"],
        patch_artist=True
    )
    plt.title("Total Bill: Before vs After Median Imputation")
    plt.ylabel("Value")
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/tips_plot.png")
    plt.close()

    print("Tips: Done ✅ (plot saved)")


2025-11-07 21:11:30,394 - INFO - NumericalPreprocessor fitted on 2 columns (impute=median)
  plt.boxplot(


Tips: Done ✅ (plot saved)


In [2]:
# test_09_titanic.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
import pandas as pd
import matplotlib.pyplot as plt
import os

# Define directories (ensure they exist)
INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Select relevant columns and clean
df = df[['Age', 'Fare', 'Pclass', 'Survived']].dropna().sample(500, random_state=42)

# Save raw dataset
df.to_csv(f"{INPUT_DIR}/titanic_raw.csv", index=False)

# Apply preprocessing
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df,
    columns=['Age', 'Fare'],
    remove_outliers="iqr",
    scale="standard"
)

# Save cleaned dataset
df_clean.to_csv(f"{OUTPUT_DIR}/titanic_clean.csv", index=False)

# --- Plot example: Age before vs after cleaning ---
plt.figure(figsize=(6, 4))
plt.boxplot(
    [df['Age'], df_clean['Age']],
    labels=["Raw", "Cleaned"],
    patch_artist=True
)
plt.title("Titanic Age: Before vs After IQR Outlier Removal + Scaling")
plt.ylabel("Age (Standardized)")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/titanic_plot.png")
plt.close()

print("Titanic: Done ✅ (plot saved)")


ModuleNotFoundError: No module named 'src'

In [19]:
# test_10_synthetic.py
from src.TextPreprocessingToolkit.numerical_preprocessor import NumericalPreprocessor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Define directories (ensure they exist)
INPUT_DIR = r"E:\Text-Preprocessing-Toolkit\data\numerical_data"
OUTPUT_DIR = r"E:\Text-Preprocessing-Toolkit\cleaned_data\cleaned_numerical_data"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Generate synthetic dataset ---
np.random.seed(42)
df = pd.DataFrame({
    'income': np.random.lognormal(10, 1, 1000),
    'age': np.random.randint(18, 80, 1000),
    'score': np.random.beta(2, 5, 1000) * 100
})

# Add some missing values
df.loc[np.random.choice(df.index, 50), 'income'] = np.nan

# Save raw dataset
df.to_csv(f"{INPUT_DIR}/synth_raw.csv", index=False)

# --- Apply preprocessing ---
np_prep = NumericalPreprocessor()
df_clean = np_prep.fit_transform(
    df,
    columns=list(df.columns),
    impute="median",
    remove_outliers="zscore"
)

# Save cleaned dataset
df_clean.to_csv(f"{OUTPUT_DIR}/synth_clean.csv", index=False)

# --- Plot: income before vs after cleaning ---
plt.figure(figsize=(6, 4))
plt.boxplot(
    [df['income'].dropna(), df_clean['income']],
    labels=["Raw", "Cleaned"],
    patch_artist=True
)
plt.title("Synthetic: Income Before vs After Z-Score Outlier Removal")
plt.ylabel("Income")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/synth_plot.png")
plt.close()

# --- Print summary report ---
print("Synthetic: Done ✅ (plot saved)")
print(np_prep.report)


2025-11-07 20:40:03,283 - INFO - Fitted NumericalPreprocessor on 3 columns
  plt.boxplot(


Synthetic: Done ✅ (plot saved)
{'numerical': {'income': {'imputed_with': 22617.710381362493, 'outliers_removed': 14}, 'score': {'outliers_removed': 6}}}
