In [15]:
import sys
sys.path.append("..")

import pandas as pd
import matplotlib.pyplot as plt
from src.missing_value_handler import MissingValueHandler
from src.outlier_handler import OutlierHandler
from src.scaler import Scaler
from src.text_cleaner import TextCleaner
from src.feature_engineer import FeatureEngineer
from src.data_type_converter import DataTypeConverter
from src.categorical_encoder import CategoricalEncoder
from src.date_time_handler import DateTimeHandler


# Read the sample CSV into the notebook-level frame `df`; the path is
# resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='synthetic_sample_data.csv')


def plot_numeric_distributions(df, title):
    """Histogram the columns of ``df`` and display the resulting figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to ``DataFrame.hist``.
    title : str
        Super-title placed on the current figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    histogram_options = {'figsize': (15, 10), 'bins': 15}
    df.hist(**histogram_options)
    plt.suptitle(title)
    plt.show()

def plot_outliers(df, outliers, column):
    """Scatter ``column`` against the index and overlay the outlier rows in red.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set; its index and ``column`` values form the base layer.
    outliers : pandas.DataFrame
        Rows to highlight; its index and ``column`` values form the red layer.
    column : str
        Name of the column to plot; also used for the y-label and the title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(10, 6))

    # Base layer first, then the highlighted rows so they are drawn on top.
    layers = (
        (df, {'label': 'Data'}),
        (outliers, {'color': 'red', 'label': 'Outliers'}),
    )
    for frame, scatter_style in layers:
        plt.scatter(frame.index, frame[column], **scatter_style)

    plt.title(f'Outliers in {column}')
    plt.xlabel('Index')
    plt.ylabel(column)
    plt.legend()
    plt.show()

ModuleNotFoundError: No module named 'missing_value_handler'

In [None]:
def test_missing_value_handler(sample_data, missing_value_handler):
    df = sample_data.copy()
    # Assuming handle_missing_values is a method in MissingValueHandler
    df_cleaned = missing_value_handler.handle_missing_values(df)
    assert df_cleaned.isnull().sum().sum() == 0

In [None]:

def test_outlier_handler(sample_data, outlier_handler):
    df = sample_data.copy()
    # Assuming detect_outliers is a method in OutlierHandler
    outliers = outlier_handler.detect_outliers(df, column='some_column')
    assert not outliers.empty
    plot_outliers(df, outliers, 'some_column')


In [None]:
def test_scaler(sample_data, outlier_handler, scaler):
    """Run outlier handling and standard scaling, then print and plot the result.

    Parameters
    ----------
    sample_data : pandas.DataFrame
        Input frame; a copy of it is passed to ``outlier_handler``.
    outlier_handler : object
        Anything exposing ``handle_outliers(frame)``.
    scaler : object
        Not read by this function: a new ``Scaler`` is constructed below
        instead of using this argument.
    """
    # Outlier handling comes first so scaling sees the handled frame.
    without_outliers = outlier_handler.handle_outliers(sample_data.copy())

    numeric_cols = without_outliers.select_dtypes(include='number').columns

    # NOTE(review): assumes Scaler takes the frame in its constructor, exposes
    # standard_scaler(columns) and stores its result on `.df` — confirm.
    fresh_scaler = Scaler(without_outliers.copy())
    fresh_scaler.standard_scaler(numeric_cols)
    scaled_frame = fresh_scaler.df

    print("\nDataFrame after scaling numeric columns:")
    print(scaled_frame[numeric_cols])
    plot_numeric_distributions(scaled_frame, 'Distributions after Scaling')



In [None]:
def test_text_cleaner(sample_data, scaler, text_cleaner):
    df = sample_data.copy()
    # Assuming previous steps were already done
    df_cleaned = scaler.df

    # Clean text for all object type columns that are likely text
    df_cleaned_textcleaner = df_cleaned.copy()
    text_columns = df_cleaned_textcleaner.select_dtypes(include=['object']).columns
    for column in text_columns:
        text_cleaner = TextCleaner(df_cleaned_textcleaner, column)
        df_cleaned_textcleaner = text_cleaner.to_lowercase()
        df_cleaned_textcleaner = text_cleaner.remove_punctuation()
        df_cleaned_textcleaner = text_cleaner.remove_stopwords()
        df_cleaned_textcleaner = text_cleaner.lemmatize()

    print("\nDataFrame after cleaning text columns:")
    print(df_cleaned_textcleaner[text_columns])



In [None]:
def test_feature_engineer(sample_data, scaler, text_cleaner, feature_engineer):
    """Clean text columns, add pairwise interaction terms, then print and plot.

    Parameters
    ----------
    sample_data : pandas.DataFrame
        Copied on entry; the copy is not read afterwards.
    scaler : object
        Source of the frame to process, taken from its ``.df`` attribute.
    text_cleaner : object
        Not read by this function: a new ``TextCleaner`` is built per column.
    feature_engineer : object
        Not read by this function: a new ``FeatureEngineer`` is built below.
    """
    # Kept from the original: the fixture is copied even though nothing uses it.
    _unused_copy = sample_data.copy()

    # --- Text cleaning on a copy of the frame held by the scaler ---
    text_cleaned = scaler.df.copy()
    object_cols = text_cleaned.select_dtypes(include=['object']).columns
    cleaning_steps = ('to_lowercase', 'remove_punctuation', 'remove_stopwords', 'lemmatize')
    for name in object_cols:
        # NOTE(review): assumes each TextCleaner step returns the updated
        # frame — confirm against the class.
        column_cleaner = TextCleaner(text_cleaned, name)
        for step in cleaning_steps:
            text_cleaned = getattr(column_cleaner, step)()

    # --- Interaction terms for every unordered pair of numeric columns ---
    engineered = text_cleaned.copy()
    # Column list is captured once, before any interaction column is added.
    numeric_cols = engineered.select_dtypes(include='number').columns
    engineer = FeatureEngineer(engineered)
    column_pairs = [
        (left, right)
        for position, left in enumerate(numeric_cols)
        for right in numeric_cols[position + 1:]
    ]
    for left, right in column_pairs:
        # NOTE(review): assumes add_interaction_term returns the updated
        # frame — confirm against FeatureEngineer.
        engineered = engineer.add_interaction_term(left, right, f'{left}_{right}_interaction')

    print("\nDataFrame after adding interaction terms between numeric columns:")
    print(engineered)
    plot_numeric_distributions(engineered, 'Distributions after Feature Engineering')


In [None]:
# Convert the text columns to categorical and the numeric columns to numeric.
# NOTE(review): in the visible notebook, `df_cleaned_featureengineer`,
# `text_columns` and `numeric_columns` are only assigned as locals inside the
# test_* functions above, so this cell would raise NameError on a fresh
# kernel unless they are also defined at notebook scope — verify.
# Work on a copy so the feature-engineered frame is left untouched.
df_cleaned_datatypeconverter = df_cleaned_featureengineer.copy()
data_type_converter = DataTypeConverter(df_cleaned_datatypeconverter)
# Each call's return value rebinds the frame; presumably the converter returns
# the updated frame — confirm against DataTypeConverter.
df_cleaned_datatypeconverter = data_type_converter.to_categorical(text_columns)
df_cleaned_datatypeconverter = data_type_converter.to_numeric(numeric_columns)

# Show only the resulting dtypes rather than the whole frame.
print("\nDataFrame after converting text columns to categorical and numeric columns to numeric:")
print(df_cleaned_datatypeconverter.dtypes)

In [None]:
# One-hot encode every column listed in `text_columns`.
# NOTE(review): depends on `df_cleaned_datatypeconverter` from the previous
# cell and on a notebook-level `text_columns`; the latter is only assigned
# inside functions in the visible notebook — verify it exists at this scope.
# Work on a copy so the dtype-converted frame is left untouched.
df_cleaned_categoricalencoder = df_cleaned_datatypeconverter.copy()
categorical_encoder = CategoricalEncoder(df_cleaned_categoricalencoder)
for column in text_columns:
    # The frame is rebound on every iteration; presumably one_hot_encode
    # returns the frame with all encodings applied so far — confirm.
    df_cleaned_categoricalencoder = categorical_encoder.one_hot_encode(column)

print("\nDataFrame after one-hot encoding categorical columns:")
print(df_cleaned_categoricalencoder)

In [None]:

# Handle datetime columns dynamically: every object-dtype column still present
# after encoding is treated as a date column.
# NOTE(review): depends on `df_cleaned_categoricalencoder` from the previous
# cell. If the earlier steps already converted or encoded every object column,
# `date_columns` may be empty and the loop below does nothing — confirm.
# Work on a copy so the one-hot-encoded frame is left untouched.
df_cleaned_datetimehandler = df_cleaned_categoricalencoder.copy()
date_columns = df_cleaned_datetimehandler.select_dtypes(include=['object']).columns  # Assuming date columns are initially in object type
datetime_handler = DateTimeHandler(df_cleaned_datetimehandler)
for column in date_columns:
    # Convert first, then derive the date parts from the same column.
    # Presumably both calls return the updated frame — confirm against
    # DateTimeHandler.
    df_cleaned_datetimehandler = datetime_handler.to_datetime(column)
    df_cleaned_datetimehandler = datetime_handler.extract_date_parts(column)

print("\nDataFrame after handling datetime columns:")
print(df_cleaned_datetimehandler)