In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp
import os

# -------------------------------
# Function: Load data with error handling
# -------------------------------
def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"❌ File not found at: {file_path}")
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Successfully loaded: {file_path}")
        return df
    except Exception as e:
        raise IOError(f"❌ Failed to load data from {file_path}: {e}")

# -------------------------------
# Function: Validate dataset structure
# -------------------------------
def validate_dataset(df, required_columns):
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"❌ Missing required column: {col}")
    print("✅ Dataset validation passed.")

# -------------------------------
# Function: Perform KS test
# -------------------------------
def perform_ks_test(df1, df2, feature):
    stat, p_value = ks_2samp(df1[feature], df2[feature])
    print(f"📊 KS Test for '{feature}' — Statistic: {stat:.4f}, p-value: {p_value:.4f}")
    if p_value < 0.05:
        print(f"⚠️ Data drift detected for '{feature}' (p-value < 0.05)\n")
    else:
        print(f"✅ No significant drift detected for '{feature}'\n")
    return stat, p_value

# -------------------------------
# Function: Plot feature distributions
# -------------------------------
def plot_distribution(df1, df2, feature):
    plt.figure(figsize=(10, 6))
    plt.hist(df1[feature], bins=30, alpha=0.6, label="Original", color='blue')
    plt.hist(df2[feature], bins=30, alpha=0.6, label="New", color='orange')
    plt.title(f"Distribution Comparison: {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# -------------------------------
# Main Execution
# -------------------------------
if __name__ == "__main__":
    file_path_original = "src/Module 4/Advanced Data Quality & Validation/saample_data.csv"
    file_path_new = "src/Module 4/Advanced Data Quality & Validation/saample_data.csv"  # Assuming same file for test

    try:
        df_original = load_data(file_path_original)
        df_new = load_data(file_path_new)
    except Exception as err:
        print(err)
        exit(1)

    # Example feature names — replace with actual ones from your dataset
    feature_names = ['Feature1', 'Feature2']

    try:
        validate_dataset(df_original, feature_names)
        validate_dataset(df_new, feature_names)
    except ValueError as ve:
        print(ve)
        exit(1)

    for feature in feature_names:
        perform_ks_test(df_original, df_new, feature)
        plot_distribution(df_original, df_new, feature)


❌ File not found at: src/Module 4/Advanced Data Quality & Validation/saample_data.csv


NameError: name 'df_original' is not defined

In [None]:
file_path = '/full/path/to/your/saample_data.csv'
import os

file_path = 'src/Module 4/Advanced Data Quality & Validation/saample_data.csv'
if os.path.exists(file_path):
    print(f"✅ File found: {file_path}")
else:
    print(f"❌ File not found at: {file_path}")
import os
print(f"Current Working Directory: {os.getcwd()}")


❌ File not found at: src/Module 4/Advanced Data Quality & Validation/saample_data.csv
Current Working Directory: /workspaces/AI_DATA_ANALYSIS_/src/Module 4/Advanced Data Quality & Validation
