In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp

# -------------------------------
# Step 1: Load the Data
# -------------------------------
def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        print("✅ Dataset loaded successfully.")
        return df
    except FileNotFoundError:
        print(f"❌ Error: {file_path} not found.")
        raise

# -------------------------------
# Step 2: Validate the Dataset
# -------------------------------
def validate_dataset(df):
    """Function to validate the dataset for missing values and appropriate types."""
    # Check for missing values
    if df.isnull().sum().any():
        print("⚠️ Warning: There are missing values in the dataset.")
    
    # Check if columns are of expected data types
    expected_dtypes = {'column_name': 'expected_dtype'}  # Example for checking data types
    for col, dtype in expected_dtypes.items():
        if df[col].dtype != dtype:
            print(f"⚠️ Warning: The column '{col}' is not of type '{dtype}'.")

# -------------------------------
# Step 3: Perform KS Test
# -------------------------------
def perform_ks_test(old_data, new_data, column):
    """Function to perform the KS test between two datasets for a given column."""
    result = ks_2samp(old_data[column], new_data[column])
    return result

# -------------------------------
# Step 4: Plotting Function for Distribution
# -------------------------------
def plot_distribution(data, feature_name):
    """Function to plot the distribution of a feature."""
    plt.figure(figsize=(10, 6))
    data[feature_name].plot(kind='hist', bins=30, color='skyblue', edgecolor='black')
    plt.title(f'Distribution of {feature_name}')
    plt.xlabel(feature_name)
    plt.ylabel('Frequency')
    plt.show()

# -------------------------------
# Step 5: Visualize the Data (Optional)
# -------------------------------
def ks_test_visualization(df, column):
    """Visualize the distribution of a feature."""
    plt.figure(figsize=(10, 6))
    df[column].plot(kind='line', color='green')
    plt.title(f'{column} over time')
    plt.show()

# -------------------------------
# Main Execution Block
# -------------------------------
if __name__ == "__main__":
    # Step 1: Load the data
    file_path = 'data.csv'  # Path to your dataset
    df = load_data(file_path)

    # Step 2: Validate the dataset
    validate_dataset(df)

    # Step 3: Example KS test (assuming you have two dataframes to compare)
    # Note: Replace these with actual datasets or split your data accordingly.
    old_data = df[df['timestamp'] < '2023-01-01']  # Example split
    new_data = df[df['timestamp'] >= '2023-01-01']  # Example split

    ks_result = perform_ks_test(old_data, new_data, 'column_name')
    print(f"KS Test result: {ks_result}")

    # Step 4: Plot distribution for a feature
    plot_distribution(df, 'column_name')

    # Step 5: Optional visualization
    ks_test_visualization(df, 'column_name')


❌ Error: data.csv not found.


FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [None]:
file_path = '/path/to/your/data.csv'
import os
print(os.getcwd())  # This will show the current directory
print(os.listdir())  # This will show a list of files in the current directory
file_path = 'path/to/data.csv'
file_path = 'src/Module 4/Advanced Data Quality & Validation/sample_data.csv'



/workspaces/AI_DATA_ANALYSIS_/src/Module 4/Advanced Data Quality & Validation
['Ques_1.ipynb', 'sample_data.csv', 'Ques_2.ipynb']
