In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns

# Load reference (training) and current (new) datasets.
# Replace the two placeholder paths below with your actual file locations;
# pd.read_csv raises FileNotFoundError when a path does not exist.
REFERENCE_DATA_PATH = 'reference_data.csv'
CURRENT_DATA_PATH = 'current_data.csv'
reference_data = pd.read_csv(REFERENCE_DATA_PATH)
current_data = pd.read_csv(CURRENT_DATA_PATH)

# Keep only the features present in both datasets. The reference column order
# is preserved (rather than going through a set intersection, whose iteration
# order changes between interpreter runs) so that every downstream table and
# plot lists the features in the same order each time.
common_features = [
    column for column in reference_data.columns
    if column in current_data.columns
]
reference_data = reference_data[common_features]
current_data = current_data[common_features]

# Per-feature results of the two-sample Kolmogorov-Smirnov test, keyed by
# feature name. Each value holds the 'KS Statistic' and its 'p-value'.
ks_results = {}

for feature in common_features:
    ref_column = reference_data[feature]
    curr_column = current_data[feature]

    # The KS test compares empirical CDFs, which is only meaningful for
    # numeric data. A column must be numeric in BOTH datasets to be tested;
    # string/categorical columns are recorded as NaN instead of being passed
    # to ks_2samp.
    is_numeric = (
        pd.api.types.is_numeric_dtype(ref_column)
        and pd.api.types.is_numeric_dtype(curr_column)
    )

    # Missing values are excluded from the test.
    ref_values = ref_column.dropna()
    curr_values = curr_column.dropna()

    # Test only when both samples are numeric and non-empty.
    if is_numeric and len(ref_values) > 0 and len(curr_values) > 0:
        statistic, p_value = ks_2samp(ref_values, curr_values)
        ks_results[feature] = {'KS Statistic': statistic, 'p-value': p_value}
    else:
        # Untestable feature: keep a row so it still appears in the report.
        ks_results[feature] = {'KS Statistic': np.nan, 'p-value': np.nan}

# Convert results to a DataFrame with one row per feature. The columns are
# named explicitly so the frame still has a 'p-value' column when ks_results
# is empty (e.g. the two datasets share no columns); without this the filter
# below would raise KeyError.
ks_df = pd.DataFrame.from_dict(
    ks_results, orient='index', columns=['KS Statistic', 'p-value']
)
ks_df.reset_index(inplace=True)
ks_df.rename(columns={'index': 'Feature'}, inplace=True)

# Significance threshold for flagging drift. Rows whose p-value is NaN
# (features that could not be tested) never satisfy the comparison and are
# therefore not reported as drifted.
SIGNIFICANCE_LEVEL = 0.05
drifted_features = ks_df[ks_df['p-value'] < SIGNIFICANCE_LEVEL]
print("Features with significant data drift:")
print(drifted_features)

# Optional: for every drifted feature, overlay the kernel density estimates of
# the reference and current data in a separate figure.
for feature in drifted_features['Feature']:
    plt.figure(figsize=(10, 5))
    # `fill=True` is the current spelling of the filled-area option; the
    # older `shade=True` keyword has been deprecated since seaborn 0.11.
    sns.kdeplot(reference_data[feature], label='Reference', fill=True)
    sns.kdeplot(current_data[feature], label='Current', fill=True)
    plt.title(f'Distribution Comparison for Feature: {feature}')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.legend()
    plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'reference_data.csv'