In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp, chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns

# Load reference (training) and current (new) datasets
# Replace 'reference_data.csv' and 'current_data.csv' with your actual file paths
reference_data = pd.read_csv('reference_data.csv')
current_data = pd.read_csv('current_data.csv')

# Identify common features.
# The reference file's column order is kept on purpose: building this list
# from a set intersection gives an arbitrary order that can change between
# interpreter runs (string hashing is randomized), which in turn shuffles the
# order of the result table and of the plots.
common_features = [col for col in reference_data.columns if col in current_data.columns]
reference_data = reference_data[common_features]
current_data = current_data[common_features]

# Separate numerical and categorical features.
# A feature is numerical when its dtype in the reference data is numeric;
# everything else is treated as categorical. Order follows common_features.
numerical_features = reference_data.select_dtypes(include=[np.number]).columns.tolist()
_numerical_lookup = set(numerical_features)
categorical_features = [col for col in common_features if col not in _numerical_lookup]

# Initialize a dictionary to store test results
# (feature name -> {'Test', 'Statistic', 'p-value'})
drift_results = {}

# Run a two-sample Kolmogorov-Smirnov test on each numerical feature
for feature in numerical_features:
    # Missing values are dropped from both samples before comparison
    ref_sample = reference_data[feature].dropna()
    curr_sample = current_data[feature].dropna()

    if ref_sample.empty or curr_sample.empty:
        # Nothing to compare when either side has no observed values
        drift_results[feature] = {
            'Test': 'KS',
            'Statistic': np.nan,
            'p-value': np.nan,
        }
        continue

    statistic, p_value = ks_2samp(ref_sample, curr_sample)
    drift_results[feature] = {
        'Test': 'KS',
        'Statistic': statistic,
        'p-value': p_value,
    }

# Perform Chi-Squared test for categorical features
for feature in categorical_features:
    # value_counts() skips missing values by default, so NaN is not a category
    ref_counts = reference_data[feature].value_counts()
    curr_counts = current_data[feature].value_counts()

    # Ordered union of the categories seen in either dataset (first-seen
    # order), so the contingency table layout is the same on every run.
    all_categories = list(dict.fromkeys([*ref_counts.index, *curr_counts.index]))
    ref_freq = [ref_counts.get(cat, 0) for cat in all_categories]
    curr_freq = [curr_counts.get(cat, 0) for cat in all_categories]

    # Rows: reference / current; columns: one per category
    contingency_table = np.array([ref_freq, curr_freq])

    has_multiple_categories = contingency_table.shape[1] > 1
    # If one dataset has no observed values for this feature, its row sums to
    # zero, the expected frequencies contain zeros, and chi2_contingency
    # raises ValueError. Record NaN for the feature instead of aborting the run.
    both_have_observations = bool((contingency_table.sum(axis=1) > 0).all())

    if has_multiple_categories and both_have_observations:
        chi2, p_value, _, _ = chi2_contingency(contingency_table)
        drift_results[feature] = {'Test': 'Chi-Squared', 'Statistic': chi2, 'p-value': p_value}
    else:
        # The test is undefined with a single category or an empty sample
        drift_results[feature] = {'Test': 'Chi-Squared', 'Statistic': np.nan, 'p-value': np.nan}

# Convert results to DataFrame: one row per tested feature.
# The columns are declared explicitly so the frame has the expected schema
# even when no feature was tested; building it with from_dict() on an empty
# dictionary yields a frame without a 'p-value' column, and the filter below
# then fails with a KeyError.
drift_df = pd.DataFrame(
    [{'Feature': name, **result} for name, result in drift_results.items()],
    columns=['Feature', 'Test', 'Statistic', 'p-value'],
)

# Display features with significant drift (p-value < 0.05)
# NaN p-values (tests that could not be run) compare as False and are
# therefore never reported as drifted. The cast keeps the comparison numeric
# when the frame is empty and the column has no inferred dtype.
p_values = drift_df['p-value'].astype(float)
drifted_features = drift_df[p_values < 0.05]
print("Features with significant data drift:")
print(drifted_features)

# Optional: Visualize distributions of drifted features
# One figure per drifted feature: overlaid density estimates for numerical
# features, side-by-side proportion bars for categorical ones.
for feature in drifted_features['Feature']:
    is_numerical = feature in numerical_features
    plt.figure(figsize=(10, 5))
    if is_numerical:
        # `fill` is the replacement for the `shade` keyword, which seaborn
        # has deprecated (since 0.11) and announced for removal.
        sns.kdeplot(reference_data[feature], label='Reference', fill=True)
        sns.kdeplot(current_data[feature], label='Current', fill=True)
        plt.title(f'Distribution Comparison for Numerical Feature: {feature}')
    else:
        # Relative frequencies, so datasets of different sizes are comparable
        ref_counts = reference_data[feature].value_counts(normalize=True)
        curr_counts = current_data[feature].value_counts(normalize=True)
        # Ordered union (first-seen order) keeps the bar order stable between
        # runs; a plain set union has arbitrary iteration order.
        categories = list(dict.fromkeys([*ref_counts.index, *curr_counts.index]))
        ref_freq = [ref_counts.get(cat, 0) for cat in categories]
        curr_freq = [curr_counts.get(cat, 0) for cat in categories]
        x = np.arange(len(categories))
        width = 0.35
        # Shift each series by half a bar width so the pairs sit side by side
        plt.bar(x - width/2, ref_freq, width, label='Reference')
        plt.bar(x + width/2, curr_freq, width, label='Current')
        plt.xticks(x, categories, rotation=45)
        plt.title(f'Distribution Comparison for Categorical Feature: {feature}')
    plt.xlabel(feature)
    plt.ylabel('Density' if is_numerical else 'Proportion')
    plt.legend()
    plt.tight_layout()
    plt.show()