In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
import great_expectations as ge

# Read the reference and current datasets from their CSV files.
REFERENCE_CSV = 'reference_data.csv'
CURRENT_CSV = 'current_data.csv'
reference_data = pd.read_csv(REFERENCE_CSV)
current_data = pd.read_csv(CURRENT_CSV)

# Detect data drift with the two-sample Kolmogorov-Smirnov test.
# The test compares numeric samples, so only columns that are numeric in the
# reference frame are considered; a column is skipped (no entry is written to
# drift_results) when it is missing from current_data, is not numeric there,
# or has no non-null values on either side. Previously any such column made
# the whole loop raise.
drift_results = {}
for col in reference_data.select_dtypes(include='number').columns:
    if col not in current_data.columns:
        continue
    if not pd.api.types.is_numeric_dtype(current_data[col]):
        continue
    ref_values = reference_data[col].dropna()
    cur_values = current_data[col].dropna()
    if ref_values.empty or cur_values.empty:
        # Nothing to compare once nulls are removed.
        continue
    stat, p_val = ks_2samp(ref_values, cur_values)
    # Drift is flagged when the p-value falls below the 0.05 threshold.
    drift_results[col] = {'p_value': p_val, 'drift_detected': p_val < 0.05}

# Visualize drift: overlay the reference and current density estimate for
# each column, one figure per column.
# Kernel density estimation needs numeric data, so only columns that are
# numeric in both frames are plotted; columns missing from current_data are
# skipped instead of raising KeyError.
for col in reference_data.select_dtypes(include='number').columns:
    if col not in current_data.columns:
        continue
    if not pd.api.types.is_numeric_dtype(current_data[col]):
        continue
    sns.kdeplot(reference_data[col], label='Reference')
    sns.kdeplot(current_data[col], label='Current')
    plt.title(f'Distribution for {col}')
    plt.legend()
    plt.show()

# Automated data-quality checks on the current dataset using Great Expectations.
ge_df = ge.from_pandas(current_data)

# Table-level expectation: row count between 1000 and 10000.
results = ge_df.expect_table_row_count_to_be_between(
    min_value=1000,
    max_value=10000,
)
print("Row count expectation:", results["success"])

# Column-level expectation: print the success flag of the not-null check
# for each column, in column order.
for col in current_data.columns:
    null_check = ge_df.expect_column_values_to_not_be_null(col)
    print(null_check["success"])