### Implementing Adversarial Validation for Data Drift
Description: Create and train a classifier that distinguishes between train and test datasets, using the classifier’s performance to infer data drift.

In [None]:
# write your code from here
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Adversarial validation: label every row by its origin (train vs. test),
# train a classifier to tell the two apart, and read its ROC AUC as a drift
# signal. An AUC near 0.5 means the sets are hard to distinguish; an AUC well
# above 0.5 means the feature distributions differ.

LABEL_COLUMN = 'is_test'      # 0 for rows from train, 1 for rows from test
TARGET_COLUMN = 'target'      # supervised target; never used as a feature here
DRIFT_AUC_THRESHOLD = 0.7     # AUC strictly above this is reported as drift


def build_adversarial_frame(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_col: str = TARGET_COLUMN,
    label_col: str = LABEL_COLUMN,
) -> pd.DataFrame:
    """Stack train and test rows into one frame labelled by origin.

    Only columns present in BOTH inputs are kept. A column that exists in just
    one of them (typically the target, whatever its name, or an ID) would be
    all-NaN for the other set after concatenation and would let the classifier
    separate the sets trivially, reporting drift that is not there.

    Args:
        train_df: Training rows; labelled 0 in ``label_col``.
        test_df: Test rows; labelled 1 in ``label_col``.
        target_col: Column excluded from the features even if both inputs
            contain it.
        label_col: Name of the origin label column that is added. A column of
            that name already present in the inputs is overwritten.

    Returns:
        A new frame with the shared feature columns (in ``train_df`` order)
        followed by ``label_col``, with a fresh 0..n-1 index. The inputs are
        not modified.

    Raises:
        ValueError: If either input is empty or the inputs share no feature
            column.
    """
    if train_df.empty or test_df.empty:
        # With one side empty there is only one class, so the adversarial
        # classifier (and its ROC AUC) would be undefined.
        raise ValueError("Both train and test datasets must be non-empty.")

    test_columns = set(test_df.columns)
    excluded = {target_col, label_col}
    shared = [
        col for col in train_df.columns
        if col in test_columns and col not in excluded
    ]
    if not shared:
        raise ValueError("Train and test datasets share no feature columns.")

    # `assign` returns labelled copies, so the caller's frames are left alone.
    return pd.concat(
        [
            train_df[shared].assign(**{label_col: 0}),
            test_df[shared].assign(**{label_col: 1}),
        ],
        axis=0,
        ignore_index=True,
    )


def interpret_auc(auc_score: float, threshold: float = DRIFT_AUC_THRESHOLD) -> str:
    """Return the drift verdict message for an adversarial-validation AUC.

    Args:
        auc_score: ROC AUC of the train-vs-test classifier.
        threshold: Drift is reported only when ``auc_score`` is strictly
            greater than this value.

    Returns:
        The human-readable verdict string.
    """
    if auc_score > threshold:
        return "Significant data drift detected between train and test datasets."
    return "No significant data drift detected between train and test datasets."


# In a notebook cell or when run as a script, __name__ is "__main__", so the
# workflow below executes as before and leaves the same variables defined.
if __name__ == "__main__":
    # Load your train and test datasets (example placeholders)
    # Replace with your actual dataset loading code
    train_df = pd.read_csv('path_to_train.csv')
    test_df = pd.read_csv('path_to_test.csv')

    # Report columns that cannot be compared because only one dataset has them
    unshared_columns = (
        [col for col in train_df.columns if col not in test_df.columns]
        + [col for col in test_df.columns if col not in train_df.columns]
    )
    if unshared_columns:
        print(f'Ignoring columns not present in both datasets: {unshared_columns}')

    # Combine datasets: shared features only, target dropped, origin label added
    combined_df = build_adversarial_frame(train_df, test_df)

    # Separate features and labels
    X = combined_df.drop(columns=[LABEL_COLUMN])
    y = combined_df[LABEL_COLUMN]

    # One-hot encode categorical features. Encoding the combined frame keeps
    # the dummy columns identical for rows of both origins.
    # NOTE(review): missing values are passed through unchanged; whether the
    # classifier accepts NaN depends on the installed scikit-learn version.
    X = pd.get_dummies(X)

    # Hold out part of the combined data so the AUC is measured on rows the
    # classifier has not seen; stratify to keep the train/test ratio in both.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Initialize and train the classifier (you can try other models too)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Probability of the "test" class (label 1) on the validation rows
    y_pred_proba = clf.predict_proba(X_val)[:, 1]

    # Calculate ROC AUC score
    auc_score = roc_auc_score(y_val, y_pred_proba)
    print(f'Adversarial Validation ROC AUC: {auc_score:.4f}')

    # Also print accuracy as an additional metric
    y_pred = clf.predict(X_val)
    acc_score = accuracy_score(y_val, y_pred)
    print(f'Adversarial Validation Accuracy: {acc_score:.4f}')

    # Interpretation
    print(interpret_auc(auc_score))


