# In [3]:
# Model 1

# Key dates used by the experiments in this cell
initial_start = pd.Timestamp('2015-01-01')
initial_end = pd.Timestamp('2015-12-31')
training_end = pd.Timestamp('2017-11-30')
eval_end = pd.Timestamp('2018-01-31')

# -------------- Model 1: Data Preparation --------------
# Every row dated on or before initial_end forms the initial training pool
initial_data = df.loc[df['order_date'] <= initial_end]
y_initial = initial_data['fraud']
X_initial = initial_data.drop(columns=['fraud', 'order_date'])

# Hold out 20% of the pool for validation (seeded so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X_initial,
    y_initial,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# -------------- Model 1: Architecture --------------
def create_model(learning_rate=0.001):
    """Build and compile the feed-forward binary classifier.

    The network takes 39 input features, passes them through Dense layers of
    64, 32 and 16 ReLU units (each followed by Dropout at 0.3, 0.2 and 0.2),
    and ends in a single sigmoid unit.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    layers = [keras.layers.Input(shape=(39,))]
    for units, drop_rate in ((64, 0.3), (32, 0.2), (16, 0.2)):
        layers.append(keras.layers.Dense(units, activation='relu'))
        layers.append(keras.layers.Dropout(drop_rate))
    layers.append(keras.layers.Dense(1, activation='sigmoid'))
    network = keras.Sequential(layers)

    # Accuracy plus precision, recall and area under the precision-recall curve
    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(curve='PR', name='auc_pr'),
    ]
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )
    return network

# -------------- Model 1: Training --------------
print(f"\nTraining Model 1 (Static) with data up to {initial_end.strftime('%Y-%m-%d')}...")

# Train once on the scaled initial split; this model is not retrained afterwards
model1 = create_model()
history1 = model1.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=10,
    batch_size=512,
    verbose=1,
)

# One row per epoch (numbered from 1) with the train/validation curves Keras recorded
model1_train_results = pd.DataFrame({
    'epoch': range(1, len(history1.history['loss']) + 1),
    **{
        key: history1.history[key]
        for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy', 'auc_pr', 'val_auc_pr')
    },
})

# -------------- Model 1: Testing on Rolling Windows --------------
# The static model is scored, without retraining, on successive windows that
# each span two calendar months and whose start advances one month at a time.
model1_results = []

# First and last window start dates
last_valid_start = pd.Timestamp('2017-12-01')
current_date = pd.Timestamp('2016-02-01')

while current_date <= last_valid_start:
    # Window runs from current_date through the last day of the following month
    test_start = current_date
    test_end = test_start + pd.offsets.MonthEnd(2)

    # Inclusive on both ends so orders dated on the window's first day are
    # scored too (matches the >= bound used for the incremental model's windows)
    test_mask = (df['order_date'] >= test_start) & (df['order_date'] <= test_end)
    test_data = df[test_mask]

    if len(test_data) > 0:
        # Same feature columns and the same fitted scaler as in training
        X_test = test_data.drop(columns=['fraud', 'order_date'])
        y_test = test_data['fraud']
        X_test_scaled = scaler.transform(X_test)

        # The single sigmoid output yields an (n, 1) array; flatten it so the
        # sklearn metric functions receive 1-D scores
        y_pred_proba = model1.predict(X_test_scaled, verbose=0).ravel()

        # Flag the highest-scoring ~1% of the window as fraud
        threshold = np.quantile(y_pred_proba, 0.99)  # Top 1%
        y_pred = (y_pred_proba >= threshold).astype(int)
        print(f"Threshold value for this window: {threshold}")

        # Thresholded precision/recall plus threshold-free PR-AUC
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precisions, recalls, _ = precision_recall_curve(y_test, y_pred_proba)
        pr_auc = auc(recalls, precisions)

        model1_results.append({
            'date': current_date,
            'model_name': 'Static',
            'window': f"{test_start.strftime('%Y-%m')} to {test_end.strftime('%Y-%m')}",
            'PR-AUC': pr_auc,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision,
            'Recall': recall
        })

    # current_date is always the 1st of a month, so a one-month offset is exact
    current_date += pd.DateOffset(months=1)

# One row per evaluated window
model1_test_results = pd.DataFrame(model1_results)

# Bundle the static model's training curves and per-window test scores
model1_final_results = dict(
    model_name='Static',
    train_metrics=model1_train_results,
    test_metrics=model1_test_results,
)

# np.save pickles the dict; it is read back with np.load(..., allow_pickle=True).item()
np.save('model1_results.npy', model1_final_results, allow_pickle=True)

# Per-window table followed by the column means
print("\nModel 1 (Static) Test Metrics:")
print(model1_test_results.loc[:, ['window', 'PR-AUC', 'Accuracy', 'Precision', 'Recall']])
print("\nAverage Test Metrics:")
print(model1_test_results.loc[:, ['PR-AUC', 'Accuracy', 'Precision', 'Recall']].mean())



# -------------- Model 1: Plot Training Metrics --------------
# One panel per metric, each overlaying the training and validation curves
plt.figure(figsize=(15, 5))
metrics_to_plot = {
    'Loss': ['loss', 'val_loss'],
    'Accuracy': ['accuracy', 'val_accuracy'],
    'AUC-PR': ['auc_pr', 'val_auc_pr'],
}

for panel, (metric_name, (train_key, val_key)) in enumerate(metrics_to_plot.items(), start=1):
    plt.subplot(1, 3, panel)
    plt.plot(history1.history[train_key], label='Train')
    plt.plot(history1.history[val_key], label='Validation')
    plt.title(f'Model 1 - {metric_name} over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
plt.tight_layout()
plt.show()


# -------------- Model 1: Plot Test Metrics --------------
# 2x2 grid: one panel per test metric, x-axis labelled with the window names
plt.figure(figsize=(15, 10))
metrics = ['PR-AUC', 'Accuracy', 'Precision', 'Recall']
tick_positions = range(len(model1_test_results))

for panel, metric in enumerate(metrics, start=1):
    plt.subplot(2, 2, panel)
    plt.plot(tick_positions, model1_test_results[metric], label='Static Model', color='blue')
    plt.title(f'{metric} Over Test Windows')
    plt.xlabel('Test Window')
    plt.ylabel(metric)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    # Use window labels for x-axis
    plt.xticks(
        tick_positions,
        model1_test_results['window'],
        rotation=90,
        ha='right',
    )

plt.tight_layout()
plt.show()






# Model 2

# Key dates (same values as in the Model 1 section)
initial_start = pd.Timestamp(year=2015, month=1, day=1)
initial_end = pd.Timestamp(year=2015, month=12, day=31)
training_end = pd.Timestamp(year=2017, month=11, day=30)
eval_end = pd.Timestamp(year=2018, month=1, day=31)

def create_model(learning_rate=0.001):
    """Build and compile the classifier used by the incremental model.

    Same layout as the other sections (39 inputs, Dense 64/32/16 with ReLU and
    Dropout, one sigmoid output), but the Dense layers carry the explicit names
    ``dense_1``, ``dense_2``, ``dense_3`` and ``output``.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    inputs = keras.layers.Input(shape=(39,))
    hidden_1 = keras.layers.Dense(64, activation='relu', name='dense_1')
    drop_1 = keras.layers.Dropout(0.3)
    hidden_2 = keras.layers.Dense(32, activation='relu', name='dense_2')
    drop_2 = keras.layers.Dropout(0.2)
    hidden_3 = keras.layers.Dense(16, activation='relu', name='dense_3')
    drop_3 = keras.layers.Dropout(0.2)
    head = keras.layers.Dense(1, activation='sigmoid', name='output')

    net = keras.Sequential(
        [inputs, hidden_1, drop_1, hidden_2, drop_2, hidden_3, drop_3, head]
    )

    adam = keras.optimizers.Adam(learning_rate=learning_rate)
    net.compile(
        optimizer=adam,
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(curve='PR', name='auc_pr'),
        ],
    )
    return net

# -------------- Model 2: Initial Training (2015) --------------
print("\nStarting Model 2: Initial Training on 2015 data...")

# Containers filled by the initial fit and by the monthly updates
models = {}
model2_train_results = []
model2_results = []

# Rows dated within [initial_start, initial_end], both ends inclusive
initial_mask = df['order_date'].between(initial_start, initial_end)
initial_data = df[initial_mask]

# Separate the label from the features; the date column is not a feature
y_initial = initial_data['fraud']
X_initial = initial_data.drop(columns=['fraud', 'order_date'])

# 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X_initial,
    y_initial,
    test_size=0.2,
    random_state=42,
)

# The scaler is fitted here once; the monthly updates reuse it without refitting
initial_scaler = StandardScaler()
initial_scaler.fit(X_train)
X_train_scaled = initial_scaler.transform(X_train)
X_val_scaled = initial_scaler.transform(X_val)

# Fit the starting model
initial_model = create_model(learning_rate=0.001)
initial_history = initial_model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=10,
    batch_size=512,
    verbose=1,
)

# Keep the model and remember its weights as the seed for the first monthly update
models[initial_end] = initial_model
previous_weights = initial_model.get_weights()

# Record the last epoch's value of each tracked curve
final_epoch = {name: values[-1] for name, values in initial_history.history.items()}
model2_train_results.append({
    'date': initial_end,
    'train_loss': final_epoch['loss'],
    'val_loss': final_epoch['val_loss'],
    'train_accuracy': final_epoch['accuracy'],
    'val_accuracy': final_epoch['val_accuracy'],
    'train_auc_pr': final_epoch['auc_pr'],
    'val_auc_pr': final_epoch['val_auc_pr'],
})

# -------------- Model 2: Monthly Updates (2016-2017) --------------
# Each iteration fine-tunes the previous month's weights on one calendar month
# of data, then scores the refreshed model on the two months that follow.
print("\nStarting Model 2: Monthly Updates...")

current_date = pd.Timestamp('2016-01-31')  # Month-end of the first update month
last_train_date = pd.Timestamp('2017-11-30')  # Month-end of the last update month

while current_date <= last_train_date:
    print(f"\nProcessing month ending {current_date.strftime('%Y-%m-%d')}...")

    # Calendar bounds of the month being trained on
    month_start = current_date.replace(day=1)  # First day of current month
    month_end = (month_start + pd.DateOffset(months=1) - pd.DateOffset(days=1))  # Last day of current month

    # Inclusive on both ends so orders dated on the 1st are not dropped
    train_mask = (df['order_date'] >= month_start) & (df['order_date'] <= month_end)
    train_data = df[train_mask]
    print(f"Training data period: {month_start.strftime('%Y-%m-%d')} to {month_end.strftime('%Y-%m-%d')}")
    print(f"Training data size: {len(train_data)}")

    # Prepare current month's training data
    X_train = train_data.drop(columns=['fraud', 'order_date'])
    y_train = train_data['fraud']

    # Split current month's data
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train,
        test_size=0.2,
        random_state=42
    )

    # Scale with the scaler fitted during initial training (it is never refitted)
    X_train_scaled = initial_scaler.transform(X_train)
    X_val_scaled = initial_scaler.transform(X_val)

    # Fresh model (lower learning rate for fine-tuning) seeded with last month's weights
    current_model = create_model(learning_rate=0.0001)
    current_model.set_weights(previous_weights)

    # Train on current month
    history = current_model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=3,
        batch_size=256,
        verbose=1
    )

    # Store the model and carry its weights forward to the next month
    models[current_date] = current_model
    previous_weights = current_model.get_weights()
    print(f"Storing weights from {current_date.strftime('%Y-%m')} model for next month's training")

    # Store training metrics (last epoch only)
    model2_train_results.append({
        'date': current_date,
        'train_loss': history.history['loss'][-1],
        'val_loss': history.history['val_loss'][-1],
        'train_accuracy': history.history['accuracy'][-1],
        'val_accuracy': history.history['val_accuracy'][-1],
        'train_auc_pr': history.history['auc_pr'][-1],
        'val_auc_pr': history.history['val_auc_pr'][-1]
    })

    # -------------- Testing on Next Two Months --------------
    # Window starts the day after the training month ends and covers two
    # calendar months, so it never overlaps the data just trained on
    test_start = current_date + pd.DateOffset(days=1)
    test_end = test_start + pd.DateOffset(months=2) - pd.DateOffset(days=1)

    test_mask = (df['order_date'] >= test_start) & (df['order_date'] <= test_end)
    test_data = df[test_mask]

    print(f"Testing period: {test_start.strftime('%Y-%m-%d')} to {test_end.strftime('%Y-%m-%d')}")
    print(f"Test data size: {len(test_data)}")

    if len(test_data) > 0:
        # Prepare and scale test data with the same initial scaler
        X_test = test_data.drop(columns=['fraud', 'order_date'])
        y_test = test_data['fraud']
        X_test_scaled = initial_scaler.transform(X_test)

        # The single sigmoid output yields an (n, 1) array; flatten it so the
        # sklearn metric functions receive 1-D scores
        y_pred_proba = current_model.predict(X_test_scaled, verbose=0).ravel()

        # Flag the highest-scoring ~1% of the window as fraud
        threshold = np.quantile(y_pred_proba, 0.99)  # Top 1%
        y_pred = (y_pred_proba >= threshold).astype(int)
        print(f"Threshold value for this window: {threshold}")

        # Calculate metrics
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precisions, recalls, _ = precision_recall_curve(y_test, y_pred_proba)
        pr_auc = auc(recalls, precisions)

        # Store results
        model2_results.append({
            'date': current_date,
            'model': 'Incremental',
            'window': f"{test_start.strftime('%Y-%m')} to {test_end.strftime('%Y-%m')}",
            'PR-AUC': pr_auc,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision,
            'Recall': recall,
            'threshold': threshold
        })

    # Move to the next month-end. DateOffset(months=1) would clamp Jan 31 to
    # Feb 29 and then stay on the 29th/28th of every later month, pushing the
    # test window back into the training month; MonthEnd always lands on the
    # last day of the next month.
    current_date = current_date + pd.offsets.MonthEnd(1)

# -------------- Results Analysis --------------
# Tabulate the per-month training records and the per-window test records
model2_train_results_df = pd.DataFrame(model2_train_results)
model2_results_df = pd.DataFrame(model2_results)

print("\nTraining Metrics:")
print(model2_train_results_df)

print("\nTest Metrics for each window:")
print(model2_results_df.loc[:, ['window', 'PR-AUC', 'Accuracy', 'Precision', 'Recall']])

print("\nAverage Test Metrics:")
print(model2_results_df.loc[:, ['PR-AUC', 'Accuracy', 'Precision', 'Recall']].mean())

# 2x2 grid: one panel per test metric, x-axis labelled with the window names
plt.figure(figsize=(15, 10))
metrics = ['PR-AUC', 'Accuracy', 'Precision', 'Recall']
window_positions = range(len(model2_results_df))
for panel, metric in enumerate(metrics, start=1):
    plt.subplot(2, 2, panel)
    plt.plot(
        window_positions,
        model2_results_df[metric],
        label='Incremental Model',
        color='green',
    )
    plt.title(f'{metric} Over Time')
    plt.xlabel('Test Windows')
    plt.ylabel(metric)
    plt.legend()

    plt.xticks(
        window_positions,
        model2_results_df['window'],
        rotation=90,
        ha='right',
    )

    plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Write both tables to CSV
model2_train_results_df.to_csv('model2_training_metrics.csv', index=False)
model2_results_df.to_csv('model2_test_metrics.csv', index=False)

# Bundle the incremental model's results for the comparison step
model2_final_results = dict(
    model_name='Incremental',
    train_metrics=model2_train_results_df,
    test_metrics=model2_results_df,
)

# np.save pickles the dict; it is read back with np.load(..., allow_pickle=True).item()
np.save('model2_results.npy', model2_final_results, allow_pickle=True)





# Model 3

# Key dates (same values as in the Model 1 and Model 2 sections)
initial_start = pd.Timestamp(year=2015, month=1, day=1)
initial_end = pd.Timestamp(year=2015, month=12, day=31)
training_end = pd.Timestamp(year=2017, month=11, day=30)
eval_end = pd.Timestamp(year=2018, month=1, day=31)

def create_model(learning_rate=0.001):
    """Build and compile the classifier retrained from scratch each month.

    Layout: 39 input features, then Dense(64) + Dropout(0.3), Dense(32) +
    Dropout(0.2), Dense(16) + Dropout(0.2), all ReLU, and a single sigmoid
    output unit.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    classifier = keras.Sequential([
        keras.layers.Input(shape=(39,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, activation='sigmoid'),
    ])

    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    precision_metric = tf.keras.metrics.Precision(name='precision')
    recall_metric = tf.keras.metrics.Recall(name='recall')
    pr_auc_metric = tf.keras.metrics.AUC(curve='PR', name='auc_pr')

    classifier.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', precision_metric, recall_metric, pr_auc_metric],
    )
    return classifier


# -------------- Model 3: Dynamic Full train Model (Training) --------------
print("\nStarting Model 3 (Dynamic Full_train - Training)...")

# Containers: one model per training cutoff, a shared scaler, and metric logs
models = {}
scaler = StandardScaler()
model3_train_results = []
training_histories = []  # Full Keras History objects, one per cutoff

# The loop variable is the start of the test window each model will serve
current_date = pd.Timestamp('2016-02-01')  # Start from Feb 2016
last_valid_start = pd.Timestamp('2017-12-01')  # End at Dec 2017


while current_date <= last_valid_start:
    # Train on everything from the start of 2015 up to the day before current_date
    train_start = pd.Timestamp('2015-01-01')
    train_end = current_date - pd.DateOffset(days=1)

    train_mask = df['order_date'].between(train_start, train_end)
    train_data = df[train_mask]
    print(f"\nRetraining Model 3 for data up to {train_end.strftime('%Y-%m-%d')}...")
    print(f"Training data size: {len(train_data)}")

    # Separate the label from the features; the date column is not a feature
    y_train = train_data['fraud']
    X_train = train_data.drop(columns=['fraud', 'order_date'])

    # 80/20 train/validation split with a fixed seed
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=42,
    )

    # The scaler is fitted on the first window only and reused unchanged afterwards
    if current_date == pd.Timestamp('2016-02-01'):
        scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # A brand-new model is trained from scratch for every cutoff
    current_model = create_model()
    history = current_model.fit(
        X_train_scaled,
        y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=10,
        batch_size=512,
        verbose=1,
    )

    # Index the model by its training cutoff so the test loop can look it up
    models[train_end] = current_model
    training_histories.append({'window_end': train_end, 'history': history})

    # Record the last epoch's value of each tracked curve
    last_epoch = {name: values[-1] for name, values in history.history.items()}
    model3_train_results.append({
        'date': train_end,
        'train_loss': last_epoch['loss'],
        'val_loss': last_epoch['val_loss'],
        'train_accuracy': last_epoch['accuracy'],
        'val_accuracy': last_epoch['val_accuracy'],
        'train_auc_pr': last_epoch['auc_pr'],
        'val_auc_pr': last_epoch['val_auc_pr'],
    })

    current_date += pd.DateOffset(months=1)

# Tabulate and show the per-cutoff training metrics
model3_train_results_df = pd.DataFrame(model3_train_results)
print("\nTraining Metrics (Model 3):")
print(model3_train_results_df)



# -------------- Model 3: Testing on Rolling Windows --------------
# Each window is scored by the model whose training data ended the day before
# the window starts; windows span two calendar months and advance monthly.
print("\nStarting Model 3 Testing...")

model3_results = []
current_date = pd.Timestamp('2016-02-01')

while current_date <= last_valid_start:
    # Window runs from current_date through the last day of the following month
    test_start = current_date
    test_end = test_start + pd.offsets.MonthEnd(2)

    # Look up the model by the same cutoff key the training loop stored it under
    train_end = current_date - pd.DateOffset(days=1)
    current_model = models[train_end]

    # Inclusive on both ends so orders dated on the window's first day are
    # scored too (matches the >= bound used for the incremental model's windows)
    test_mask = (df['order_date'] >= test_start) & (df['order_date'] <= test_end)
    test_data = df[test_mask]

    if len(test_data) > 0:
        X_test = test_data.drop(columns=['fraud', 'order_date'])
        y_test = test_data['fraud']
        X_test_scaled = scaler.transform(X_test)

        # The single sigmoid output yields an (n, 1) array; flatten it so the
        # sklearn metric functions receive 1-D scores
        y_pred_proba = current_model.predict(X_test_scaled, verbose=0).ravel()

        # Flag the highest-scoring ~1% of the window as fraud
        threshold = np.quantile(y_pred_proba, 0.99)  # Top 1%
        y_pred = (y_pred_proba >= threshold).astype(int)
        print(f"Threshold value for this window: {threshold}")

        # Thresholded precision/recall plus threshold-free PR-AUC
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precisions, recalls, _ = precision_recall_curve(y_test, y_pred_proba)
        pr_auc = auc(recalls, precisions)

        model3_results.append({
            'date': current_date,
            'model': 'Full_train',
            'window': f"{test_start.strftime('%Y-%m')} to {test_end.strftime('%Y-%m')}",
            'PR-AUC': pr_auc,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision,
            'Recall': recall,
            'threshold': threshold
        })

    # current_date is always the 1st of a month, so a one-month offset is exact
    current_date += pd.DateOffset(months=1)

# Tabulate the per-window test records and print them with their column means
model3_results_df = pd.DataFrame(model3_results)
print("\nMetrics for each test window (Full_train Model):")
print(model3_results_df.loc[:, ['window', 'PR-AUC', 'Accuracy', 'Precision', 'Recall']])

print("\nAverage metrics (Full_train Model):")
average_metrics = model3_results_df.loc[:, ['PR-AUC', 'Accuracy', 'Precision', 'Recall']].mean()
print(average_metrics)

# 2x2 grid: one panel per test metric, x-axis labelled with the window names
metrics = ['PR-AUC', 'Accuracy', 'Precision', 'Recall']
plt.figure(figsize=(12, 8))
window_positions = range(len(model3_results_df))

for panel, metric in enumerate(metrics, start=1):
    plt.subplot(2, 2, panel)
    plt.plot(
        window_positions,
        model3_results_df[metric],
        label='Full_train Model',
        color='blue',
    )
    plt.title(f'{metric} Over Time')
    plt.xlabel('Test Windows')
    plt.ylabel(metric)
    plt.legend()
    plt.xticks(
        window_positions,
        model3_results_df['window'],
        rotation=90,
        ha='right',
    )
    plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

# Write both tables to CSV
model3_train_results_df.to_csv('model3_Full_train Model_training_metrics.csv', index=False)
model3_results_df.to_csv('model3_Full_train Model_test_metrics.csv', index=False)

# Bundle the full-retrain model's results for the comparison step
model3_final_results = dict(
    model_name='Full_train Model',
    train_metrics=model3_train_results_df,
    test_metrics=model3_results_df,
)

# np.save pickles the dict; it is read back with np.load(..., allow_pickle=True).item()
np.save('model3_results.npy', model3_final_results, allow_pickle=True)





# Reload the result bundles saved by the three model sections
model1_results = np.load('model1_results.npy', allow_pickle=True).item()
model2_results = np.load('model2_results.npy', allow_pickle=True).item()
model3_results = np.load('model3_results.npy', allow_pickle=True).item()

# Per-window test tables keyed by the display name of each model
test_metrics = {
    'Static': model1_results['test_metrics'],
    'Incremental': model2_results['test_metrics'],
    'Full Training': model3_results['test_metrics'],
}

# Mean of every metric over all windows, per model
avg_metrics = {
    model_name: {
        column: frame[column].mean()
        for column in ('PR-AUC', 'Accuracy', 'Precision', 'Recall')
    }
    for model_name, frame in test_metrics.items()
}

# Metrics as rows, models as columns
avg_metrics_df = pd.DataFrame(avg_metrics).round(4)
print("\nAverage Metrics Across All Test Windows:")
print(avg_metrics_df)

# Side-by-side table for every window; window names are taken from the static model
print("\nDetailed Metrics for Each Test Window:")
metrics = ['PR-AUC', 'Accuracy', 'Precision', 'Recall']

for window_idx, window_label in enumerate(test_metrics['Static']['window']):
    print(f"\nWindow {window_idx + 1}: {window_label}")
    window_metrics = {
        model_name: {column: results.iloc[window_idx][column] for column in metrics}
        for model_name, results in test_metrics.items()
    }
    window_df = pd.DataFrame(window_metrics).round(4)
    print(window_df)

# For each metric, the model with the highest average (first one wins ties)
best_models = {
    metric: max(avg_metrics, key=lambda name: avg_metrics[name][metric])
    for metric in metrics
}

print("\nBest Performing Models:")
for metric, model in best_models.items():
    print(f"{metric}: {model} ({avg_metrics[model][metric]:.4f})")
    
    
    
    
    
    # Load results
# Reload the result bundles saved by the three model sections. They were
# written with np.save + pickle, so only load files produced by this notebook.
model1_results = np.load('model1_results.npy', allow_pickle=True).item()
model2_results = np.load('model2_results.npy', allow_pickle=True).item()
model3_results = np.load('model3_results.npy', allow_pickle=True).item()

# Per-window test tables keyed by the display name of each model
test_metrics = {
    'Static': model1_results['test_metrics'],
    'Incremental': model2_results['test_metrics'],
    'Full Training': model3_results['test_metrics']
}

metrics = ['PR-AUC', 'Accuracy', 'Precision', 'Recall']
colors = ['blue', 'green', 'orange']

# Take tick labels from the model with the most windows so every plotted point
# has a label, instead of relying on whichever model the inner loop drew last
window_labels = max(test_metrics.values(), key=len)['window'].tolist()

# 2x2 grid: one panel per metric, one line per model
fig = plt.figure(figsize=(20, 15))

for i, metric in enumerate(metrics):
    plt.subplot(2, 2, i + 1)

    for (model_name, results), color in zip(test_metrics.items(), colors):
        plt.plot(range(len(results)), results[metric],
                 label=model_name, color=color, linewidth=2)

    plt.title(f'{metric} Over Time', fontsize=14, pad=20)
    plt.xlabel('Test Windows', fontsize=12)
    plt.ylabel(metric, fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.3)

    # Format x-axis labels
    plt.xticks(range(len(window_labels)), window_labels,
               rotation=90, ha='right', fontsize=8)

# Adjust spacing
plt.tight_layout(pad=3.0)

# Save BEFORE showing: once plt.show() returns the figure may already be
# closed, in which case a later savefig writes a blank image
fig.savefig('model_comparison_plots.png', dpi=300, bbox_inches='tight')
plt.show()

# Captured notebook output: NameError: name 'pd' is not defined