In [1]:
# Essential imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Load datasets
print("Loading datasets...")
train = pd.read_csv("../Data/Blood_samples_dataset_balanced_2(f).csv")
test = pd.read_csv("../Data/blood_samples_dataset_test.csv")

print(f"Training data shape: {train.shape}")
print(f"Testing data shape: {test.shape}")

Loading datasets...
Training data shape: (2351, 25)
Testing data shape: (486, 25)


In [2]:
# Data Exploration and Understanding
print("Training data info:")
print(train.info())
print("\nTraining data head:")
print(train.head())

print("\nTarget variable distribution:")
if 'Disease' in train.columns:
    target_col = 'Disease'
elif 'label' in train.columns:
    target_col = 'label'
else:
    # Find the target column (usually the last column or named appropriately)
    target_col = train.columns[-1]
    print(f"Assuming target column is: {target_col}")

print(train[target_col].value_counts())

print("\nMissing values:")
print(train.isnull().sum().sum())
print("\nTest data shape and missing values:")
print(f"Test shape: {test.shape}")
print(f"Test missing values: {test.isnull().sum().sum()}")

Training data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 25 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Glucose                                    2351 non-null   float64
 1   Cholesterol                                2351 non-null   float64
 2   Hemoglobin                                 2351 non-null   float64
 3   Platelets                                  2351 non-null   float64
 4   White Blood Cells                          2351 non-null   float64
 5   Red Blood Cells                            2351 non-null   float64
 6   Hematocrit                                 2351 non-null   float64
 7   Mean Corpuscular Volume                    2351 non-null   float64
 8   Mean Corpuscular Hemoglobin                2351 non-null   float64
 9   Mean Corpuscular Hemoglobin Concentration  2351 non-null   float64
 10  Insu

In [3]:
# Data Preprocessing
# Separate features and target
X = train.drop(target_col, axis=1)
y = train[target_col]

# Handle test data (assuming it doesn't have target column)
X_test = test.copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Test features shape: {X_test.shape}")

# Check if test data has target column and remove it
if target_col in X_test.columns:
    print(f"Removing {target_col} from test data")
    X_test = X_test.drop(target_col, axis=1)
    print(f"Test features shape after removing target: {X_test.shape}")

# Check feature alignment
print(f"\nTraining features: {list(X.columns)}")
print(f"Test features: {list(X_test.columns)}")

# Align features between train and test
missing_in_test = set(X.columns) - set(X_test.columns)
missing_in_train = set(X_test.columns) - set(X.columns)

if missing_in_test:
    print(f"Features missing in test: {missing_in_test}")
    # Add missing columns with zeros
    for col in missing_in_test:
        X_test[col] = 0

if missing_in_train:
    print(f"Features missing in train: {missing_in_train}")
    # Remove extra columns from test
    X_test = X_test.drop(columns=missing_in_train)

# Reorder columns to match training data
X_test = X_test[X.columns]

print(f"\nAligned shapes - Train: {X.shape}, Test: {X_test.shape}")

# Encode categorical target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"\nTarget classes: {label_encoder.classes_}")
print(f"Encoded target distribution: {np.bincount(y_encoded)}")

# Handle any remaining missing values
if X.isnull().sum().sum() > 0:
    print("Filling missing values in training data...")
    X = X.fillna(X.median())
    
if X_test.isnull().sum().sum() > 0:
    print("Filling missing values in test data...")
    X_test = X_test.fillna(X_test.median())

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

print(f"\nScaled features shape: {X_scaled.shape}")
print(f"Scaled test features shape: {X_test_scaled.shape}")

Features shape: (2351, 24)
Target shape: (2351,)
Test features shape: (486, 25)
Removing Disease from test data
Test features shape after removing target: (486, 24)

Training features: ['Glucose', 'Cholesterol', 'Hemoglobin', 'Platelets', 'White Blood Cells', 'Red Blood Cells', 'Hematocrit', 'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin', 'Mean Corpuscular Hemoglobin Concentration', 'Insulin', 'BMI', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Triglycerides', 'HbA1c', 'LDL Cholesterol', 'HDL Cholesterol', 'ALT', 'AST', 'Heart Rate', 'Creatinine', 'Troponin', 'C-reactive Protein']
Test features: ['Glucose', 'Cholesterol', 'Hemoglobin', 'Platelets', 'White Blood Cells', 'Red Blood Cells', 'Hematocrit', 'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin', 'Mean Corpuscular Hemoglobin Concentration', 'Insulin', 'BMI', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Triglycerides', 'HbA1c', 'LDL Cholesterol', 'HDL Cholesterol', 'ALT', 'AST', 'Heart Rate', 

In [4]:
# Step 1: Better Data Splitting and Validation
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

# Split your training data into train/validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_scaled, y_encoded, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_encoded
)

print(f"Training set: {X_train_split.shape[0]} samples")
print(f"Validation set: {X_val_split.shape[0]} samples")

Training set: 1880 samples
Validation set: 471 samples


In [5]:
# Step 2: Regularized XGBoost Model with Early Stopping
from xgboost import XGBClassifier

# More conservative hyperparameters to prevent overfitting
xgb_regularized = XGBClassifier(
    n_estimators=100,          # Reduced from 200
    learning_rate=0.05,        # Reduced from 0.1
    max_depth=4,               # Reduced from 6
    min_child_weight=3,        # Increased from 1
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,               # L1 regularization
    reg_lambda=1,              # L2 regularization
    gamma=1,                   # Minimum split loss
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
    use_label_encoder=False,
    early_stopping_rounds=20   # Stop if no improvement for 20 rounds
)

# Train with early stopping
xgb_regularized.fit(
    X_train_split, y_train_split,
    eval_set=[(X_val_split, y_val_split)],
    verbose=10
)

# Evaluate on validation set
val_preds = xgb_regularized.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, val_preds)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# Check training accuracy to see overfitting
train_preds = xgb_regularized.predict(X_train_split)
train_accuracy = accuracy_score(y_train_split, train_preds)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Overfitting Gap: {train_accuracy - val_accuracy:.4f}")

[0]	validation_0-mlogloss:1.50382
[10]	validation_0-mlogloss:0.83284
[20]	validation_0-mlogloss:0.51282
[30]	validation_0-mlogloss:0.32427
[40]	validation_0-mlogloss:0.21587
[50]	validation_0-mlogloss:0.14556
[60]	validation_0-mlogloss:0.10130
[70]	validation_0-mlogloss:0.07336
[80]	validation_0-mlogloss:0.05517
[90]	validation_0-mlogloss:0.04389
[99]	validation_0-mlogloss:0.03780

Validation Accuracy: 1.0000
Training Accuracy: 1.0000
Overfitting Gap: 0.0000


In [6]:
# Step 3: Cross-Validation for Better Evaluation
from sklearn.model_selection import cross_val_score

# Create a new XGBoost model without early stopping for cross-validation
xgb_cv = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    gamma=1,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
    use_label_encoder=False
    # No early_stopping_rounds for cross-validation
)

cv_scores = cross_val_score(
    xgb_cv, X_scaled, y_encoded, 
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy'
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

Cross-validation scores: [1. 1. 1. 1. 1.]
Mean CV accuracy: 1.0000 (+/- 0.0000)


In [7]:
# Step 4: Alternative - Random Forest (Often Less Prone to Overfitting)
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_split, y_train_split)

# Evaluate Random Forest
rf_val_preds = rf_model.predict(X_val_split)
rf_val_accuracy = accuracy_score(y_val_split, rf_val_preds)
rf_train_preds = rf_model.predict(X_train_split)
rf_train_accuracy = accuracy_score(y_train_split, rf_train_preds)

print(f"\nRandom Forest Results:")
print(f"Training Accuracy: {rf_train_accuracy:.4f}")
print(f"Validation Accuracy: {rf_val_accuracy:.4f}")
print(f"Overfitting Gap: {rf_train_accuracy - rf_val_accuracy:.4f}")


Random Forest Results:
Training Accuracy: 1.0000
Validation Accuracy: 1.0000
Overfitting Gap: 0.0000


In [8]:
# Step 5: Ensemble Method (Often More Robust)
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Create ensemble of different models
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Use the cross-validation XGBoost model (without early stopping) for ensemble
ensemble = VotingClassifier(
    estimators=[
        ('xgb', xgb_cv),  # Use xgb_cv instead of xgb_regularized
        ('rf', rf_model),
        ('lr', log_reg)
    ],
    voting='soft'  # Use probability averages
)

ensemble.fit(X_train_split, y_train_split)

# Evaluate ensemble
ensemble_val_preds = ensemble.predict(X_val_split)
ensemble_val_accuracy = accuracy_score(y_val_split, ensemble_val_preds)
print(f"\nEnsemble Validation Accuracy: {ensemble_val_accuracy:.4f}")


Ensemble Validation Accuracy: 1.0000


In [9]:
# Step 6: Feature Selection to Reduce Overfitting
from sklearn.feature_selection import SelectKBest, f_classif

# Select top K features
k_best = SelectKBest(score_func=f_classif, k=15)  # Select top 15 features
X_train_selected = k_best.fit_transform(X_train_split, y_train_split)
X_val_selected = k_best.transform(X_val_split)

# Train model on selected features
xgb_selected = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=3,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42,
    use_label_encoder=False
)

xgb_selected.fit(X_train_selected, y_train_split)

# Evaluate
val_preds_selected = xgb_selected.predict(X_val_selected)
val_acc_selected = accuracy_score(y_val_split, val_preds_selected)
print(f"\nFeature Selected Model Validation Accuracy: {val_acc_selected:.4f}")

# Show selected features
selected_features = X.columns[k_best.get_support()]
print(f"Selected features: {list(selected_features)}")


Feature Selected Model Validation Accuracy: 1.0000
Selected features: ['Glucose', 'Cholesterol', 'Platelets', 'White Blood Cells', 'Red Blood Cells', 'Hematocrit', 'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin', 'Insulin', 'BMI', 'HbA1c', 'HDL Cholesterol', 'ALT', 'Heart Rate', 'Troponin']


In [10]:
# Step 7: Generate Final Predictions with Best Model
# Choose the model with best validation performance and smallest overfitting gap

# Select the best model based on validation performance
models = {
    'XGBoost Regularized': (xgb_regularized, X_test_scaled),
    'Random Forest': (rf_model, X_test_scaled),
    'Ensemble': (ensemble, X_test_scaled),
    'Feature Selected': (xgb_selected, k_best.transform(X_test_scaled))
}

best_model_name = 'XGBoost Regularized'  # You can change this based on results
best_model, X_test_final = models[best_model_name]

print(f"Using {best_model_name} for final predictions")

# Generate predictions
final_preds = best_model.predict(X_test_final)
final_preds_labels = label_encoder.inverse_transform(final_preds)

# Create submission
submission = pd.DataFrame({
    'id': range(1, len(final_preds_labels) + 1),
    'label': final_preds_labels
})

submission.to_csv("../outputs/submission_regularized.csv", index=False)
print(f"Submission saved with validation accuracy: {val_accuracy:.4f}")

Using XGBoost Regularized for final predictions
Submission saved with validation accuracy: 1.0000


In [11]:
# Improved approach to fix overfitting
print("="*50)
print("ADDRESSING OVERFITTING ISSUES")
print("="*50)

# 1. First, let's check if there are any common IDs or clear data leakage between train and test
print("Checking for potential data leakage...")
if 'id' in train.columns and 'id' in test.columns:
    common_ids = set(train['id']) & set(test['id'])
    print(f"Number of common IDs between train and test: {len(common_ids)}")
    if len(common_ids) > 0:
        print("WARNING: Found common IDs between train and test - possible data leakage!")

# 2. Try more extreme regularization and simpler model
print("\nTraining with more extreme regularization...")
xgb_extreme_reg = XGBClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=5,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=10,
    reg_lambda=10,
    gamma=3,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
    use_label_encoder=False
)

# Train with the full training set (for final model)
xgb_extreme_reg.fit(X_scaled, y_encoded)

# 3. Create a new test-train split with shuffling to check if it's a data ordering issue
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_scaled, y_encoded, 
    test_size=0.2, 
    random_state=99,  # Different random state
    stratify=y_encoded,
    shuffle=True  # Explicitly shuffle
)

# Train a simpler model on this new split
xgb_simple = XGBClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=5,
    reg_alpha=10,
    reg_lambda=10,
    random_state=42,
    use_label_encoder=False
)

xgb_simple.fit(X_train_new, y_train_new)
val_preds_simple = xgb_simple.predict(X_val_new)
val_acc_simple = accuracy_score(y_val_new, val_preds_simple)
print(f"New validation split accuracy with simpler model: {val_acc_simple:.4f}")

# 4. Create a submission with the extremely regularized model
final_preds_extreme = xgb_extreme_reg.predict(X_test_scaled)
final_preds_labels_extreme = label_encoder.inverse_transform(final_preds_extreme)

submission_extreme = pd.DataFrame({
    'id': range(1, len(final_preds_labels_extreme) + 1),
    'label': final_preds_labels_extreme
})

submission_extreme.to_csv("../outputs/submission_extreme_reg.csv", index=False)
print(f"Created submission with extremely regularized model")

# 5. Check feature importance for potential problematic features
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_extreme_reg.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 important features:")
print(feature_importance.head(10))

# 6. Try using only the top important features
top_features = feature_importance.head(10)['feature'].tolist()
print(f"\nTraining with only top {len(top_features)} features...")

X_top = X[top_features]
X_test_top = X_test[top_features]

# Scale these features
scaler_top = StandardScaler()
X_top_scaled = scaler_top.fit_transform(X_top)
X_test_top_scaled = scaler_top.transform(X_test_top)

# Train a model with only top features
xgb_top = XGBClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=5,
    reg_alpha=10,
    reg_lambda=10,
    random_state=42,
    use_label_encoder=False
)

# Split with top features
X_train_top, X_val_top, y_train_top, y_val_top = train_test_split(
    X_top_scaled, y_encoded, 
    test_size=0.2, 
    random_state=101,
    stratify=y_encoded
)

xgb_top.fit(X_train_top, y_train_top)
val_preds_top = xgb_top.predict(X_val_top)
val_acc_top = accuracy_score(y_val_top, val_preds_top)
print(f"Validation accuracy with top features: {val_acc_top:.4f}")

# Make predictions with this model
final_preds_top = xgb_top.predict(X_test_top_scaled)
final_preds_labels_top = label_encoder.inverse_transform(final_preds_top)

submission_top = pd.DataFrame({
    'id': range(1, len(final_preds_labels_top) + 1),
    'label': final_preds_labels_top
})

submission_top.to_csv("../outputs/submission_top_features.csv", index=False)

ADDRESSING OVERFITTING ISSUES
Checking for potential data leakage...

Training with more extreme regularization...
New validation split accuracy with simpler model: 0.9427
Created submission with extremely regularized model

Top 10 important features:
                        feature  importance
8   Mean Corpuscular Hemoglobin    0.070073
6                    Hematocrit    0.065316
4             White Blood Cells    0.061285
17              HDL Cholesterol    0.059071
19                          AST    0.058462
22                     Troponin    0.052791
7       Mean Corpuscular Volume    0.050955
15                        HbA1c    0.047888
20                   Heart Rate    0.045035
3                     Platelets    0.041715

Training with only top 10 features...
Validation accuracy with top features: 0.9533


In [12]:
# Additional strategies to improve model generalization
print("="*50)
print("ADDITIONAL ANTI-OVERFITTING STRATEGIES")
print("="*50)

# 1. Data inspection - check for near-duplicates or unusual patterns
print("Checking for potential duplicates or unusual patterns...")
from sklearn.neighbors import NearestNeighbors

# Look for very similar samples using nearest neighbors
nn = NearestNeighbors(n_neighbors=2)
nn.fit(X_scaled)
distances, indices = nn.kneighbors(X_scaled)

# The first nearest neighbor is the point itself (distance=0)
# The second nearest is the closest different point
closest_distances = distances[:, 1]
very_close_samples = np.where(closest_distances < 0.1)[0]
print(f"Number of samples with very similar neighbors: {len(very_close_samples)}")

if len(very_close_samples) > 0:
    print("This could indicate near-duplicates in the training data")

# 2. Try a completely different model type
print("\nTrying a gradient boosting model with different implementation...")
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=5,
    subsample=0.7,
    random_state=42
)

X_train_gb, X_val_gb, y_train_gb, y_val_gb = train_test_split(
    X_scaled, y_encoded, 
    test_size=0.2, 
    random_state=123,
    stratify=y_encoded
)

gb_model.fit(X_train_gb, y_train_gb)
val_preds_gb = gb_model.predict(X_val_gb)
val_acc_gb = accuracy_score(y_val_gb, val_preds_gb)
print(f"Validation accuracy with GradientBoostingClassifier: {val_acc_gb:.4f}")

# Create a submission with this model
final_preds_gb = gb_model.predict(X_test_scaled)
final_preds_labels_gb = label_encoder.inverse_transform(final_preds_gb)

submission_gb = pd.DataFrame({
    'id': range(1, len(final_preds_labels_gb) + 1),
    'label': final_preds_labels_gb
})

submission_gb.to_csv("../outputs/submission_gb.csv", index=False)
print(f"Created submission with GradientBoostingClassifier")

# 3. Try data augmentation with noise to improve generalization
print("\nTrying data augmentation with noise...")
from sklearn.utils import resample

# Create a noisy version of the training data
np.random.seed(42)
noise_factor = 0.01  # Add 1% noise
X_noisy = X_scaled + np.random.normal(0, noise_factor, X_scaled.shape)

# Combine original and noisy data
X_augmented = np.vstack([X_scaled, X_noisy])
y_augmented = np.hstack([y_encoded, y_encoded])

# Train a model on the augmented data
xgb_augmented = XGBClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=5,
    reg_alpha=5,
    reg_lambda=5,
    gamma=1,
    random_state=42,
    use_label_encoder=False
)

# New validation split with augmented data
X_train_aug, X_val_aug, y_train_aug, y_val_aug = train_test_split(
    X_augmented, y_augmented, 
    test_size=0.1,  # Smaller validation set
    random_state=987,
    stratify=y_augmented
)

xgb_augmented.fit(X_train_aug, y_train_aug)

# Evaluate on original validation data to avoid noise in evaluation
X_pure_val, _, y_pure_val, _ = train_test_split(
    X_scaled, y_encoded, 
    test_size=0.9,  # Take 10% for clean validation
    random_state=987,
    stratify=y_encoded
)

val_preds_aug = xgb_augmented.predict(X_pure_val)
val_acc_aug = accuracy_score(y_pure_val, val_preds_aug)
print(f"Validation accuracy with augmented data: {val_acc_aug:.4f}")

# Create submission with augmented model
final_preds_aug = xgb_augmented.predict(X_test_scaled)
final_preds_labels_aug = label_encoder.inverse_transform(final_preds_aug)

submission_aug = pd.DataFrame({
    'id': range(1, len(final_preds_labels_aug) + 1),
    'label': final_preds_labels_aug
})

submission_aug.to_csv("../outputs/submission_augmented.csv", index=False)
print(f"Created submission with augmented data model")

print("\nYou now have multiple new submissions to try!")

ADDITIONAL ANTI-OVERFITTING STRATEGIES
Checking for potential duplicates or unusual patterns...
Number of samples with very similar neighbors: 2351
This could indicate near-duplicates in the training data

Trying a gradient boosting model with different implementation...
Validation accuracy with GradientBoostingClassifier: 0.9809
Created submission with GradientBoostingClassifier

Trying data augmentation with noise...
Validation accuracy with augmented data: 1.0000
Created submission with augmented data model

You now have multiple new submissions to try!


In [13]:
# Final approach: Handling dataset issues and creating robust models
print("="*50)
print("FINAL ROBUST MODELING APPROACH")
print("="*50)

# 1. Let's first check if there are any potential identifier columns or data leakage issues
print("Inspecting features for potential identifiers or leakage...")

# Calculate variance of each feature to find potential ID-like columns
feature_variance = pd.DataFrame({
    'feature': X.columns,
    'variance': np.var(X_scaled, axis=0)
}).sort_values('variance', ascending=True)

print("Features with lowest variance (potential constants or near-constants):")
print(feature_variance.head(5))

print("\nFeatures with highest variance (potential identifiers or noisy features):")
print(feature_variance.tail(5))

# 2. Create a robust ensemble of different model types with high regularization
print("\nCreating a robust ensemble of diverse models...")

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Very strong regularization for all models
logistic = LogisticRegression(C=0.1, max_iter=1000, random_state=42)
xgb_final = XGBClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=2,
    min_child_weight=10,
    subsample=0.6,
    colsample_bytree=0.6,
    reg_alpha=20,
    reg_lambda=20,
    random_state=42,
    use_label_encoder=False
)
rf_final = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    random_state=42
)
gb_final = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=2,
    min_samples_split=20,
    min_samples_leaf=10,
    subsample=0.6,
    random_state=42
)
gnb = GaussianNB()

# Create the ensemble with hard voting (majority rule)
# Hard voting is more robust to overfitting than soft voting
final_ensemble = VotingClassifier(
    estimators=[
        ('lr', logistic),
        ('xgb', xgb_final),
        ('rf', rf_final),
        ('gb', gb_final),
        ('gnb', gnb)
    ],
    voting='hard'
)

# 3. Train on a subset of features (removing potential problematic ones)
print("\nTraining with a curated subset of features...")

# Remove features with extremely high or low variance
low_var_features = feature_variance.head(3)['feature'].tolist()
high_var_features = feature_variance.tail(3)['feature'].tolist()
problematic_features = low_var_features + high_var_features

curated_features = [f for f in X.columns if f not in problematic_features]
print(f"Using {len(curated_features)} curated features: {curated_features}")

X_curated = X[curated_features]
X_test_curated = X_test[curated_features]

# Scale these features
scaler_curated = StandardScaler()
X_curated_scaled = scaler_curated.fit_transform(X_curated)
X_test_curated_scaled = scaler_curated.transform(X_test_curated)

# Train the ensemble on the curated features
final_ensemble.fit(X_curated_scaled, y_encoded)

# 4. Create a submission with this robust approach
final_preds_robust = final_ensemble.predict(X_test_curated_scaled)
final_preds_labels_robust = label_encoder.inverse_transform(final_preds_robust)

submission_robust = pd.DataFrame({
    'id': range(1, len(final_preds_labels_robust) + 1),
    'label': final_preds_labels_robust
})

submission_robust.to_csv("../outputs/submission_robust_ensemble.csv", index=False)
print(f"Created submission with robust ensemble on curated features")

# 5. One more attempt using only minimal set of the most important biological features
# These are features that should be most predictive for the actual diseases based on domain knowledge
print("\nCreating a final model using only key biological markers...")

key_markers = [
    'Hemoglobin',         # Key for Anemia and Thalassemia
    'Red Blood Cells',    # Key for Anemia and Thalassemia
    'White Blood Cells',  # Important for infections and immunity
    'Platelets',          # Key for Thrombocytopenia
    'Glucose',            # Key for Diabetes
    'HbA1c',              # Key for Diabetes
    'Insulin'             # Key for Diabetes
]

print(f"Using only {len(key_markers)} essential biological markers: {key_markers}")

X_key = X[key_markers]
X_test_key = X_test[key_markers]

# Scale these features
scaler_key = StandardScaler()
X_key_scaled = scaler_key.fit_transform(X_key)
X_test_key_scaled = scaler_key.transform(X_test_key)

# Create a simple model with these features
# Use GradientBoosting which showed good validation performance earlier
gb_key = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=5,
    subsample=0.7,
    random_state=42
)

gb_key.fit(X_key_scaled, y_encoded)

# Generate predictions
final_preds_key = gb_key.predict(X_test_key_scaled)
final_preds_labels_key = label_encoder.inverse_transform(final_preds_key)

submission_key = pd.DataFrame({
    'id': range(1, len(final_preds_labels_key) + 1),
    'label': final_preds_labels_key
})

submission_key.to_csv("../outputs/submission_key_markers.csv", index=False)
print(f"Created submission using only essential biological markers")

print("\nYou now have a total of 7 new submission files to try!")
print("1. submission_extreme_reg.csv - XGBoost with extreme regularization")
print("2. submission_top_features.csv - XGBoost with only top 10 important features")
print("3. submission_gb.csv - GradientBoostingClassifier model")
print("4. submission_augmented.csv - XGBoost with data augmentation")
print("5. submission_robust_ensemble.csv - Ensemble of diverse models on curated features")
print("6. submission_key_markers.csv - GradientBoosting with only essential biological markers")
print("7. Your original regularized model for comparison")

FINAL ROBUST MODELING APPROACH
Inspecting features for potential identifiers or leakage...
Features with lowest variance (potential constants or near-constants):
              feature  variance
21         Creatinine       1.0
20         Heart Rate       1.0
4   White Blood Cells       1.0
0             Glucose       1.0
19                AST       1.0

Features with highest variance (potential identifiers or noisy features):
                     feature  variance
13  Diastolic Blood Pressure       1.0
7    Mean Corpuscular Volume       1.0
15                     HbA1c       1.0
16           LDL Cholesterol       1.0
5            Red Blood Cells       1.0

Creating a robust ensemble of diverse models...

Training with a curated subset of features...
Using 18 curated features: ['Glucose', 'Cholesterol', 'Hemoglobin', 'Platelets', 'Hematocrit', 'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin', 'Mean Corpuscular Hemoglobin Concentration', 'Insulin', 'BMI', 'Systolic Blood Pressure'