# 🔶 Blood Sample Classification Model with XGBoost

This notebook implements a robust classification model for blood sample analysis, focusing on simplicity and accuracy.

## 🔶 Step 1: Data Collection and Setup

First, we'll load the necessary libraries and datasets.

In [None]:
# Essential imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Read the training and test CSVs (paths are relative to the notebook's folder)
print("Loading datasets...")
train_csv = "../Data/Blood_samples_dataset_balanced_2(f).csv"
test_csv = "../Data/blood_samples_dataset_test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Report (rows, columns) of each frame as a quick sanity check
print(f"Training data shape: {train.shape}")
print(f"Testing data shape: {test.shape}")

## 🔶 Step 2: Data Exploration

Let's explore the data to understand its structure and detect any issues.

In [None]:
# Exploratory overview: every section prints a banner followed by one summary.

# Number of training rows per Disease class
print("\n=== Target Variable Distribution ===")
disease_counts = train['Disease'].value_counts()
print(disease_counts)

# Total number of missing cells (summed over all columns) in each dataset
print("\n=== Missing Values in Train ===")
train_na_total = train.isna().sum().sum()
print(train_na_total)

print("\n=== Missing Values in Test ===")
test_na_total = test.isna().sum().sum()
print(test_na_total)

# Column dtypes, to spot features that were not parsed as numbers
print("\n=== Data Types ===")
column_dtypes = train.dtypes
print(column_dtypes)

# describe() output transposed so that each feature occupies one row
print("\n=== Statistical Summary ===")
summary_by_feature = train.describe().transpose()
print(summary_by_feature)

# First three rows of the raw training data
print("\n=== Sample Data ===")
first_rows = train.head(n=3)
print(first_rows)

## 🔶 Step 3: Data Preprocessing

We'll prepare the data for modeling by:
1. Separating the features from the target column (`Disease`)
2. Encoding the target labels as integers
3. Checking that the test set has the same feature columns as the training set
4. Scaling the features

In [None]:
# Feature preparation: split off the target, align the test features with the
# training features, encode the labels and standardise the feature matrix.
print("Preparing features and target...")

# Separate features and target for train data
X = train.drop(columns=['Disease'])
y = train['Disease']

# The test file may or may not include the label column; drop it only if it is
# present. drop() returns a new frame either way, so `test` is left untouched.
X_test = test.drop(columns=['Disease'], errors='ignore')

# Check feature consistency between train and test
missing_cols = set(X.columns) - set(X_test.columns)
extra_cols = set(X_test.columns) - set(X.columns)

if missing_cols:
    # The scaler below is fitted on every training feature, so the test set
    # cannot be transformed without them. Fail here with a clear message
    # rather than later inside scaler.transform().
    raise ValueError(
        f"Test data missing columns: {sorted(missing_cols, key=str)}"
    )
if extra_cols:
    print(f"Warning: Test data has extra columns: {extra_cols}")

# Select the training columns in training order. This drops any extra columns
# and guarantees that column i of X_test is the same feature as column i of X;
# the scaled arrays passed on from here are matched by position only.
X_test = X_test[X.columns]
assert list(X_test.columns) == list(X.columns), "train/test feature mismatch"

# Encode target variable as integers 0..n_classes-1
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Classes:", label_encoder.classes_)

# Scale features: statistics are learned from the training features only and
# then re-applied unchanged to the test features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

print(f"Prepared {X_scaled.shape[1]} features for training")
print("Target distribution:")
for i, label in enumerate(label_encoder.classes_):
    # Encoded value i corresponds to label_encoder.classes_[i]
    count = (y_encoded == i).sum()
    print(f"  {label}: {count} ({count/len(y_encoded)*100:.1f}%)")

## 🔶 Step 4: Model Training

We'll use XGBoost, a powerful gradient boosting algorithm known for its effectiveness in classification tasks.

In [None]:
# Split data for validation (75% train / 25% validation)
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y_encoded,
    test_size=0.25,
    random_state=42,
    stratify=y_encoded  # Maintain class distribution
)

print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}")

# Hyperparameters shared by the early-stopped model and the cross-validation
# model, kept in one place so the two cannot drift apart.
xgb_params = dict(
    n_estimators=200,            # Upper bound on the number of trees
    learning_rate=0.1,           # Learning rate
    max_depth=6,                 # Maximum tree depth
    min_child_weight=1,          # Minimum sum of instance weight needed in a child
    subsample=0.8,               # Subsample ratio of training instances
    colsample_bytree=0.8,        # Subsample ratio of columns for each tree
    objective='multi:softprob',  # Multi-class probability
    random_state=42,
    eval_metric='mlogloss'       # Metric monitored on the evaluation set
)

# Train XGBoost classifier with early stopping on the validation set.
# early_stopping_rounds is an estimator parameter: passing it to fit() has
# been deprecated since XGBoost 1.6 and is rejected by recent releases.
# use_label_encoder is obsolete and is no longer passed; the labels are
# already integer-encoded.
print("\nTraining XGBoost classifier...")
xgb_model = XGBClassifier(**xgb_params, early_stopping_rounds=20)

eval_set = [(X_val, y_val)]
xgb_model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=True
)

# Evaluate on validation set.
# NOTE: the same split also chose the stopping point, so this score is
# somewhat optimistic; the cross-validation score below is the safer estimate.
val_preds = xgb_model.predict(X_val)
accuracy = accuracy_score(y_val, val_preds)
print(f"\nValidation Accuracy: {accuracy:.4f}")

# Show detailed classification report
print("\nClassification Report:")
print(classification_report(y_val, val_preds, target_names=label_encoder.classes_))

# Cross-validation to ensure model stability.
# cross_val_score calls fit() without an eval_set, so it needs an estimator
# that does not use early stopping: each fold trains the full n_estimators.
cv_model = XGBClassifier(**xgb_params)
cv_scores = cross_val_score(
    cv_model, X_scaled, y_encoded,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy'
)
print(f"\nCross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

## 🔶 Step 5: Feature Importance

Let's examine which features are most important for our model.

In [None]:
# Rank features by the fitted model's importance scores.
importance = xgb_model.feature_importances_
# argsort is ascending, so the slice holds the (up to) 20 highest-scoring
# features with the most important one LAST.
indices = np.argsort(importance)[-20:]

# Horizontal bar chart; ascending order puts the most important bar on top.
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = range(len(indices))
ax.barh(bar_positions, importance[indices])
ax.set_yticks(bar_positions)
ax.set_yticklabels([X.columns[i] for i in indices])
ax.set_title('Top 20 Most Important Features')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

# Print top features, most important first (reverse the ascending order so
# the listing reads as a ranking).
print("\nTop 10 Most Important Features:")
for i in indices[::-1][:10]:
    print(f"{X.columns[i]}: {importance[i]:.4f}")

## 🔶 Step 6: Generate Predictions and Submission File

Finally, we'll make predictions on the test data and create a submission file.

In [None]:
# Generate predictions on test data: class probabilities and hard labels
print("Generating predictions on test data...")
test_preds_prob = xgb_model.predict_proba(X_test_scaled)
test_preds = xgb_model.predict(X_test_scaled)

# Decode predictions to original labels
test_preds_labels = label_encoder.inverse_transform(test_preds)

# Confidence = highest class probability for each row
prediction_confidence = np.max(test_preds_prob, axis=1)

# Create submission dataframe; ids are 1-based row positions in the test file
submission = pd.DataFrame({
    'id': range(1, len(test_preds_labels) + 1),
    'label': test_preds_labels
})

# Create outputs directory if it doesn't exist
import os
os.makedirs("../outputs", exist_ok=True)

# Save submission file
submission.to_csv("../outputs/submission_xgboost.csv", index=False)
print(f"Submission file saved with {len(submission)} predictions")

# Print prediction distribution
print("\nPrediction Distribution:")
pred_counts = pd.Series(test_preds_labels).value_counts()
for label, count in pred_counts.items():
    print(f"  {label}: {count} ({count/len(test_preds_labels)*100:.1f}%)")

# Print confidence statistics
print("\nConfidence Statistics:")
print(f"  Mean: {prediction_confidence.mean():.4f}")
print(f"  Min: {prediction_confidence.min():.4f}")
print(f"  Max: {prediction_confidence.max():.4f}")

# Display sample predictions.
# Size the preview from the data: a hard-coded range(1, 11) makes the
# DataFrame constructor fail with a length mismatch when the test set has
# fewer than 10 rows.
n_preview = min(10, len(test_preds_labels))
print("\nSample Predictions (first 10):")
sample_df = pd.DataFrame({
    'id': range(1, n_preview + 1),
    'predicted_label': test_preds_labels[:n_preview],
    'confidence': prediction_confidence[:n_preview]
})
print(sample_df)