In [1]:
# Cell 1: Load Data
import pandas as pd

# Read the labelled training set and the test set from the shared data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/Blood_samples_dataset_balanced_2(f).csv",
        "../Data/blood_samples_dataset_test.csv",
    )
)

# Report (rows, columns) for both frames, then list the training columns.
print("Training data shape:", train.shape)
print("Test data shape:", test.shape)
print("\nColumn names:", list(train.columns), sep="\n")

Training data shape: (2351, 25)
Test data shape: (486, 25)

Column names:
['Glucose', 'Cholesterol', 'Hemoglobin', 'Platelets', 'White Blood Cells', 'Red Blood Cells', 'Hematocrit', 'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin', 'Mean Corpuscular Hemoglobin Concentration', 'Insulin', 'BMI', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Triglycerides', 'HbA1c', 'LDL Cholesterol', 'HDL Cholesterol', 'ALT', 'AST', 'Heart Rate', 'Creatinine', 'Troponin', 'C-reactive Protein', 'Disease']


In [2]:
# Cell 2: Explore Data
# `DataFrame.info()` writes its own report, so it is handled separately from
# the summaries below, which are objects that need an explicit print.
print("=== Training Data Info ===")
train.info()

# (header, producer) pairs; producers are called lazily so each summary is
# computed right before it is printed, in the listed order.
summary_sections = (
    ("\n=== Training Data Description ===", lambda: train.describe()),
    ("\n=== Missing Values ===", lambda: train.isnull().sum()),
    ("\n=== Target Distribution ===", lambda: train['Disease'].value_counts()),
    ("\n=== Sample of training data ===", lambda: train.head()),
)
for section_header, build_summary in summary_sections:
    print(section_header)
    print(build_summary())

=== Training Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 25 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Glucose                                    2351 non-null   float64
 1   Cholesterol                                2351 non-null   float64
 2   Hemoglobin                                 2351 non-null   float64
 3   Platelets                                  2351 non-null   float64
 4   White Blood Cells                          2351 non-null   float64
 5   Red Blood Cells                            2351 non-null   float64
 6   Hematocrit                                 2351 non-null   float64
 7   Mean Corpuscular Volume                    2351 non-null   float64
 8   Mean Corpuscular Hemoglobin                2351 non-null   float64
 9   Mean Corpuscular Hemoglobin Concentration  2351 non-null   float64
 1

In [3]:
# Cell 3: Enhanced Data Preprocessing and Feature Engineering
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
import numpy as np

# Small constant added to every ratio denominator to avoid division by zero.
RATIO_EPSILON = 1e-8

# Columns that additionally receive a squared term.
key_features = ['Glucose', 'Cholesterol', 'HbA1c', 'BMI']


def add_engineered_features(frame):
    """Return a copy of ``frame`` with the engineered columns appended.

    The same function is applied to the training and the test features so the
    two frames always carry identical columns in identical order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame containing at least the columns 'Glucose', 'HbA1c',
        'Cholesterol', 'HDL Cholesterol', 'BMI', 'Insulin', 'Red Blood Cells',
        'Hemoglobin' and 'Systolic Blood Pressure'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with 4 ratio columns,
        one ``<name>_squared`` column per entry of ``key_features`` and
        2 interaction columns appended, in that order.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``frame``.
    """
    enhanced = frame.copy()

    # 1. Ratio features
    enhanced['Glucose_to_HbA1c'] = enhanced['Glucose'] / (enhanced['HbA1c'] + RATIO_EPSILON)
    enhanced['Cholesterol_to_HDL'] = enhanced['Cholesterol'] / (enhanced['HDL Cholesterol'] + RATIO_EPSILON)
    enhanced['BMI_to_Insulin'] = enhanced['BMI'] / (enhanced['Insulin'] + RATIO_EPSILON)
    enhanced['RBC_to_Hemoglobin'] = enhanced['Red Blood Cells'] / (enhanced['Hemoglobin'] + RATIO_EPSILON)

    # 2. Polynomial (squared) features for key variables
    for feature in key_features:
        enhanced[f'{feature}_squared'] = enhanced[feature] ** 2

    # 3. Interaction features
    enhanced['Glucose_BMI_interaction'] = enhanced['Glucose'] * enhanced['BMI']
    enhanced['Cholesterol_Blood_Pressure'] = enhanced['Cholesterol'] * enhanced['Systolic Blood Pressure']

    return enhanced


# Separate the target from the raw features
y = train['Disease']

# Encode categorical target variable as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Feature Engineering
print("=== Feature Engineering ===")

X = add_engineered_features(train.drop(columns=['Disease']))
# The test file may or may not carry a 'Disease' column; drop it only if present.
X_test = add_engineered_features(test.drop(columns=['Disease'], errors='ignore'))

# The scaler below is fit on X and applied to X_test, so both must line up.
assert list(X.columns) == list(X_test.columns), "train/test feature columns differ"

print(f"Original features: {len(train.columns) - 1}")
print(f"Enhanced features: {len(X.columns)}")

# Use RobustScaler instead of StandardScaler (more robust to outliers).
# Fit on the training features only, then reuse the fitted scaler for the test set.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

print("Features shape after scaling:", X_scaled.shape)
print("Test features shape after scaling:", X_test_scaled.shape)
print("Encoded target classes:", label_encoder.classes_)
print("Target distribution:", pd.Series(y_encoded).value_counts())

=== Feature Engineering ===
Original features: 24
Enhanced features: 34
Features shape after scaling: (2351, 34)
Test features shape after scaling: (486, 34)
Encoded target classes: ['Anemia' 'Diabetes' 'Healthy' 'Thalasse' 'Thromboc']
Target distribution: 0    623
2    556
1    540
3    509
4    123
Name: count, dtype: int64
Target distribution: 0    623
2    556
1    540
3    509
4    123
Name: count, dtype: int64


In [None]:
# Cell 4: Advanced Model Training with Ensemble and Hyperparameter Optimization
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings
# NOTE: this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

print("=== Advanced Model Training ===")

# Hold out 20% of the training data for validation, preserving class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# 1. Hyperparameter Optimization for LightGBM
# The grid has 3**6 = 729 combinations, i.e. 2187 fits with cv=3.
print("1. Optimizing LightGBM hyperparameters...")
lgb_params = {
    'n_estimators': [200, 300, 500],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2],
    'num_leaves': [31, 50, 70],
    'feature_fraction': [0.8, 0.9, 1.0],
    'bagging_fraction': [0.8, 0.9, 1.0]
}

# LightGBM only applies `bagging_fraction` when the bagging frequency is
# non-zero (see the LightGBM parameter docs). With the default frequency of 0
# the three `bagging_fraction` values above would train identical models, so
# row subsampling is enabled at every iteration here.
lgb_model = LGBMClassifier(random_state=42, verbose=-1, subsample_freq=1)
lgb_grid = GridSearchCV(
    lgb_model, lgb_params, cv=3, scoring='accuracy',
    n_jobs=-1, verbose=0
)
lgb_grid.fit(X_train, y_train)
best_lgb = lgb_grid.best_estimator_

print(f"Best LGB params: {lgb_grid.best_params_}")
print(f"Best LGB CV score: {lgb_grid.best_score_:.4f}")

# 2. Random Forest with optimization (24 combinations, 72 fits with cv=3)
print("\n2. Training optimized Random Forest...")
rf_params = {
    'n_estimators': [200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(
    rf_model, rf_params, cv=3, scoring='accuracy',
    n_jobs=-1, verbose=0
)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

print(f"Best RF params: {rf_grid.best_params_}")
print(f"Best RF CV score: {rf_grid.best_score_:.4f}")

# 3. Ensemble Model (Voting Classifier) combining the two tuned estimators
print("\n3. Creating ensemble model...")
ensemble_model = VotingClassifier(
    estimators=[
        ('lgb', best_lgb),
        ('rf', best_rf)
    ],
    voting='soft'  # Average the predicted class probabilities
)

# Fit on the 80% split only, so X_val stays unseen for step 5.
ensemble_model.fit(X_train, y_train)

# 4. Cross-validation evaluation over the full scaled training set
# NOTE(review): X_scaled comes from a scaler fit on all training rows and the
# hyper-parameters were tuned on X_train (a subset of X_scaled), so this
# estimate is likely somewhat optimistic.
print("\n4. Cross-validation evaluation...")
cv_scores = cross_val_score(
    ensemble_model, X_scaled, y_encoded,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy'
)

print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# 5. Validation performance on the held-out 20%
val_preds = ensemble_model.predict(X_val)
accuracy = accuracy_score(y_val, val_preds)
print(f"\nValidation Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, val_preds, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_preds))

# Store the best model
model = ensemble_model

=== Advanced Model Training ===
1. Optimizing LightGBM hyperparameters...


In [None]:
# Cell 5: Enhanced Feature Importance Analysis
import matplotlib.pyplot as plt
import seaborn as sns

print("=== Feature Importance Analysis ===")

# Random Forest importances are fractions that sum to 1, whereas LightGBM's
# default importances are raw split counts. Rescale LightGBM to fractions so
# the two are on the same scale before they are averaged and printed.
lgb_raw_importance = best_lgb.feature_importances_
lgb_total = lgb_raw_importance.sum()
lgb_importance = (
    lgb_raw_importance / lgb_total if lgb_total > 0 else lgb_raw_importance.astype(float)
)
rf_importance = best_rf.feature_importances_

# One row per feature, ordered by the mean of the two (comparable) importances.
feature_names = X.columns
importance_df = pd.DataFrame({
    'feature': feature_names,
    'lgb_importance': lgb_importance,
    'rf_importance': rf_importance
})
importance_df['avg_importance'] = (importance_df['lgb_importance'] + importance_df['rf_importance']) / 2
importance_df = importance_df.sort_values('avg_importance', ascending=False)

# Plot feature importance comparison
fig, axes = plt.subplots(2, 2, figsize=(20, 12))

# All three bar panels show the same features: the top 15 by average importance.
top_n = 15
top_lgb = importance_df.head(top_n)
bar_positions = range(len(top_lgb))

bar_panels = [
    (axes[0, 0], 'lgb_importance', 'LightGBM Feature Importance (Top 15)'),
    (axes[0, 1], 'rf_importance', 'Random Forest Feature Importance (Top 15)'),
    (axes[1, 0], 'avg_importance', 'Average Feature Importance (Top 15)'),
]
for ax, column, title in bar_panels:
    ax.barh(bar_positions, top_lgb[column])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_lgb['feature'])
    ax.invert_yaxis()  # highest-ranked feature at the top of the chart
    ax.set_title(title)
    ax.set_xlabel('Importance')

# Agreement between the two models, with a reference line from the origin
# to (max LightGBM importance, max Random Forest importance).
axes[1, 1].scatter(importance_df['lgb_importance'], importance_df['rf_importance'], alpha=0.6)
axes[1, 1].set_xlabel('LightGBM Importance')
axes[1, 1].set_ylabel('Random Forest Importance')
axes[1, 1].set_title('Feature Importance Correlation')
axes[1, 1].plot([0, importance_df['lgb_importance'].max()],
                [0, importance_df['rf_importance'].max()], 'r--', alpha=0.5)

plt.tight_layout()
plt.show()

# Print top features. The number shown is the rank after sorting, not the
# feature's position in the original column order.
print("\nTop 15 most important features (averaged):")
for rank, row in enumerate(top_lgb.itertuples(index=False), start=1):
    print(f"{rank:2d}. {row.feature:25s}: LGB={row.lgb_importance:6.3f}, RF={row.rf_importance:6.3f}, Avg={row.avg_importance:6.3f}")

# Show engineered vs original features performance.
# A feature is "original" when it is a column of the raw training frame;
# everything else in X was added by feature engineering.
raw_columns = set(train.columns)
original_features = [f for f in feature_names if f in raw_columns]
engineered_features = [f for f in feature_names if f not in raw_columns]

top_engineered = top_lgb[top_lgb['feature'].isin(engineered_features)]

print(f"\nEngineered features in top 15: {len(top_engineered)}")
print(f"Original features in top 15: {len(top_lgb) - len(top_engineered)}")

print("\nEngineered features in top 15:")
for engineered_name, engineered_avg in zip(top_engineered['feature'], top_engineered['avg_importance']):
    print(f"  - {engineered_name}: {engineered_avg:.3f}")

In [None]:
# Cell 6: Enhanced Submission with Confidence Analysis
import os

import numpy as np

print("=== Creating Enhanced Submission ===")

# Class probabilities and hard predictions for the scaled test features;
# hard predictions are mapped back to the original disease labels.
test_preds_proba = ensemble_model.predict_proba(X_test_scaled)
test_preds_encoded = ensemble_model.predict(X_test_scaled)
test_preds = label_encoder.inverse_transform(test_preds_encoded)

# Confidence of a prediction = its largest class probability.
prediction_confidence = test_preds_proba.max(axis=1)

# Main submission: 1-based ids plus the predicted 'label'.
submission = pd.DataFrame({
    'id': range(1, len(test_preds) + 1),
    'label': test_preds
})

# Same table with an extra per-row confidence column.
submission_with_confidence = submission.assign(confidence=prediction_confidence)

# Make sure the output folder exists before writing anything.
os.makedirs("../outputs", exist_ok=True)

# Main submission (id, label format)
submission.to_csv("../outputs/submission.csv", index=False)
print(f"Main submission saved! Shape: {submission.shape}")

# Enhanced submission with confidence
submission_with_confidence.to_csv("../outputs/submission_with_confidence.csv", index=False)
print("Enhanced submission with confidence scores saved!")

# Alternative delimiters of the main submission
for alt_path, alt_sep in (
    ("../outputs/submission_tab.csv", '\t'),
    ("../outputs/submission_space.txt", ' '),
):
    submission.to_csv(alt_path, index=False, sep=alt_sep)
print("Alternative formats saved (tab and space separated)")

# Analysis of predictions
print("\n=== Prediction Analysis ===")
print("First few predictions:")
print(submission.head(10))

print(f"\nPrediction distribution:")
pred_dist = submission['label'].value_counts()
print(pred_dist)

print(f"\nPrediction percentages:")
total_rows = len(submission)
for disease, count in pred_dist.items():
    percentage = (count / total_rows) * 100
    print(f"{disease}: {count} ({percentage:.1f}%)")

print(f"\nConfidence Statistics:")
print(f"Mean confidence: {prediction_confidence.mean():.3f}")
print(f"Min confidence: {prediction_confidence.min():.3f}")
print(f"Max confidence: {prediction_confidence.max():.3f}")
print(f"Std confidence: {prediction_confidence.std():.3f}")

# Rows whose top class probability falls below the threshold
low_confidence_threshold = 0.5
low_conf_mask = prediction_confidence < low_confidence_threshold
low_conf_count = int(low_conf_mask.sum())
print(f"\nPredictions with confidence < {low_confidence_threshold}: {low_conf_count} ({low_conf_count/len(submission)*100:.1f}%)")

if low_conf_count > 0:
    print("Sample low confidence predictions:")
    # Show at most the first five low-confidence rows (ids are 1-based).
    low_conf_idx = np.flatnonzero(low_conf_mask)[:5]
    for idx in low_conf_idx:
        print(f"ID {idx+1}: {test_preds[idx]} (confidence: {prediction_confidence[idx]:.3f})")

print(f"\n=== Model Performance Summary ===")
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"Validation accuracy: {accuracy:.4f}")
print(f"Total predictions: {len(submission)}")
print(f"Average prediction confidence: {prediction_confidence.mean():.3f}")