# 🩺 AI-Powered Leukemia Detection Model
This notebook implements an AI-powered model to detect leukemia using machine learning techniques. It includes data preprocessing, model training, evaluation, and prediction functionalities.

# Standard Imports
This cell imports all the necessary libraries and modules required for data processing, visualization, and machine learning.

In [1]:
# Standard imports - DO NOT CHANGE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')

# Leukemia Detection Model Class
This cell defines the `LeukemiaDetectionModel` class, which includes methods for loading data, preprocessing, training models, and making predictions.

In [5]:
class LeukemiaDetectionModel:
    """Tabular leukemia-status classification pipeline.

    Every public stage returns ``self`` so calls can be chained. The intended
    order is ``load_data`` -> ``explore_data`` (optional) ->
    ``preprocess_data`` -> ``select_features`` (optional) ->
    ``train_models`` -> ``evaluate_models`` / ``predict_unlabeled_data``.
    """

    def __init__(self, target_column=None, id_column=None):
        """Initialize model with optional column detection

        Args:
            target_column: Name of the label column. When ``None`` it is
                auto-detected by ``load_data``.
            id_column: Name of a row-identifier column that must not be used
                as a feature. When ``None`` it is auto-detected if possible.
        """
        self.target_column = target_column
        self.id_column = id_column
        self.label_encoders = {}
        # Populated only when a non-numeric target has to be label-encoded.
        self.target_encoder = None
        self.scaler = StandardScaler()
        self.trained_models = None

    @staticmethod
    def _looks_like_id(name):
        """Return True when a column name contains ``id`` as a whole word."""
        # Split on every non-alphanumeric character so "Patient_ID" yields the
        # token "ID" while "Steroid_Use" or "Thyroid" yield no "id" token.
        tokens = ''.join(ch if ch.isalnum() else ' ' for ch in name).split()
        if any(token.lower() == 'id' for token in tokens):
            return True
        # camelCase suffix such as "patientId" / "patientID".
        return len(name) > 2 and name[-2:] in ('Id', 'ID') and name[-3].islower()

    def auto_detect_columns(self):
        """Automatically detect target and ID columns based on keywords

        The target is the first column whose name contains both "leukemia"
        and "status" (case-insensitive). The ID column is the first column,
        other than the target, whose name contains "id" as a separate word.

        Raises:
            ValueError: If no target column can be detected, or the
                configured target column is not present in the data.
        """
        if self.target_column is None:
            for col in self.df.columns:
                lowered = str(col).lower()
                if 'leukemia' in lowered and 'status' in lowered:
                    self.target_column = col
                    print(f"Detected target column: {self.target_column}")
                    break
            if self.target_column is None:
                raise ValueError("Unable to detect target column related to leukemia status. Please specify it explicitly.")

        if self.target_column not in self.df.columns:
            raise ValueError(f"Target column '{self.target_column}' not found in the data.")

        if self.id_column is None:
            for col in self.df.columns:
                # A plain substring test would also match "Steroid", "Fluid",
                # "Thyroid", ... and silently drop a real feature.
                if col != self.target_column and self._looks_like_id(str(col)):
                    self.id_column = col
                    print(f"Detected ID column: {self.id_column}")
                    break

    def load_data(self, data_path=None):
        """Load data and detect columns if necessary

        Args:
            data_path: Path to a CSV, Excel (.xlsx) or JSON file.

        Returns:
            self

        Raises:
            ValueError: If ``data_path`` is missing, has an unsupported
                extension, or the target column cannot be determined.
        """
        if data_path is None:
            raise ValueError("data_path must be provided")
        self.df = self._load_data_from_path(data_path)

        # Automatically detect target and ID columns if not provided
        self.auto_detect_columns()

        # Rows without a label cannot be used for supervised learning;
        # imputing them would invent a bogus class (a mean or 'Unknown').
        missing_target = self.df[self.target_column].isnull()
        if missing_target.any():
            print(f"Dropping {int(missing_target.sum())} rows with a missing target value")
            self.df = self.df.loc[~missing_target].reset_index(drop=True)

        # Handle remaining missing values: numeric columns get their mean,
        # everything still missing afterwards becomes 'Unknown'.
        if self.df.isnull().values.any():
            column_means = self.df.mean(numeric_only=True)
            self.df = self.df.fillna(column_means).fillna('Unknown')

        return self

    def _detect_feature_types(self):
        """Populate ``numerical_features`` / ``categorical_features`` from ``self.df``."""
        excluded = {self.target_column, self.id_column}
        numeric_cols = self.df.select_dtypes(include='number').columns
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns
        self.numerical_features = [col for col in numeric_cols if col not in excluded]
        self.categorical_features = [col for col in categorical_cols if col not in excluded]

    def explore_data(self, max_plots=6):
        """Explore data with automatic feature type detection

        Prints the class balance, label-encodes a non-numeric target in
        ``self.df`` (keeping the encoder in ``self.target_encoder``), records
        the numerical/categorical feature lists and plots up to ``max_plots``
        feature distributions plus a correlation matrix.

        Args:
            max_plots: Maximum number of numerical features to plot.

        Returns:
            self
        """
        print("\nClass Distribution for", self.target_column)
        print(self.df[self.target_column].value_counts(normalize=True))

        # Convert target to numerical if it's categorical
        if not pd.api.types.is_numeric_dtype(self.df[self.target_column]):
            self.target_encoder = LabelEncoder()
            self.df[self.target_column] = self.target_encoder.fit_transform(self.df[self.target_column])
            print("\nConverted target values:")
            for i, label in enumerate(self.target_encoder.classes_):
                print(f"{label} -> {i}")

        # Automatically identify numerical and categorical columns
        # (target and ID columns are excluded from both lists)
        self._detect_feature_types()

        print("\nNumerical features:", self.numerical_features)
        print("Categorical features:", self.categorical_features)

        # Plot distributions of numerical features
        n_plots = min(len(self.numerical_features), max_plots)
        if n_plots > 0:
            # Size the grid from the number of plots so max_plots > 6 works.
            n_cols = min(3, n_plots)
            n_rows = -(-n_plots // n_cols)  # ceiling division
            plt.figure(figsize=(15, 5 * n_rows))
            for i, feature in enumerate(self.numerical_features[:n_plots], 1):
                plt.subplot(n_rows, n_cols, i)
                sns.histplot(data=self.df, x=feature, hue=self.target_column, multiple="stack")
                plt.title(f'Distribution of {feature}')
            plt.tight_layout()
            plt.show()

        # Correlation matrix
        if len(self.numerical_features) > 0:
            numerical_df = self.df[self.numerical_features + [self.target_column]]
            plt.figure(figsize=(12, 8))
            sns.heatmap(numerical_df.corr(), annot=True, cmap='coolwarm', center=0)
            plt.title('Correlation Matrix')
            plt.show()

        return self

    def preprocess_data(self, test_size=0.2, random_state=42):
        """Preprocess data with automatic handling of different data types

        Label-encodes categorical features, splits into train/test sets and
        standardizes the features (scaler fitted on the training split only).

        Args:
            test_size: Fraction of rows held out for testing.
            random_state: Seed for the train/test split.

        Returns:
            self
        """
        # explore_data() normally sets the feature lists; derive them here so
        # the pipeline also works when exploration is skipped.
        if not hasattr(self, 'categorical_features'):
            self._detect_feature_types()

        df_processed = self.df.copy()

        # Encode categorical variables
        for feature in self.categorical_features:
            self.label_encoders[feature] = LabelEncoder()
            df_processed[feature] = self.label_encoders[feature].fit_transform(df_processed[feature])

        # A non-numeric target is still possible when explore_data() was skipped.
        if not pd.api.types.is_numeric_dtype(df_processed[self.target_column]):
            self.target_encoder = LabelEncoder()
            df_processed[self.target_column] = self.target_encoder.fit_transform(
                df_processed[self.target_column])

        # Prepare features and target
        exclude_cols = [col for col in [self.target_column, self.id_column] if col is not None]
        feature_frame = df_processed.drop(exclude_cols, axis=1)
        X = feature_frame.values
        y = df_processed[self.target_column].values

        # Names of the columns currently held in X_train / X_test, in order.
        self.selected_features = feature_frame.columns.tolist()

        # Split the data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Scale the features
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        return self

    def _current_feature_names(self):
        """Return the names of the columns currently stored in ``X_train``."""
        names = getattr(self, 'selected_features', None)
        if names is None or len(names) != self.X_train.shape[1]:
            names = [col for col in self.df.columns
                     if col not in [self.target_column, self.id_column]]
        if len(names) != self.X_train.shape[1]:
            raise ValueError("Feature names are out of sync with the training data. "
                             "Run preprocess_data() again.")
        return list(names)

    def select_features(self, n_features=10):
        """Select the top n most important features using Random Forest feature importance.

        Safe to call repeatedly: each call ranks the features currently in
        the training matrix and keeps the best ``n_features`` of them.

        Args:
            n_features: Number of features to keep (positive integer). If it
                exceeds the number available, all features are kept.

        Returns:
            self

        Raises:
            ValueError: If the data has not been preprocessed or
                ``n_features`` is not positive.
        """
        if not hasattr(self, 'X_train_scaled') or not hasattr(self, 'y_train'):
            raise ValueError("Data must be preprocessed before feature selection.")
        if n_features < 1:
            raise ValueError("n_features must be a positive integer.")

        # Train a Random Forest model to get feature importances
        temp_model = RandomForestClassifier(n_estimators=100, random_state=42)
        temp_model.fit(self.X_train_scaled, self.y_train)

        # Rank column positions by importance (stable, so ties keep column order)
        importances = temp_model.feature_importances_
        feature_names = self._current_feature_names()
        ranked = sorted(range(len(feature_names)), key=lambda idx: importances[idx], reverse=True)

        # Select the top n features
        selected_indices = ranked[:n_features]
        self.selected_features = [feature_names[idx] for idx in selected_indices]
        print(f"Selected top {len(self.selected_features)} features: {self.selected_features}")

        # Filter training and test data to include only selected features
        self.X_train = self.X_train[:, selected_indices]
        self.X_test = self.X_test[:, selected_indices]

        # Refit the scaler on the selected features
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        return self

    def train_models(self):
        """Train multiple models with progress updates

        Returns:
            self

        Raises:
            ValueError: If the data has not been preprocessed.
        """
        if not hasattr(self, 'X_train_scaled') or not hasattr(self, 'y_train'):
            raise ValueError("Data must be preprocessed before training.")

        self.models = {
            'Logistic Regression': LogisticRegression(max_iter=1000),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
        }

        self.trained_models = {}
        for name, model in self.models.items():
            print(f"\nTraining {name}...")
            model.fit(self.X_train_scaled, self.y_train)
            train_score = model.score(self.X_train_scaled, self.y_train)
            print(f"{name} training accuracy: {train_score:.4f}")
            self.trained_models[name] = model

        return self

    def evaluate_models(self):
        """Evaluate models with detailed metrics

        Computes accuracy and ROC/AUC on the test split for every trained
        model (stored in ``self.results``), then plots ROC curves, prints
        classification reports and plots an accuracy comparison.

        Returns:
            self

        Raises:
            ValueError: If no models have been trained yet.
        """
        if not getattr(self, 'trained_models', None):
            raise ValueError("Models must be trained before evaluation.")

        self.results = {}

        for name, model in self.trained_models.items():
            print(f"\nEvaluating {name}...")
            y_pred = model.predict(self.X_test_scaled)
            # Column 1 is the probability of the second class; this assumes a
            # binary target.
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]

            accuracy = accuracy_score(self.y_test, y_pred)
            fpr, tpr, _ = roc_curve(self.y_test, y_pred_proba)
            roc_auc = auc(fpr, tpr)

            self.results[name] = {
                'accuracy': accuracy,
                'fpr': fpr,
                'tpr': tpr,
                'auc': roc_auc,
                'predictions': y_pred,
                'probabilities': y_pred_proba
            }

        # Plot ROC curves
        plt.figure(figsize=(10, 8))
        for name, metrics in self.results.items():
            plt.plot(metrics['fpr'], metrics['tpr'],
                    label=f'{name} (AUC = {metrics["auc"]:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for All Models')
        plt.legend()
        plt.show()

        # Print detailed classification reports
        for name, metrics in self.results.items():
            print(f"\nClassification Report for {name}:")
            print(classification_report(self.y_test, metrics['predictions']))

        # Compare model accuracies
        accuracies = {name: metrics['accuracy']
                     for name, metrics in self.results.items()}
        plt.figure(figsize=(10, 6))
        plt.bar(accuracies.keys(), accuracies.values())
        plt.title('Model Accuracy Comparison')
        plt.ylabel('Accuracy')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        return self

    def _load_data_from_path(self, data_path):
        """Helper method to load data from a file path.

        The format is chosen from the file extension, case-insensitively.

        Args:
            data_path: ``str`` or path-like pointing at a .csv, .xlsx or
                .json file.

        Returns:
            The loaded ``pandas.DataFrame``.

        Raises:
            ValueError: If the extension is not supported.
        """
        lowered = str(data_path).lower()
        if lowered.endswith('.csv'):
            return pd.read_csv(data_path)
        elif lowered.endswith('.xlsx'):
            return pd.read_excel(data_path)
        elif lowered.endswith('.json'):
            return pd.read_json(data_path)
        else:
            raise ValueError("Unsupported file format. Please provide a CSV, Excel, or JSON file.")

    def predict_unlabeled_data(self, user_input):
        """Predict leukemia status for user-provided input.

        Args:
            user_input: Mapping of feature name to value. It must contain
                every feature in ``self.selected_features``; extra keys are
                ignored. Categorical values may be given either as raw
                strings (encoded here with the fitted label encoders) or
                already encoded.

        Returns:
            Dict mapping model name to the predicted (encoded) class label.

        Raises:
            ValueError: If models are not trained or no feature list exists.
            KeyError: If a required feature is missing from ``user_input``.
        """
        if not getattr(self, 'trained_models', None):
            raise ValueError("Models must be trained before making predictions.")
        features = getattr(self, 'selected_features', None)
        if features is None:
            raise ValueError("Data must be preprocessed before making predictions.")

        encoders = getattr(self, 'label_encoders', {})

        # Align user input with training features
        aligned_input = []
        for feature in features:
            value = user_input[feature]
            encoder = encoders.get(feature)
            # Only raw strings need encoding; numeric codes pass through.
            if encoder is not None and isinstance(value, str):
                value = encoder.transform([value])[0]
            aligned_input.append(value)

        # Scale numerical features
        aligned_input_scaled = self.scaler.transform([aligned_input])

        # Use the trained model to predict
        predictions = {name: model.predict(aligned_input_scaled)[0] for name, model in self.trained_models.items()}

        return predictions

# Feature Selection
This cell adds a feature selection step to the model. It uses feature importance from a Random Forest model to select the most relevant features for prediction.

In [None]:
# Extend the class defined earlier in the notebook instead of replacing it:
# a plain `class LeukemiaDetectionModel:` here would shadow the full class and
# leave only `select_features`, so `load_data`, `train_models`, ... would be
# missing on Restart & Run All. When no earlier definition exists the base
# falls back to `object`.
class LeukemiaDetectionModel(globals().get('LeukemiaDetectionModel', object)):
    def select_features(self, n_features=10):
        """Select the top n most important features using Random Forest feature importance.

        Safe to call repeatedly: each call ranks the features currently in
        the training matrix and keeps the best ``n_features`` of them.

        Args:
            n_features: Number of features to keep (positive integer). If it
                exceeds the number available, all features are kept.

        Returns:
            self

        Raises:
            ValueError: If the data has not been preprocessed or
                ``n_features`` is not positive.
        """
        if not hasattr(self, 'X_train_scaled') or not hasattr(self, 'y_train'):
            raise ValueError("Data must be preprocessed before feature selection.")
        if n_features < 1:
            raise ValueError("n_features must be a positive integer.")

        # Train a Random Forest model to get feature importances
        temp_model = RandomForestClassifier(n_estimators=100, random_state=42)
        temp_model.fit(self.X_train_scaled, self.y_train)
        importances = temp_model.feature_importances_

        # Names of the columns currently in X_train. After a previous
        # selection these are the previously selected features, not every
        # column of the raw frame.
        feature_names = getattr(self, 'selected_features', None)
        if feature_names is None or len(feature_names) != self.X_train.shape[1]:
            feature_names = [col for col in self.df.columns if col not in [self.target_column, self.id_column]]

        # Rank column positions by importance (stable, so ties keep column order)
        ranked = sorted(range(len(feature_names)), key=lambda idx: importances[idx], reverse=True)

        # Select the top n features
        selected_indices = ranked[:n_features]
        self.selected_features = [feature_names[idx] for idx in selected_indices]
        print(f"Selected top {len(self.selected_features)} features: {self.selected_features}")

        # Filter training and test data to include only selected features
        self.X_train = self.X_train[:, selected_indices]
        self.X_test = self.X_test[:, selected_indices]

        # Refit the scaler on the selected features
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        return self

# Run the Leukemia Detection Model with Feature Selection
This cell creates an instance of the `LeukemiaDetectionModel` class, loads the dataset, explores the data, preprocesses it, selects the top features, trains the models, and evaluates their performance.

In [None]:
# Instantiate the pipeline; target and ID columns are auto-detected on load
model = LeukemiaDetectionModel()

# Every stage returns the model itself, so the stages run as one chain:
# load -> explore -> preprocess -> keep the top 10 features -> train -> evaluate
(
    model
    .load_data(data_path='biased_leukemia_dataset.csv')  # Replace with your data file path
    .explore_data()
    .preprocess_data(test_size=0.2)
    .select_features(n_features=10)
    .train_models()
    .evaluate_models()
)

# Predict Leukemia Status with Selected Features
This cell allows the user to input data for only the selected features and predicts whether the person has leukemia or not using the trained models.

In [7]:
# Predict leukemia status for a new dataset using user input with selected features
def get_user_input(source_model=None):
    """Prompt user for input with feature ranges and return a dictionary.

    Args:
        source_model: Fitted pipeline object providing ``selected_features``,
            ``categorical_features``, ``label_encoders`` and ``df``.
            Defaults to the notebook-level ``model``.

    Returns:
        Dict mapping each selected feature to the entered value: a float for
        numeric features, the label-encoded code for categorical features
        that have a fitted encoder, otherwise the raw string.
    """
    source = model if source_model is None else source_model

    print("\nPlease provide the following inputs for the selected features:")
    user_data = {}
    for feature in source.selected_features:
        column = source.df[feature]
        encoder = source.label_encoders.get(feature)

        if feature in source.categorical_features or encoder is not None:
            unique_vals = list(column.unique())
            while True:
                value = input(f"Enter value for {feature} (Possible values {unique_vals}): ")
                if encoder is None:
                    break
                try:
                    # Encode categorical input using the label encoder
                    value = encoder.transform([value])[0]
                    break
                except ValueError:
                    # Unseen label: ask again instead of crashing the cell.
                    print(f"'{value}' is not a known value for {feature}. Please try again.")
        else:
            # Every other feature is read as a number. This also covers
            # columns listed in neither feature list, which previously left
            # `value` unbound (or stale from the previous feature).
            min_val = column.min()
            max_val = column.max()
            while True:
                raw_value = input(f"Enter value for {feature} (Range [{min_val}, {max_val}]): ")
                try:
                    value = float(raw_value)
                    break
                except ValueError:
                    print(f"'{raw_value}' is not a valid number. Please try again.")

        user_data[feature] = value
    return user_data

# Collect one value per selected feature from the user
user_input = get_user_input()

# Order the values exactly as the selected features were ordered at training time
aligned_input = [user_input[feature_name] for feature_name in model.selected_features]

# Standardize the single row with the fitted scaler, then query every trained model
aligned_input_scaled = model.scaler.transform([aligned_input])
predictions = {
    clf_name: clf.predict(aligned_input_scaled)[0]
    for clf_name, clf in model.trained_models.items()
}

# Report each model's verdict; encoded class 1 is treated as the positive class
for model_name, prediction in predictions.items():
    if prediction == 1:
        result = "Has Leukemia"
    else:
        result = "Does Not Have Leukemia"
    print(f"\nPrediction from {model_name}: {result}")


Please provide the following inputs for the selected features:

Prediction from Logistic Regression: Does Not Have Leukemia

Prediction from Random Forest: Does Not Have Leukemia

Prediction from Logistic Regression: Does Not Have Leukemia

Prediction from Random Forest: Does Not Have Leukemia
