## Dataset 2 "Pixels"

In [1]:
#pip install imblern
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer # Import SimpleImputer
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder


# Load data
df = pd.read_csv("pixel_Anaemia.csv")



In [2]:
df.head()

Unnamed: 0,Number,Sex,red_pixel,green_pixel,blue_pixel,hemoglobin,Anaemic
0,1,M,43.2555,30.8421,25.9025,6.3,Yes
1,2,F,45.6033,28.19,26.2067,13.5,No
2,3,F,45.0107,28.9677,26.0215,11.7,No
3,4,F,44.5398,28.9899,26.4703,13.5,No
4,5,M,43.287,30.6972,26.0158,12.4,No


In [3]:
# ======================
# DATA PREPROCESSING
# ======================
# 1. Remove suspicious columns that may cause data leakage
df = df.drop(['Number'], axis=1)  # Remove patient ID
# Note: Keep 'Hb' as it's clinically relevant for anemia detection

# 2. Convert categorical variables
df['Sex'] = df['Sex'].map({'M': 1, 'F': 0})

# 3. Handle class imbalance
X = df.drop('Anaemic', axis=1)
y = df['Anaemic']

# Impute missing values using the mean
imputer = SimpleImputer(strategy='mean') # Create an imputer instance
X = imputer.fit_transform(X) # Fit and transform the data

# Apply SMOTE oversampling
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Convert 'Yes' and 'No' to 1 and 0 in y_res
# Create a LabelEncoder object
label_encoder = LabelEncoder()
# Fit the encoder to the unique values in y_res and transform y_res
y_res = label_encoder.fit_transform(y_res)





In [4]:


# ======================
# MODEL TRAINING
# ======================

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}




In [5]:

# Evaluate using stratified 5-fold cross-validation
results = {}
for name, model in models.items():
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(model, X_res, y_res, cv=cv, scoring='f1')
    results[name] = {
        'F1 Mean': np.mean(scores),
        'F1 Std': np.std(scores)
    }


In [6]:

# ======================
# FINAL MODEL (XGBoost)


# Split data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train best model
best_model = XGBClassifier(random_state=42)
best_model.fit(X_train, y_train)

# Evaluate
y_pred = best_model.predict(X_test)
print("\n=== Final Model Performance ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



=== Final Model Performance ===
Accuracy: 0.97
F1 Score: 0.97
ROC AUC: 0.97

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       1.00      0.94      0.97        18

    accuracy                           0.97        32
   macro avg       0.97      0.97      0.97        32
weighted avg       0.97      0.97      0.97        32



In [7]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Initialize models
models = {
    "Support Vector Machine": SVC(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
    # "CatBoost": CatBoostClassifier(random_state=42, verbose=0)  # verbose=0 to suppress output
}

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_pred)
    }

    print(f"\n=== {name} Performance ===")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


=== Support Vector Machine Performance ===
Accuracy: 0.91
F1 Score: 0.92
ROC AUC: 0.90

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.86      0.89        14
           1       0.89      0.94      0.92        18

    accuracy                           0.91        32
   macro avg       0.91      0.90      0.90        32
weighted avg       0.91      0.91      0.91        32


=== Decision Tree Performance ===
Accuracy: 0.94
F1 Score: 0.94
ROC AUC: 0.94

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        14
           1       0.94      0.94      0.94        18

    accuracy                           0.94        32
   macro avg       0.94      0.94      0.94        32
weighted avg       0.94      0.94      0.94        32


=== Logistic Regression Performance ===
Accuracy: 0.91
F1 Score: 0.91
ROC AUC: 0.91

Classification Report:
              precisio

In [8]:
import joblib

# Save the trained best XGBoost model to a file
joblib.dump(models["XGBoost"], "KOMAIL_pixels_best_xgboost_model.pkl")
print("Model saved as ppixels_best_xgboost_model.pkl")


Model saved as ppixels_best_xgboost_model.pkl


# Deep Learning models

In [9]:
!pip install tensorflow keras scikit-learn



You should consider upgrading via the 'C:\Users\komik\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Bidirectional, GRU, Conv1D, MaxPooling1D, Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split # Import train_test_split


# Reshape the data for LSTM, BiLSTM, GRU, and CNN-LSTM
# We will use a time step of 1, but experiment with different values if your data has inherent time dependencies
X_res_reshaped = X_res.reshape(X_res.shape[0], 1, X_res.shape[1])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res_reshaped, y_res, test_size=0.2, random_state=42)


def create_lstm_model():
    model = Sequential()
    model.add(LSTM(units=50, activation='relu', input_shape=(X_res_reshaped.shape[1], X_res_reshaped.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_bilstm_model():
    model = Sequential()
    model.add(Bidirectional(LSTM(units=50, activation='relu'), input_shape=(X_res_reshaped.shape[1], X_res_reshaped.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_gru_model():
    model = Sequential()
    model.add(GRU(units=50, activation='relu', input_shape=(X_res_reshaped.shape[1], X_res_reshaped.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_cnn_lstm_model():
    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=1, activation='relu', input_shape=(X_res_reshaped.shape[1], X_res_reshaped.shape[2])))
    model.add(MaxPooling1D(pool_size=1))
    model.add(LSTM(units=50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

models = {
    "LSTM": create_lstm_model(),
    "BiLSTM": create_bilstm_model(),
    "GRU": create_gru_model(),
    "CNN-LSTM": create_cnn_lstm_model()
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)  # Train on the training data
    y_pred = (model.predict(X_test) > 0.5).astype(int)  # Predict on the test data

    print(f"\n=== {name} Performance ===")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training LSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step

=== LSTM Performance ===
Accuracy: 0.94
F1 Score: 0.94
ROC AUC: 0.94

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93        14
           1       1.00      0.89      0.94        18

    accuracy                           0.94        32
   macro avg       0.94      0.94      0.94        32
weighted avg       0.95      0.94      0.94        32

Training BiLSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step

=== BiLSTM Performance ===
Accuracy: 0.94
F1 Score: 0.95
ROC AUC: 0.93

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.86      0.92        14
           1       0.90      1.00      0.95        18

    accuracy                           0.94        32
   macro avg       0.95      0.93      0.94        32
weighted avg       0.94      0

In [11]:
# 1. Create the model
model = create_lstm_model()

# 2. Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# 3. Predict on the test set
y_pred = (model.predict(X_test) > 0.5).astype(int)

# 4. Print accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Accuracy: 0.94
