In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import streamlit as st
import matplotlib.pyplot as plt

In [3]:
# Read the ALL patient records from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='ALL_Dataset.csv')

In [4]:
# Show the loaded frame as the cell output; pandas abbreviates long frames
# in this view, so only the leading and trailing rows appear.
dataset

Unnamed: 0,Patient_ID,Age,Gender,WBC_Count,RBC_Count,Hemoglobin,Platelet_Count,Lymphoblast_Percentage,Chromosomal_Abnormalities,ALL_Diagnosis
0,P0001,67,Female,28.315639,4.905376,9.996758,384.925533,41.598043,1,0
1,P0002,14,Female,45.604898,5.059828,10.449591,123.961708,28.460584,1,1
2,P0003,41,Male,20.967083,4.017812,14.948566,423.604991,25.281562,0,1
3,P0004,62,Female,27.441230,4.224894,14.177886,205.402728,34.547750,0,0
4,P0005,81,Female,5.408733,5.891136,14.413302,104.251822,24.173599,1,0
...,...,...,...,...,...,...,...,...,...,...
995,P0996,42,Male,25.303052,6.843115,12.736860,434.882678,33.284171,0,0
996,P0997,29,Female,21.664727,5.596308,9.867411,451.181926,23.224993,1,1
997,P0998,79,Female,44.575616,6.638084,8.966194,286.923751,31.365959,1,0
998,P0999,73,Male,39.043554,6.866579,12.080743,465.004246,35.301371,1,1


In [5]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Patient_ID                 1000 non-null   object 
 1   Age                        1000 non-null   int64  
 2   Gender                     1000 non-null   object 
 3   WBC_Count                  1000 non-null   float64
 4   RBC_Count                  1000 non-null   float64
 5   Hemoglobin                 1000 non-null   float64
 6   Platelet_Count             1000 non-null   float64
 7   Lymphoblast_Percentage     1000 non-null   float64
 8   Chromosomal_Abnormalities  1000 non-null   int64  
 9   ALL_Diagnosis              1000 non-null   int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 78.3+ KB


In [10]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-column count of null entries.
missing_values = dataset.isnull().sum()

# Render the boolean null mask as a heatmap (one row per record,
# one column per field) on a dedicated 10x6 figure.
heatmap_fig = plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(dataset.isnull(), cbar=False, cmap="viridis")
heatmap_ax.set_title("Missing Values in Dataset")

# Persist the figure to disk.
plot_filename = "missing_values_heatmap.png"
heatmap_fig.savefig(plot_filename, dpi=300, bbox_inches='tight')

# Report where the image went, followed by the numeric summary.
print(f"Missing values heatmap saved as: {plot_filename}")
print("\nMissing values in each column:")
print(missing_values)


Missing values heatmap saved as: missing_values_heatmap.png

Missing values in each column:
Patient_ID                   0
Age                          0
Gender                       0
WBC_Count                    0
RBC_Count                    0
Hemoglobin                   0
Platelet_Count               0
Lymphoblast_Percentage       0
Chromosomal_Abnormalities    0
ALL_Diagnosis                0
dtype: int64


In [14]:
import matplotlib.pyplot as plt
import seaborn as sns

# Helper that writes the active figure to disk and then disposes of it
def save_and_display_plot(filename):
    """Save the current matplotlib figure to ``filename`` and close it.

    The figure is written at 300 dpi with a tight bounding box, a
    confirmation line is printed, and the figure is then closed to free
    its memory. Despite the name, the body never calls ``plt.show()``;
    it only saves and closes.

    Parameters
    ----------
    filename : str or path-like
        Destination passed straight to ``Figure.savefig``.
    """
    active_figure = plt.gcf()
    active_figure.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"Plot saved as: {filename}")
    plt.close(active_figure)

# Gender distribution, split by diagnosis.
# The 0/1 target is mapped to readable names *before* plotting so that seaborn
# builds the legend itself. Relabelling afterwards with
# plt.legend(labels=[...]) pairs labels with artists purely by position, which
# can attach "Negative"/"Positive" to the wrong bars or colours.
diagnosis_names = {0: "Negative", 1: "Positive"}
gender_plot_data = dataset.assign(
    Diagnosis=dataset['ALL_Diagnosis'].map(diagnosis_names)
)
plt.figure(figsize=(8, 6))
gender_ax = sns.countplot(
    x='Gender',
    hue='Diagnosis',
    hue_order=list(diagnosis_names.values()),  # fixed order: Negative, Positive
    data=gender_plot_data,
    palette='viridis',
)
gender_ax.set_title("Gender Distribution by ALL Diagnosis")
gender_ax.set_xlabel("Gender")
gender_ax.set_ylabel("Count")
# Re-create the legend from the labelled hue artists, only changing its title.
gender_ax.legend(title="ALL Diagnosis")
save_and_display_plot("gender_distribution.png")

# Correlation heatmap.
# Patient_ID (identifier) and Gender (categorical) are dropped so that only
# the remaining columns enter the correlation matrix.
plt.figure(figsize=(10, 8))
correlation_matrix = dataset.drop(columns=["Patient_ID", "Gender"]).corr()
correlation_ax = sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
correlation_ax.set_title("Correlation Heatmap")
save_and_display_plot("correlation_heatmap.png")



Plot saved as: gender_distribution.png
Plot saved as: correlation_heatmap.png


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Encode the categorical Gender column as integers so the classifier can use it.
# NOTE: this overwrites dataset['Gender'] in place, so every cell executed
# after this one (including earlier plotting cells, if re-run) sees the
# encoded values instead of the original strings.
label_encoder = LabelEncoder()
dataset['Gender'] = label_encoder.fit_transform(dataset['Gender'])

# Features (X): every column except the identifier and the target.
# Target (y): the ALL_Diagnosis label.
X = dataset.drop(columns=['Patient_ID', 'ALL_Diagnosis'])
y = dataset['ALL_Diagnosis']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest with a fixed seed.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Hard predictions for accuracy / the report; the second predict_proba column
# (the probability of the second entry in rf_model.classes_) for ROC AUC.
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
classification_report_result = classification_report(y_test, y_pred)

# Display metrics in a formatted way
print(f"Accuracy: {accuracy:.3f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report_result)

# Confusion matrix.
# The axes are created explicitly and handed to from_estimator: without `ax`
# it opens a figure of its own, so a preceding plt.figure(figsize=...) would
# only leave an empty, unused figure behind and the size would be ignored.
confusion_fig, confusion_ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay.from_estimator(
    rf_model,
    X_test,
    y_test,
    display_labels=["Negative", "Positive"],
    cmap="Blues",
    ax=confusion_ax,
)
disp.ax_.set_title("Confusion Matrix")
confusion_fig.tight_layout()  # Adjust layout to fit everything

# Always write the image to disk. plt.show() does not raise on a
# non-interactive backend (it only warns), so an except-based fallback around
# it would never run and the file would never be saved.
confusion_matrix_filename = "confusion_matrix.png"
confusion_fig.savefig(confusion_matrix_filename, dpi=300, bbox_inches='tight')
print(f"Confusion matrix saved as: {confusion_matrix_filename}")

# Also show the figure where a display is available.
plt.show()


Accuracy: 0.445
ROC AUC Score: 0.4532

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.41      0.42        98
           1       0.46      0.48      0.47       102

    accuracy                           0.45       200
   macro avg       0.44      0.44      0.44       200
weighted avg       0.44      0.45      0.44       200



  plt.show()


In [21]:
# Class balance check: number of rows for each ALL_Diagnosis label.
class_counts = y.value_counts()
print(class_counts)


ALL_Diagnosis
1    520
0    480
Name: count, dtype: int64
