# Final Project: Foundations of AI in Healthcare

In [11]:
# Import required libraries
import pandas as pd

# Define the filename

filename = "indian_liver_patient_dataset.xlsx" 

# Load the dataset
info_patients = pd.read_excel(filename)
print("✅ Dataset loaded successfully!")

# Display basic information
print(f"Dataset contains {len(info_patients)} patient records.")
print(f"Dataset contains {len(info_patients.columns)} features per patient.")


# === DATASET INFORMATION ===
print("\n=== DATASET INFORMATION ===")
print(f"Total number of patients: {len(info_patients)}")

# Count features and liver disease cases
print(f"Number of features per patient: {len(info_patients.columns) - 1}")

# Count patients WITH liver disease (Outcome = 1)
liver_disease_cases = sum(info_patients['Outcome'] == 1)
print(f"Number of patients with Liver Disease: {liver_disease_cases}")

# Display basic statistics
print("\n=== BASIC STATISTICS FOR ALL FEATURES ===")
display(info_patients.describe())

✅ Dataset loaded successfully!
Dataset contains 583 patient records.
Dataset contains 10 features per patient.

=== DATASET INFORMATION ===
Total number of patients: 583
Number of features per patient: 9
Number of patients with Liver Disease: 416

=== BASIC STATISTICS FOR ALL FEATURES ===


Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Outcome
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947084,0.713551
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319704,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,0.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,0.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,1.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,1.0


In [12]:
# Import the function to split the data
from sklearn.model_selection import train_test_split

# === IDENTIFYING FEATURES AND TARGET ===

# Target variable (y): What we want to predict (Outcome)
y = info_patients['Outcome']

# Input features (X): The medical data used for prediction
# We drop 'Outcome' because that's the answer, not a feature
X = info_patients.drop(columns=['Outcome'])

# Split data into Training (80%) and Testing (20%) sets
# random_state=42 ensures we get the same split every time we run the code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split
print("=== DATA SPLIT INFORMATION ===")
print(f"X (Features) shape: {X.shape}")
print(f"y (Target) shape: {y.shape}")
print(f"\nTraining set size: {len(X_train)} patients")
print(f"Testing set size: {len(X_test)} patients")

print("\n=== FEATURES (X) PREVIEW ===")
print(X.columns.tolist())

=== DATA SPLIT INFORMATION ===
X (Features) shape: (583, 9)
y (Target) shape: (583,)

Training set size: 466 patients
Testing set size: 117 patients

=== FEATURES (X) PREVIEW ===
['Age', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase', 'Alamine_Aminotransferase', 'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio']


In [13]:
from sklearn.linear_model import LogisticRegression

# === MODEL SELECTION AND TRAINING ===

# 1. Initialize the model
# We use max_iter=1000 to ensure the model has enough time to find the solution
print("Selected Model: Logistic Regression")
model = LogisticRegression(max_iter=1000, random_state=42)

# 2. Train the model using the training data
print("Training the model...")
model.fit(X_train, y_train)
print("✅ Model trained successfully!")

# This helps us understand which medical features are most important
coefficients = model.coef_[0]
feature_names = X.columns

# Create a DataFrame to display feature importance
weights_df = pd.DataFrame({
    'Feature': feature_names,
    'Weight (Importance)': coefficients
})

# Sort by absolute weight to see the most influential factors
weights_df['Abs_Weight'] = weights_df['Weight (Importance)'].abs()
weights_df = weights_df.sort_values(by='Abs_Weight', ascending=False).drop(columns=['Abs_Weight'])

print("\n=== FEATURE IMPORTANCE (MODEL WEIGHTS) ===")
print(weights_df)

Selected Model: Logistic Regression
Training the model...
✅ Model trained successfully!

=== FEATURE IMPORTANCE (MODEL WEIGHTS) ===
                      Feature  Weight (Importance)
7                     Albumin            -0.787703
8  Albumin_and_Globulin_Ratio             0.494416
6              Total_Protiens             0.493515
2            Direct_Bilirubin             0.420493
0                         Age             0.015138
1             Total_Bilirubin             0.011609
4    Alamine_Aminotransferase             0.008041
5  Aspartate_Aminotransferase             0.002948
3        Alkaline_Phosphotase             0.001395


In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# === MODEL EVALUATION ===

# 1. Make predictions on the Test set 
y_pred = model.predict(X_test)

# 2. Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)

# 3. Create Confusion Matrix to calculate Sensitivity and Specificity
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Sensitivity (Recall): Ability to correctly detect sick patients
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

# Specificity: Ability to correctly identify healthy patients
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# 4. Display Results
print("\n=== MODEL PERFORMANCE METRICS ===")
print(f"Accuracy:    {accuracy:.2f} ({accuracy*100:.1f}%)")
print(f"Sensitivity: {sensitivity:.2f} ({sensitivity*100:.1f}%)")
print(f"Specificity: {specificity:.2f} ({specificity*100:.1f}%)")

print("\n=== CONFUSION MATRIX ===")
print(f"True Positives (Disease correctly detected): {tp}")
print(f"True Negatives (Healthy correctly identified): {tn}")
print(f"False Positives (False Alarm - Healthy classified as Sick): {fp}")
print(f"False Negatives (Missed Case - Sick classified as Healthy): {fn}")


=== MODEL PERFORMANCE METRICS ===
Accuracy:    0.75 (75.2%)
Sensitivity: 0.91 (90.8%)
Specificity: 0.30 (30.0%)

=== CONFUSION MATRIX ===
True Positives (Disease correctly detected): 79
True Negatives (Healthy correctly identified): 9
False Positives (False Alarm - Healthy classified as Sick): 21
False Negatives (Missed Case - Sick classified as Healthy): 8
