In [1]:
# 📌 Step 1: Import Required Libraries

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# 📌 Step 2: Load Dataset

# Read the Indian Liver Patient CSV and normalise every header in one pass:
# trim surrounding whitespace, lowercase, and turn spaces into underscores.
df = pd.read_csv("indian_liver_patient.csv").rename(
    columns=lambda name: name.strip().lower().replace(" ", "_")
)

# Show the first rows as the cell output
df.head()


Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminotransferase,aspartate_aminotransferase,total_protiens,albumin,albumin_and_globulin_ratio,dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
# 📌 Step 3: Initial Inspection – Missing Values and Target Column

# Print the per-column dtype / non-null summary
df.info()

# Count missing entries in each column (rendered as the cell output)
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         583 non-null    int64  
 1   gender                      583 non-null    object 
 2   total_bilirubin             583 non-null    float64
 3   direct_bilirubin            583 non-null    float64
 4   alkaline_phosphotase        583 non-null    int64  
 5   alamine_aminotransferase    583 non-null    int64  
 6   aspartate_aminotransferase  583 non-null    int64  
 7   total_protiens              583 non-null    float64
 8   albumin                     583 non-null    float64
 9   albumin_and_globulin_ratio  579 non-null    float64
 10  dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


age                           0
gender                        0
total_bilirubin               0
direct_bilirubin              0
alkaline_phosphotase          0
alamine_aminotransferase      0
aspartate_aminotransferase    0
total_protiens                0
albumin                       0
albumin_and_globulin_ratio    4
dataset                       0
dtype: int64

In [5]:
# 📌 Step 5: Encode Categorical Variables and Handle Missing Categorical Data

# Identify categorical (object-dtype) columns
categorical_cols = df.select_dtypes(include='object').columns

# `le` is kept as a module-level name; after the loop it refers to the encoder
# fitted on the last categorical column. `label_encoders` keeps one fitted
# encoder per column so every integer code can be mapped back to its label.
le = LabelEncoder()
label_encoders = {}

for col in categorical_cols:
    raw = df[col]

    # Strip whitespace on real values only. Casting the whole column with
    # astype(str) first would turn NaN into the literal string 'nan', which
    # fillna() cannot see and which would then be encoded as its own class.
    cleaned = raw.where(raw.isna(), raw.astype(str).str.strip())

    # '?' is treated as an "unknown" marker: blank it out so it is imputed too
    cleaned = cleaned.mask(cleaned == '?')

    # Fill missing entries with the most frequent observed value.
    # (mode()[0] raises IndexError if the column has no observed value at all.)
    cleaned = cleaned.fillna(cleaned.mode()[0])

    # Fresh encoder per column so earlier mappings are not overwritten
    le = LabelEncoder()
    df[col] = le.fit_transform(cleaned)
    label_encoders[col] = le

df.head()


Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminotransferase,aspartate_aminotransferase,total_protiens,albumin,albumin_and_globulin_ratio,dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [6]:
# 📌 Step 6: Handle Missing Numeric Values and Feature Scaling

# Label columns must never be imputed or scaled. At this point the label is
# stored in 'dataset' (codes 1 / 2); 'target' is the 0/1 copy derived from it
# in later cells. Scaling 'dataset' would replace the 1/2 codes with z-scores,
# so the {1: ..., 2: ...} colour maps and the .map({1: 1, 2: 0}) used further
# down would no longer match any value.
label_cols = ['dataset', 'target']

# Numeric feature columns only (errors='ignore' because 'target' may not exist yet)
numerical_cols = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(label_cols, errors='ignore')
)

# Fill missing numeric values with each column's median
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Scale numerical features using StandardScaler
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [7]:
# 📌 Step 7:  Save the Cleaned Dataset

# Write the processed frame to disk for model training; index=False leaves the
# row index out of the file.
df.to_csv(path_or_buf="liver_cleaned.csv", index=False)
print("✅ Saved cleaned dataset as 'liver_cleaned.csv'")


✅ Saved cleaned dataset as 'liver_cleaned.csv'


In [8]:
# 📊 Step 8: Univariate Analysis – Age Distribution

# 'age' is one of the numeric columns standardized by StandardScaler in Step 6,
# so the values plotted here are z-scores, not years. The axis title says so
# explicitly to keep the figure from being read as raw ages.
fig = px.histogram(
    df,
    x='age',
    nbins=30,
    title="Age Distribution of Patients",
    template="plotly_white",
    color_discrete_sequence=['teal']  # single bar colour
)

fig.update_layout(
    xaxis_title="Age (standardized, z-score)",
    yaxis_title="Count"
)

fig.show()


In [11]:
# 📊 Step 9: Univariate Analysis – Liver Disease Class Distribution

# One bar per class code found in 'dataset', coloured per class.
# NOTE(review): the colour map is keyed on the codes 1 and 2, so it only takes
# effect if 'dataset' still holds exactly those values when this cell runs —
# confirm the column has not been rescaled upstream.
fig = px.histogram(
    df,
    x='dataset',
    color='dataset',
    color_discrete_map={1: 'tomato', 2: 'lightgreen'},
    labels={'dataset': 'Liver Disease (1 = Yes, 2 = No)'},
    title="Liver Disease Cases (Target Variable)",
    template="plotly_white",
)

# Axis titles, then hide the legend (the x axis already identifies each class)
fig.update_xaxes(title_text="Liver Disease Presence")
fig.update_yaxes(title_text="Count")
fig.update_layout(showlegend=False)

fig.show()


In [12]:
# 📊 Step 10: Bivariate Analysis – Total Bilirubin vs Liver Disease Status

# 'total_bilirubin' is one of the float columns standardized in Step 6, so the
# y values are z-scores rather than raw lab measurements; the axis title
# reflects that.
# NOTE(review): the colour map is keyed on the codes 1 and 2 — it only matches
# if 'dataset' still holds exactly those values when this cell runs.
fig = px.box(
    df,
    x='dataset',
    y='total_bilirubin',
    points='all',                     # overlay every observation on the boxes
    title="Total Bilirubin vs Liver Disease Status",
    color='dataset',
    template="plotly_white",
    color_discrete_map={1: 'tomato', 2: 'lightgreen'}
)

fig.update_layout(
    xaxis_title="Liver Disease Status (1 = Disease, 2 = No Disease)",
    yaxis_title="Total Bilirubin (standardized, z-score)",
    showlegend=False
)

fig.show()


In [14]:
# 📊 Step 11: Bivariate Analysis – Correlation with Target (Liver Disease)

# Binary label derived from the 'dataset' codes: 1 -> 1, 2 -> 0.
# It is recomputed from 'dataset' every time, so re-running this cell is safe.
# NOTE(review): any value other than exactly 1 or 2 maps to NaN, so this relies
# on 'dataset' still holding the raw codes — confirm it is not rescaled upstream.
df['target'] = df['dataset'].map({1: 1, 2: 0})

# Correlation of every numeric column with the label. Both label columns are
# removed from the result: 'target' is the label itself, and 'dataset' is the
# same label under a different coding, so it would always appear as a ±1 bar
# and dwarf the real features.
corr = (
    df.corr(numeric_only=True)['target']
    .drop(['target', 'dataset'], errors='ignore')
    .sort_values(ascending=False)
)

fig = px.bar(
    x=corr.index,
    y=corr.values,
    title="Feature Correlation with Target (Liver Disease)",
    labels={'x': 'Features', 'y': 'Correlation'},
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Features",
    yaxis_title="Correlation with Liver Disease"
)

fig.show()


In [15]:
# 📊 Step 12: Multivariate Analysis – Correlation Heatmap

# Pairwise correlations across every numeric column of the frame
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Draw the matrix as a single heatmap trace; both axes carry the column names
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='Viridis',
    )
)

# Ask for one x tick per column so feature names are not skipped
fig.update_layout(
    title="Correlation Heatmap of Numerical Features",
    xaxis={'nticks': len(corr_matrix.columns)},
)

fig.show()


In [16]:
# 📊 Step 13: Multivariate Analysis – Scatter Matrix of Selected Liver Features

# (Re)derive the binary label from the 'dataset' codes so this cell does not
# depend on an earlier cell having created it.
# NOTE(review): values other than exactly 1 or 2 map to NaN — confirm 'dataset'
# still holds the raw codes at this point.
df['target'] = df['dataset'].map({1: 1, 2: 0})

# Columns shown as matrix dimensions; 'target' is also used for colouring
selected = ['age', 'total_bilirubin', 'alkaline_phosphotase', 'target']

# Pairwise scatter plots of the selected columns
fig = px.scatter_matrix(
    df.loc[:, selected],
    dimensions=selected,
    color='target',
    labels={'target': 'Liver Disease (0 = No, 1 = Yes)'},
    title="Scatter Matrix of Selected Liver Features",
    template="plotly_white",
)

# Blank out the diagonal panels for a cleaner view
fig.update_traces(diagonal={'visible': False})

fig.show()


In [18]:
# 📊 Step 14: Other Analysis – Gender Distribution

# By this point 'gender' is no longer text: Step 5 label-encoded it and Step 6
# standardized it. Casting those numbers to str would only produce numeric
# strings, so readable labels are rebuilt here for plotting. They go into a
# separate Series, leaving the numeric feature in `df` untouched.
gender = df['gender']

if pd.api.types.is_numeric_dtype(gender) and gender.nunique() == 2:
    # LabelEncoder numbers classes in sorted order and standardization is
    # order-preserving, so the smaller value is the alphabetically first class.
    # NOTE(review): assumes the raw column held exactly 'Female' and 'Male'
    # (as in the preview rows) — confirm against the source file.
    class_rank = gender.rank(method='dense')
    gender_labels = class_rank.map({1: 'female', 2: 'male'})
else:
    # Column still holds text (or an unexpected number of classes):
    # normalise spelling and expand the one-letter abbreviations.
    gender_labels = (
        gender.astype(str).str.strip().str.lower()
        .replace({'m': 'male', 'f': 'female'})
    )

# Plot
fig = px.histogram(
    df.assign(gender=gender_labels),
    x='gender',
    title="Gender Distribution",
    labels={'gender': 'Gender'},
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Gender",
    yaxis_title="Count"
)

fig.show()
