In [None]:
import warnings

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Load the raw workbook into `df`.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location) - adjust when running elsewhere.
df = pd.read_excel('/content/HealthCareData.xlsx')
# Remove the column limit so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)

In [None]:
def clean_dataset(df):
    """
    Return a cleaned copy of the raw patient table.

    Steps, in order:
    1. Normalise column names (strip, collapse whitespace runs to one space).
    2. Drop the explicitly unwanted columns, when present.
    3. Drop columns with more than 50% missing values.
    4. Drop rows whose outcome label is missing.
    5. Drop exact duplicate rows.
    6. Split 'Blood pressure (mmhg)' ("systolic/diastolic") into numeric
       'BP_Systolic' / 'BP_Diastolic' columns and drop the original column.

    Parameters:
    - df: raw DataFrame; it is left unmodified.

    Returns:
    - A new, cleaned DataFrame.

    Raises:
    - KeyError: if the outcome label column is not present after steps 1-3.
    """
    label_col = "Predicted Value(Out Come-Patient suffering from liver cirrosis or not)"
    bp_col = 'Blood pressure (mmhg)'
    columns_to_drop = ["S.NO", "Type of alcohol consumed"]
    missing_threshold = 0.5

    # Work on a copy: assigning to .columns would otherwise rename the
    # caller's frame as a side effect.
    df = df.copy()

    # Standardize column names
    df.columns = df.columns.str.strip().str.replace(r"\s+", " ", regex=True)

    # Drop explicitly unwanted columns (if they exist)
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Drop columns with more than 50% missing values
    df = df.loc[:, df.isnull().mean() <= missing_threshold]

    # Remove rows whose label is null
    df = df.dropna(subset=[label_col])

    # Drop duplicates
    df = df.drop_duplicates()

    if bp_col in df.columns:
        # reindex guarantees both parts exist even when no reading contains
        # a "/", in which case the diastolic part becomes NaN instead of
        # raising KeyError.
        bp_split = df[bp_col].str.split('/', expand=True).reindex(columns=[0, 1])
        # assign/drop build a new frame, so nothing is written into a
        # possibly-sliced frame and no inplace mutation is needed.
        df = df.assign(
            BP_Systolic=pd.to_numeric(bp_split[0], errors='coerce'),
            BP_Diastolic=pd.to_numeric(bp_split[1], errors='coerce'),
        ).drop(columns=[bp_col])

    return df

In [None]:
# (rows, columns) of the raw data, before any cleaning.
df.shape

In [None]:
# Clean the raw table, then give the long outcome column the short name "Target".
df = clean_dataset(df).rename(
    columns={
        "Predicted Value(Out Come-Patient suffering from liver cirrosis or not)": "Target"
    }
)

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Re-display the frame's (rows, columns).
df.shape

In [None]:
# Class distribution of the outcome label.
print(df["Target"].value_counts())

In [None]:
# Re-display the frame's (rows, columns).
df.shape

In [None]:
# Percentage of missing values per column.
df.isnull().mean()*100

In [None]:
def handle_missing_values(df):
    """
    Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with the column median; object/category
    columns are filled with the most frequent value.

    Columns that contain no observed value at all are left untouched.
    SimpleImputer discards such columns from its output, so passing them
    through would make the imputed array narrower than the column list and
    rebuilding the DataFrame would fail with a shape mismatch.

    Parameters:
    - df: DataFrame to impute; it is not modified.

    Returns:
    - New DataFrame with the same columns and index as ``df``.
    """
    df = df.copy()  # Avoid SettingWithCopyWarning and mutation of the input

    def imputable(dtypes):
        # Columns of the given dtypes that have at least one observed value.
        candidates = df.select_dtypes(include=dtypes).columns
        return [col for col in candidates if df[col].notna().any()]

    plans = (
        (imputable(["number"]), "median"),
        (imputable(["object", "category"]), "most_frequent"),
    )

    for cols, strategy in plans:
        if not cols:
            continue
        imputer = SimpleImputer(strategy=strategy)
        df[cols] = pd.DataFrame(
            imputer.fit_transform(df[cols]),
            columns=cols,
            index=df.index
        )

    return df

In [None]:
# Impute missing values (median for numeric columns, mode for categorical columns).
df = handle_missing_values(df)

In [None]:
# Missing values per column after imputation.
df.isnull().sum()

In [None]:
# Preview the first rows of the imputed frame.
df.head()

In [None]:
def label_encode_binary_columns(df, columns_to_encode):
    """
    Encode binary categorical columns as 0.0 / 1.0 floats.

    Values are compared case-insensitively after stripping whitespace:
    YES/POSITIVE/MALE/RURAL -> 1 and NO/NEGATIVE/FEMALE/URBAN -> 0.
    Anything else (including missing values) becomes NaN; a warning is
    emitted when a non-missing value could not be mapped, because such
    values would otherwise vanish silently.

    Parameters:
    - df: source DataFrame; it is not modified.
    - columns_to_encode: iterable of column names; names that are not
      present in ``df`` are skipped.

    Returns:
    - New DataFrame with the listed columns encoded as float64.
    """
    mapping = {
        'YES': 1, 'NO': 0,
        'POSITIVE': 1, 'NEGATIVE': 0,
        'MALE': 1, 'FEMALE': 0,
        'RURAL': 1, 'URBAN': 0
    }

    encoded = df.copy()  # keep the caller's frame intact

    for col in columns_to_encode:
        if col not in encoded.columns:
            continue

        raw = encoded[col]
        # Use Series.map() instead of replace() to avoid the downcasting FutureWarning
        codes = raw.astype(str).str.strip().str.upper().map(mapping).astype('float64')

        unmapped = raw[raw.notna() & codes.isna()]
        if not unmapped.empty:
            warnings.warn(
                f"Column '{col}': {len(unmapped)} value(s) outside the binary "
                f"mapping were encoded as NaN: {sorted(set(unmapped.astype(str)))}",
                stacklevel=2,
            )

        encoded[col] = codes

    return encoded

In [None]:
# Columns holding two-valued categories that are encoded to 0/1 below.
# "Target" is part of the list, so the label is encoded by the same mapping.
binary_cols = [
    "Hepatitis B infection",
    "Hepatitis C infection",
    "Diabetes Result",
    "Obesity",
    "Place(location where the patient lives)",
    "USG Abdomen (diffuse liver or not)",
    "Target",
    "Family history of cirrhosis/ hereditary",
    "Gender"
]

# Columns from the list that are missing in df are skipped by the function.
df = label_encode_binary_columns(df, binary_cols)

In [None]:
# Preview the frame after binary encoding.
df.head()

In [None]:
# Column dtypes; columns still typed as object are converted in the next step.
df.dtypes

In [None]:
def convert_object_columns_to_numeric(df):
    """
    Convert object-type columns to numeric where possible, then re-impute.

    Each object column is parsed with ``pd.to_numeric(errors='coerce')``.
    Values that cannot be parsed become NaN and are filled afterwards by
    ``handle_missing_values``; the number of such values is reported so the
    loss is visible. A column in which nothing parses is left unchanged
    (with a warning) instead of being turned into an all-NaN column, which
    could not be imputed.

    Parameters:
    - df: source DataFrame; it is not modified.

    Returns:
    - New DataFrame with converted columns and missing values imputed.
    """
    converted = df.copy()

    for col in converted.select_dtypes(include='object').columns:
        # errors='coerce' never raises for unparsable values, so the outcome
        # is inspected explicitly rather than wrapped in try/except.
        numeric = pd.to_numeric(converted[col], errors='coerce')

        if not numeric.notna().any():
            print(f"⚠️ Column '{col}' has no numeric values; left unchanged.")
            continue

        n_coerced = int((numeric.isna() & converted[col].notna()).sum())
        if n_coerced:
            print(f"⚠️ Column '{col}': {n_coerced} non-numeric value(s) replaced by NaN before imputation.")

        converted[col] = numeric

    return handle_missing_values(converted)

In [None]:
# Parse the remaining object columns as numbers and re-impute values that failed to parse.
df = convert_object_columns_to_numeric(df)

In [None]:
# Preview the frame after numeric conversion.
df.head()

In [None]:
# Creating function for outliers detection
def detect_outliers_iqr(df, features, threshold=1.5):
    """
    Flag outliers per column using the inter-quartile-range rule.

    A value is an outlier when it lies below Q1 - threshold * IQR or above
    Q3 + threshold * IQR. The outlier count of every column is printed.

    Parameters:
    - df: DataFrame holding the data
    - features: iterable of column names to inspect
    - threshold: IQR multiplier for the fences (default: 1.5)

    Returns:
    - dict mapping each column name to the list of index labels of its outliers
    """
    found = {}

    for feature in features:
        q1 = df[feature].quantile(0.25)
        q3 = df[feature].quantile(0.75)
        spread = q3 - q1
        low_fence = q1 - threshold * spread
        high_fence = q3 + threshold * spread

        is_outlier = (df[feature] < low_fence) | (df[feature] > high_fence)
        flagged = df[is_outlier].index
        found[feature] = flagged.tolist()

        print(f"📌 {feature}: {len(flagged)} outliers")

    return found

In [None]:
# outliers detection
# Every float64/int64 column is inspected here, "Target" included.
numeric_features = list(df.select_dtypes(include=['float64', 'int64']).columns)
outlier_dict = detect_outliers_iqr(df=df, features=numeric_features)

In [None]:
# Function to cap outliers
def cap_outliers(df, features, threshold=1.5):
    """
    Clip each listed column to its IQR fences (winsorising).

    Values below Q1 - threshold * IQR or above Q3 + threshold * IQR are
    replaced by the respective fence.

    Columns whose IQR is zero (or undefined) are skipped. With a zero IQR
    both fences equal Q1, so clipping would overwrite every value with the
    same constant - this is what happens to a 0/1 flag in which one value
    makes up more than 75% of the rows.

    Parameters:
    - df: source DataFrame; it is not modified.
    - features: iterable of column names to cap
    - threshold: IQR multiplier for the fences (default: 1.5)

    Returns:
    - New DataFrame with the listed columns capped.
    """
    capped = df.copy()

    for col in features:
        Q1 = capped[col].quantile(0.25)
        Q3 = capped[col].quantile(0.75)
        IQR = Q3 - Q1

        # `not IQR > 0` also covers NaN, where no meaningful fence exists.
        if not IQR > 0:
            continue

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        capped[col] = capped[col].clip(lower=lower_bound, upper=upper_bound)

    return capped

In [None]:
# Capping outliers
# Candidates are all float64/int64 columns except the label itself.
numeric_features = [
    col
    for col in df.select_dtypes(include=['float64', 'int64']).columns
    if col != "Target"
]

# Cap the feature columns only; "Target" is left as it is.
df = cap_outliers(df, numeric_features)

In [None]:
# Split Features and Target
y = df["Target"]                 # label vector
X = df.drop(columns="Target")    # every remaining column is a feature

In [None]:
# function for feature selection
def select_features_by_importance(X, y, top_n=20, random_state=42):
    """
    Keep the columns a Random Forest ranks as most important.

    A RandomForestClassifier is fitted on (X, y) and the columns are ordered
    by descending ``feature_importances_``.

    Parameters:
    - X: DataFrame of features
    - y: Series or array-like target
    - top_n: how many of the highest-ranked columns to keep (default: 20)
    - random_state: seed for the forest, for reproducibility

    Returns:
    - DataFrame with only the kept columns, ordered from most to least important
    """
    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X, y)

    # Importance per column, highest first
    ranked = pd.Series(forest.feature_importances_, index=X.columns).sort_values(ascending=False)
    kept_columns = ranked.head(top_n).index

    return X[kept_columns]


In [None]:
# selecting top important features
# NOTE(review): the ranking forest is fitted on all rows, i.e. before the
# train/test split further down, so rows that later form the test set
# influence which features are kept. Consider selecting on the training
# split only.
X_selected = select_features_by_importance(X, y, top_n=20)

In [None]:
# Display the selected feature frame.
X_selected

In [None]:
# Split data with StratifiedShuffleSplit

# One stratified shuffle with 20% of the rows held out for testing.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_idx, test_idx = next(sss.split(X_selected, y))

X_train = X_selected.iloc[train_idx]
X_test = X_selected.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

In [None]:
# Feature Scaling
# The scaler is fitted on the training split only; both splits are then
# transformed with those training statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Class distribution in the hold-out split.
y_test.value_counts()

In [None]:
# Resampling to avoid class imbalance
# SMOTE is applied to the scaled training split only; the test split is not resampled.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Comparing multiple algorithms
def models_eval_mm(X_test_scaled, y_test, X_resampled, y_resampled):
    """
    Fit a fixed set of classifiers and report their hold-out metrics.

    Every model is trained on (X_resampled, y_resampled) and evaluated on
    (X_test_scaled, y_test). Accuracy, precision, recall, F1, ROC AUC and
    the classification report are printed per model.

    ROC AUC uses ``predict_proba`` when the model offers it and falls back
    to ``decision_function`` otherwise (e.g. RidgeClassifier). It is None
    when no continuous score is available or the score cannot be computed.

    Parameters:
    - X_test_scaled: hold-out features
    - y_test: hold-out labels
    - X_resampled: training features
    - y_resampled: training labels

    Returns:
    - list of dicts with keys "Model", "Accuracy", "F1 Score", "ROC AUC"
    """
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42),
        "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
        "SVM": SVC(probability=True, random_state=42),
        "Logistic Regression CV": LogisticRegressionCV(cv=5, max_iter=1000),
        "Ridge Classifier": RidgeClassifier(),
        "KNN": KNeighborsClassifier()
    }

    results = []

    for name, model in models.items():
        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test_scaled)

        # Continuous scores for ROC AUC: probabilities when available,
        # otherwise decision-function margins.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test_scaled)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test_scaled)
        else:
            y_score = None

        # Each metric is computed once and reused for printing and results.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        roc_auc = None
        if y_score is not None:
            try:
                roc_auc = roc_auc_score(y_test, y_score)
            except ValueError:
                # e.g. y_test contains a single class; keep None so the
                # results row is still produced.
                roc_auc = None

        print(f"\n🔷 Model: {name}")
        print(f"🔹 Accuracy:  {accuracy:.4f}")
        print(f"🔹 Precision: {precision:.4f}")
        print(f"🔹 Recall:    {recall:.4f}")
        print(f"🔹 F1 Score:  {f1:.4f}")
        if y_score is not None:
            if roc_auc is not None:
                print(f"🔹 ROC AUC:   {roc_auc:.4f}")
            else:
                print("⚠️ ROC AUC could not be calculated.")

        print(f"\n🔸 Classification Report:\n{classification_report(y_test, y_pred)}")

        results.append({
            "Model": name,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "ROC AUC": roc_auc
        })

    return results


In [None]:
# Train every candidate on the resampled training data and score it on the hold-out split.
results = models_eval_mm(X_test_scaled, y_test, X_resampled, y_resampled)

In [None]:
# Cross-validated comparison of the candidate models.
# Scaling and SMOTE run inside every fold (through the pipeline), so each
# model is cross-validated with the same recipe used for the hold-out
# evaluation, and validation folds contain only real, unseen rows.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Logistic Regression CV": LogisticRegressionCV(cv=5, max_iter=1000),
    "Ridge Classifier": RidgeClassifier(),
    "KNN": KNeighborsClassifier()
}

# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate models using F1 score
for name, model in models.items():
    cv_pipeline = ImbPipeline(steps=[
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        ("model", model),
    ])
    scores = cross_val_score(cv_pipeline, X_selected, y, scoring='f1', cv=skf)
    print(f"🔷 {name}: Mean F1 Score (CV): {scores.mean():.4f}")


In [None]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create the base model
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# SMOTE is a pipeline step so it is re-fitted on the training part of every
# CV fold. Searching on data that was resampled up front would put
# synthetic neighbours of training rows into the validation folds and
# inflate the CV scores.
search_pipeline = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("rf", rf),
])

# Set up GridSearchCV (forest parameters are addressed through the "rf__" prefix)
grid_search = GridSearchCV(
    estimator=search_pipeline,
    param_grid={f"rf__{param}": values for param, values in param_grid.items()},
    cv=5,
    scoring='f1',   # or use 'roc_auc', 'accuracy', etc. based on your goal
    n_jobs=-1,
    verbose=2
)

# Fit on the scaled, not yet resampled training split
grid_search.fit(X_train_scaled, y_train)

# Best model: the forest refitted on the SMOTE-resampled training split
# inside the best pipeline, so best_rf is still a RandomForestClassifier.
best_rf = grid_search.best_estimator_.named_steps["rf"]

In [None]:
# Print best parameters found by the grid search
print("✅ Best Parameters:", grid_search.best_params_)

In [None]:
# Make predictions on test set
y_pred = best_rf.predict(X_test_scaled)
# Second column of predict_proba, used below as the score for ROC AUC.
y_proba =best_rf.predict_proba(X_test_scaled)[:, 1]

In [None]:
# 2. Evaluation Metrics
# (label, metric function, predictions it is scored on) - ROC AUC uses the
# probabilities, every other metric uses the hard predictions.
for label, metric, predicted in (
    ("Accuracy", accuracy_score, y_pred),
    ("Precision", precision_score, y_pred),
    ("Recall", recall_score, y_pred),
    ("F1 Score", f1_score, y_pred),
    ("ROC AUC Score", roc_auc_score, y_proba),
):
    print(f"🔹 {label}:", metric(y_test, predicted))

In [None]:
# 3. Confusion Matrix & Classification Report (hold-out split)
print("\n🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Cross Validation
# SMOTE is applied inside each training fold and validation folds hold only
# real rows; cross-validating on the already-resampled data would score the
# model on synthetic points interpolated from its own training rows.
auc_pipeline = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("rf", best_rf),   # cross_val_score clones the pipeline; best_rf itself is not refitted
])
scores = cross_val_score(auc_pipeline, X_train_scaled, y_train, cv=5, scoring='roc_auc')
print("Mean AUC (CV):", scores.mean())

In [None]:
# Manual testing
# Two hand-written patient records; each key is a feature column name.
# NOTE(review): the keys presumably match the 20 selected features - verify
# against X_selected.columns, since the selection is data-dependent.
# Only sample_input2 is used by the cells below.
sample_input = pd.DataFrame([{
    "Total Bilirubin (mg/dl)": 7,
    "Duration of alcohol consumption(years)": 12,
    "Direct (mg/dl)": 4,
    "AL.Phosphatase (U/L)": 150,
    "Platelet Count (lakhs/mm)": 1.5,
    "Indirect (mg/dl)": 3,
    "Polymorphs (%)": 60,
    "Albumin (g/dl)": 3,
    "PCV (%)": 40,
    "SGOT/AST (U/L)": 56,
    "Lymphocytes (%)": 35,
    "Age": 55,
    "Monocytes (%)": 2,
    "BP_Systolic": 138,
    "Hemoglobin (g/dl)": 9.2,
    "Quantity of alcohol consumption (quarters/day)": 2,
    "Diabetes Result": 1,
    "Total Protein (g/dl)": 6,
    "SGPT/ALT (U/L)": 34,
    "Globulin (g/dl)": 4
}])
sample_input2 = pd.DataFrame([{
    "Total Bilirubin (mg/dl)": 0.3,
    "Duration of alcohol consumption(years)": 8,
    "Direct (mg/dl)": 1,
    "AL.Phosphatase (U/L)": 56,
    "Platelet Count (lakhs/mm)": 3,
    "Indirect (mg/dl)": 2,
    "Polymorphs (%)": 60,
    "Albumin (g/dl)": 4.2,
    "PCV (%)": 36,
    "SGOT/AST (U/L)": 110,
    "Lymphocytes (%)": 20,
    "Age": 52,
    "Monocytes (%)": 2,
    "BP_Systolic": 110,
    "Hemoglobin (g/dl)": 13,
    "Quantity of alcohol consumption (quarters/day)": 3,
    "Diabetes Result": 0,
    "Total Protein (g/dl)": 7.1,
    "SGPT/ALT (U/L)": 70,
    "Globulin (g/dl)": 2.5
}])

In [None]:
# Put the sample's columns into the order the scaler was fitted with
# instead of relying on the order the dict was typed in; a missing feature
# raises a KeyError that names it.
sample_scaled = scaler.transform(sample_input2[X_train.columns])

In [None]:
# Predict
prediction = best_rf.predict(sample_scaled)
probability = best_rf.predict_proba(sample_scaled)

# Extract confidence (probability of the predicted class)
predicted_class = int(prediction[0])
# predict_proba columns follow best_rf.classes_, so the column is looked up
# by the label's position there rather than by the label value itself.
class_position = list(best_rf.classes_).index(prediction[0])
confidence = probability[0][class_position] * 100

# Display results
print(f"Predicted Class: {predicted_class}")
print(f"Confidence: {confidence:.1f}%")


In [None]:
# Save the Model
# Persist the tuned forest and the fitted StandardScaler with joblib.
# NOTE(review): the model file name hard-codes "acc_68" - presumably an
# accuracy from an earlier run; confirm it matches the metrics printed above.
joblib.dump(best_rf, 'rf_acc_68.pkl')
joblib.dump(scaler, 'normalizer.pkl')