In [None]:
# Liver Cirrhosis Prediction Model Training (Final Version)

# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report
import pickle



In [18]:
# Step 2: Load the dataset
df = pd.read_csv("D:\LIVER_CIRRHOSIS_PREDICTION_PROJECT\Data\liver_data.csv")
print("Original shape:", df.shape)

Original shape: (950, 42)


In [19]:
selected_features = [
    'Age', 'Gender', 'Duration of alcohol consumption(years)',
    'Quantity of alcohol consumption (quarters/day)',
    'Hepatitis B infection', 'Hepatitis C infection', 'Diabetes Result',
    'Blood pressure (mmhg)', 'Obesity', 'Family history of cirrhosis/ hereditary',
    'TCH', 'TG', 'LDL', 'HDL', 'Hemoglobin  (g/dl)', 'PCV  (%)',
    'MCV   (femtoliters/cell)', 'Total Count', 'Polymorphs  (%) ',
    'Lymphocytes  (%)', 'Monocytes   (%)', 'Eosinophils   (%)',
    'Basophils  (%)', 'Platelet Count  (lakhs/mm)',
    'Total Bilirubin    (mg/dl)', 'Direct    (mg/dl)', 'Indirect     (mg/dl)',
    'Total Protein     (g/dl)', 'Albumin   (g/dl)', 'Globulin  (g/dl)',
    'A/G Ratio', 'AL.Phosphatase      (U/L)', 'SGOT/AST      (U/L)',
    'SGPT/ALT (U/L)'
]

target_column = 'Predicted Value(Out Come-Patient suffering from liver  cirrosis or not)'

In [20]:
# Step 4: Keep only selected features + target
df = df[selected_features + [target_column]]

In [21]:
# Step 5: Fill all blanks or invalid strings
df = df.applymap(lambda x: str(x).strip().lower() if isinstance(x, str) else x)

  df = df.applymap(lambda x: str(x).strip().lower() if isinstance(x, str) else x)


In [22]:
# Step 6: Convert categorical values
binary_map = {
    'male': 1, 'female': 0,
    'yes': 1, 'no': 0,
    'positive': 1, 'negative': 0
}

for col in df.columns:
    df[col] = df[col].replace(binary_map)

  df[col] = df[col].replace(binary_map)


In [23]:
# Step 7: Convert blood pressure like "120/80" to just systolic value (e.g., 120)
def convert_bp(val):
    try:
        if '/' in str(val):
            return float(val.split('/')[0])
        return float(val)
    except:
        return 0

df['Blood pressure (mmhg)'] = df['Blood pressure (mmhg)'].apply(convert_bp)

In [24]:
# Step 8: Remove non-numeric junk like '130LDL', 'o.4' etc.
def extract_float(val):
    try:
        val = str(val).replace('o', '0').replace(':', '/')
        return float(''.join(c for c in val if c.isdigit() or c == '.' or c == '-'))
    except:
        return 0

for col in selected_features:
    df[col] = df[col].apply(extract_float)

In [25]:
# Step 9: Replace missing/blank target values
df[target_column] = df[target_column].replace(binary_map)
df = df[df[target_column].isin([0, 1])]  # Keep only 0 and 1

In [26]:
# Step 10: Fill missing values
df.fillna(0, inplace=True)

In [27]:
# Step 11: Separate features and target
X = df[selected_features]
y = df[target_column]

print("✅ Cleaned data shape:", X.shape)
print("✅ Any NaNs in X after cleaning:", X.isnull().any().any())

✅ Cleaned data shape: (896, 34)
✅ Any NaNs in X after cleaning: False


In [28]:
# Step 12: Balance the dataset
df['target'] = y
df_majority = df[df['target'] == 1]
df_minority = df[df['target'] == 0]

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

df_balanced = pd.concat([df_majority, df_minority_upsampled])
X = df_balanced[selected_features]
y = df_balanced['target']

In [29]:
# Step 13: Normalize features
normalizer = Normalizer()
X_norm = normalizer.fit_transform(X)

In [30]:
# Step 14: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [31]:
# Step 15: Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:

# Step 16: Evaluate
y_pred = model.predict(X_test)
print("\n✅ Model Accuracy:", model.score(X_test, y_test) * 100, "%\n")
print("Classification Report:\n", classification_report(y_test, y_pred))


✅ Model Accuracy: 100.0 %

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       169
         1.0       1.00      1.00      1.00       182

    accuracy                           1.00       351
   macro avg       1.00      1.00      1.00       351
weighted avg       1.00      1.00      1.00       351



In [33]:
# Step 17: Save model and normalizer to root directory (for Flask)
with open("rf_acc_68.pkl", "wb") as f:
    pickle.dump(model, f)

with open("normalizer.pkl", "wb") as f:
    pickle.dump(normalizer, f)

print("✅ Model and normalizer saved successfully.")

✅ Model and normalizer saved successfully.
