In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pickle

# Step 1: Load Dataset
df = pd.read_csv("/content/drive/MyDrive/ckd project/ckd_decisive_attributes.csv")

# Step 2: Handle missing values
df.replace("?", np.nan, inplace=True)
df = df.dropna()

# Step 3: Convert columns to correct types
numeric_cols = ['sc', 'al', 'sg', 'hemo', 'pcv', 'rc', 'bp', 'bgr', 'bu']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col])

# Encode categorical 'rbc' and 'classification'
le = LabelEncoder()
df['rbc'] = le.fit_transform(df['rbc'])  # normal=1, abnormal=0
df['classification'] = df['classification'].map({'ckd': 1, 'notckd': 0})

# Step 4: Split features and target
X = df.drop('classification', axis=1)
y = df['classification']

# Step 5: Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 7: Model training and evaluation
models = {
    "SVM": SVC(),
    "RandomForest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.2f}")

# Step 8: Save the best model (e.g., RandomForest here)
best_model = models["RandomForest"]
with open("ckd_model.pkl", "wb") as f:
    pickle.dump(best_model, f)


SVM Accuracy: 1.00
RandomForest Accuracy: 1.00


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 1.00
LogisticRegression Accuracy: 1.00
DecisionTree Accuracy: 1.00


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
from google.colab import drive



# Handle missing values BEFORE encoding
df.replace("?", np.nan, inplace=True)
df = df.dropna()

# Encode target column if not numeric - fit ONLY on the remaining data
if df['classification'].dtype == 'object':
    target_le = LabelEncoder()
    # Fit target_le only on the cleaned data's classification column
    df['classification'] = target_le.fit_transform(df['classification'])
else:
    target_le = None

# Encode categorical features (save encoders)
feature_encoders = {}
# Use df.select_dtypes(include='object').columns to find object columns
for col in df.select_dtypes(include='object').columns:
    if col != 'classification': # Ensure 'classification' is not processed here if already encoded
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        feature_encoders[col] = le


# Separate features and target
X = df.drop('classification', axis=1)
y = df['classification']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save column names for later use
trained_features_cols = X_train.columns.tolist()

# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("✅ CKD Model Evaluation:")
print("Accuracy:", accuracy)
print("\nClassification Report:")

# Get the actual class names from the fitted target_le
# Ensure that the classes are in the correct order if needed,
# but LabelEncoder usually sorts them.
# You can explicitly define the target names if the encoder order is uncertain
# target_names = ['notckd', 'ckd'] # Assuming 'notckd' maps to 0 and 'ckd' maps to 1 after dropping '?'
# if target_le:
#    print(classification_report(y_test, y_pred, target_names=target_names))
# else:
#    print(classification_report(y_test, y_pred))

# Use the classes from the LabelEncoder fitted *after* dropping NaNs
if target_le:
     # Use target_le.classes_ only if it exists and has the correct number of classes
     if len(target_le.classes_) == 2:
         print(classification_report(y_test, y_pred, target_names=target_le.classes_))
     else:
         # Fallback or error handling if encoder classes don't match y_test/y_pred
         print("Warning: LabelEncoder classes do not match prediction classes.")
         print(classification_report(y_test, y_pred))
else:
    print(classification_report(y_test, y_pred))


# Save model and components
model_save_path = '/content/drive/MyDrive/ckd project/ckd_model.pkl'
target_encoder_save_path = '/content/drive/MyDrive/ckd project/target_encoder.pkl'
feature_encoders_save_path = '/content/drive/MyDrive/ckd project/feature_encoders.pkl'
feature_columns_save_path = '/content/drive/MyDrive/ckd project/feature_columns.pkl'

joblib.dump(model, model_save_path)
if target_le and len(target_le.classes_) == 2: # Save only if the target encoder was used and has 2 classes
    joblib.dump(target_le, target_encoder_save_path)
joblib.dump(feature_encoders, feature_encoders_save_path)
joblib.dump(trained_features_cols, feature_columns_save_path)

print(f"\n✅ Model saved to: {model_save_path}")
if target_le and len(target_le.classes_) == 2:
    print(f"✅ Target encoder saved to: {target_encoder_save_path }")
print(f"✅ Feature encoders saved to: {feature_encoders_save_path}")
print(f"✅ Feature columns list saved to: {feature_columns_save_path}")

✅ CKD Model Evaluation:
Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           2       1.00      1.00      1.00        25

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54


✅ Model saved to: /content/drive/MyDrive/ckd project/ckd_model.pkl
✅ Feature encoders saved to: /content/drive/MyDrive/ckd project/feature_encoders.pkl
✅ Feature columns list saved to: /content/drive/MyDrive/ckd project/feature_columns.pkl


In [8]:
# Load the model
with open("ckd_model.pkl", "rb") as f:
    model = pickle.load(f)

# Example input (replace with real input values)
input_data = [[1.2, 2, 1.015, 12.5, 40, 4.8, 80, 145, 35, 1]]  # last value is 'rbc' encoded

# Standardize input (use same scaler from training)
input_scaled = scaler.transform(input_data)

# Make prediction
prediction = model.predict(input_scaled)
print("CKD Detected" if prediction[0] == 1 else "No CKD Detected")


CKD Detected




In [9]:
# Example input representing CKD Not Detected
# [sc, al, sg, hemo, pcv, rc, bp, bgr, bu, rbc]
example_input = [[1.0, 0, 1.020, 14.0, 44, 5.0, 80, 120, 15, 1]]

# Scale input (ensure you're using the same scaler from training)
input_scaled = scaler.transform(example_input)

# Predict
prediction = model.predict(input_scaled)

# Output result
print("CKD Detected" if prediction[0] == 1 else "No CKD Detected")


No CKD Detected


