# Import Required Libraries
Import the necessary libraries, including pandas, scikit-learn, and joblib.

In [59]:
# Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load Dataset
Load the Chronic Kidney Disease Dataset using pandas.

In [60]:
# Load Dataset
df = pd.read_csv(r"Chronic Kidney Disease Dataset.csv")  # Load the dataset into a DataFrame
df.head()  # Display the first few rows of the dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


# Preprocess Data
Handle missing values, encode categorical variables, and normalize numerical features.

In [61]:
# Preprocess Data

# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)  # Fill missing numerical values with median
df.fillna(df.mode().iloc[0], inplace=True)  # Fill missing categorical values with mode

# Encode categorical variables
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Normalize numerical features
from sklearn.preprocessing import StandardScaler
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Check the column names to find the correct target column name
print(df.columns)

print(df.head())

# Assuming 'class' is the target column, replace 'class' with the correct column name if different
target_column = "classification_ckd\t"  # Replace 'class' with the correct column name if different

# Split the data into features and target
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod',
       ...
       'dm_ yes', 'dm_no', 'dm_yes', 'cad_no', 'cad_yes', 'appet_poor',
       'pe_yes', 'ane_yes', 'classification_ckd\t', 'classification_notckd'],
      dtype='object', length=210)
         id       age        bp        sg        al        su       bgr  \
0 -1.727726 -0.210031  0.254214  0.421486  0.076249 -0.380269 -0.320122   
1 -1.719066 -2.627234 -1.972476  0.421486  2.363728 -0.380269 -0.320122   
2 -1.710406  0.615355  0.254214 -1.421074  0.838742  2.507853  3.697618   
3 -1.701745 -0.210031 -0.488016 -2.342354  2.363728 -0.380269 -0.373337   
4 -1.693085 -0.033163  0.254214 -1.421074  0.838742 -0.380269 -0.519679   

         bu        sc       sod  ...  dm_ yes  dm_no  dm_yes  cad_no  cad_yes  \
0 -0.419451 -0.319668  0.040104  ...    False  False    True    True    False   
1 -0.784315 -0.390819  0.040104  ...    False   True   False    True    False   
2 -0.074858 -0.212942  0.040104  ...    F

# Split Dataset into Training and Testing Sets
Use train_test_split from scikit-learn to split the dataset into training and testing sets.

In [62]:
# Split Dataset into Training and Testing Sets

# Use train_test_split from scikit-learn to split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Model
Train a Random Forest model using the training set.

In [63]:
# Train Random Forest Model

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Save the trained model to a file
joblib.dump(rf_model, 'models/random_forest_model.pkl')

Accuracy: 1.00


['models/random_forest_model.pkl']

# Evaluate Model Accuracy
Evaluate the model's accuracy using the testing set and print the accuracy score.

In [64]:
# Evaluate Model Accuracy

# Predict on the test data
y_pred = rf_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


# Save the Model
Save the trained model to a folder using joblib.

In [65]:
# Save the Model

# Save the trained model to a file using joblib
import os

# Create a directory to save the model if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save the model
joblib.dump(rf_model, 'models/random_forest_model.pkl')

['models/random_forest_model.pkl']