In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Load the dataset from the CSV file
df = pd.read_csv('kidney_disease.csv')

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Assuming 'classification' is the target variable
# Separate features (X) and target variable (y)
X = df.drop('classification', axis=1)
y = df['classification']

# Handle missing values by imputing with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# One-hot encode categorical variables
X_imputed = pd.get_dummies(X_imputed, drop_first=True)

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=42)

# Standardize the features using StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Create a Logistic Regression model and fit it to the training data
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr.predict(X_test)

# Display confusion matrix and classification report
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")




Missing values:
 id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64


  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)
  X_imputed = pd.get_dummies(X_imputed, drop_first=True)


Confusion Matrix:
 [[76  0]
 [ 0 44]]

Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        76
      notckd       1.00      1.00      1.00        44

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Accuracy: 1.0000
F1 Score: 1.0000
