In [200]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [201]:
df=pd.read_csv('indian_liver_patient.csv')

In [204]:
# Create dummy or indicator features for categorical variables
df_dummies = pd.get_dummies(df, columns=['Gender'], drop_first=True)

# Select numeric columns for scaling
numeric_cols = df_dummies.select_dtypes(include=['float64', 'int64']).columns

# Initialize the MinMaxScaler to scale features between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit and transform the numeric columns to scale them between 0 and 1
df_dummies[numeric_cols] = scaler.fit_transform(df_dummies[numeric_cols])

# Define features (X) and target (y)
X = df_dummies.drop(columns=['Dataset'])
y = df['Dataset']

# Check for NaN values before imputation
print("NaN values before imputation:")
print(X.isna().sum())

# Handle missing values by using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Convert the imputed array back to a DataFrame
X = pd.DataFrame(X, columns=df_dummies.drop(columns=['Dataset']).columns)

# Check for NaN values after imputation
print("NaN values after imputation:")
print(X.isna().sum())

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the first few rows of the transformed data
print(df_dummies.head())

NaN values before imputation:
Age                           0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Gender_Male                   0
dtype: int64
NaN values after imputation:
Age                           0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Gender_Male                   0
dtype: int64
        Age  Total_Bilirubin  Direct_Bilirubin  Alkaline_Phosphotase  \
0  0.709302         0.004021          0.000000              0.060576   
1  0.674419         0.140751          0.275510              0.310699   
2  0.674419         0.092493          0.204082             

**Random Forest Model**

In [192]:
# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

In [171]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Print confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        87
           2       1.00      1.00      1.00        30

    accuracy                           1.00       117
   macro avg       1.00      1.00      1.00       117
weighted avg       1.00      1.00      1.00       117

Confusion Matrix:
[[87  0]
 [ 0 30]]


In [173]:
print("Recall score:", recall_score(y_test, y_pred))
print("Precision score:", precision_score(y_test, y_pred))

Recall score: 1.0
Precision score: 1.0


In [175]:
rf_grid = RandomForestClassifier()
gr_space = {
    'max_depth': [3,5,7,10],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': [10, 20, 30 , 40],
    'min_samples_leaf': [1, 2, 4]
}

grid = GridSearchCV(rf_grid, gr_space, cv = 3, scoring='accuracy')
model_grid = grid.fit(X_train, y_train)

print('Best hyperparameters are '+str(model_grid.best_params_))
print('Best score is: ' + str(model_grid.best_score_))

Best hyperparameters are {'max_depth': 3, 'max_features': 10, 'min_samples_leaf': 1, 'n_estimators': 100}
Best score is: 1.0


**Logistic Regression Model**

In [210]:
# Initialize the LogisticRegression model
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)

# Create a parameter grid
param_grid = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2']
}

# Fit the model to the training data
log_reg_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = log_reg_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Print confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

           1       0.76      1.00      0.86        87
           2       1.00      0.07      0.12        30

    accuracy                           0.76       117
   macro avg       0.88      0.53      0.49       117
weighted avg       0.82      0.76      0.67       117

Confusion Matrix:
[[87  0]
 [28  2]]


In [218]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, solver='saga', tol=0.1))
grid_search = GridSearchCV(pipe, param_grid, cv=3)
grid_search.fit(X, y)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1'}
Best Score: 0.7169882809058067


**KNN Model**

In [221]:
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit the model to the training data
knn_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Print confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.67
Classification Report:
              precision    recall  f1-score   support

           1       0.79      0.75      0.77        87
           2       0.37      0.43      0.40        30

    accuracy                           0.67       117
   macro avg       0.58      0.59      0.58       117
weighted avg       0.68      0.67      0.67       117

Confusion Matrix:
[[65 22]
 [17 13]]


In [223]:
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1'}
Best Score: 0.7169882809058067


In [229]:
# Make predictions on the test data
y_pred_log_reg = log_reg_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_knn = knn_model.predict(X_test)

# Calculate accuracy
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Print accuracy scores
print(f'Logistic Regression Accuracy: {accuracy_log_reg:.2f}')
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')
print(f'K-Nearest Neighbors Accuracy: {accuracy_knn:.2f}')

Logistic Regression Accuracy: 0.76
Random Forest Accuracy: 0.75
K-Nearest Neighbors Accuracy: 0.67
