In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from logisitic_regression import LogisticRegression
from neural_network import NeuralNetwork
from random_forest_classifier import RandomForestClassifier
from kmeans import KMeans

In [22]:
file = '../datasets/Cirrhosis.csv'
data = pd.read_csv(file)

data.dropna(inplace=True)
data.drop(columns=['ID', 'N_Days'], inplace=True)
data['Age'] = data['Age'] / 365.25  # Convert age from days to years

X = data.drop('Stage', axis=1)
y = data['Stage']

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Preprocessing for numerical data: scaling
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
# Preprocessing for categorical data: one-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess data
X_processed = preprocessor.fit_transform(X)

In [23]:
# Split the data into training, validation, and test sets (70%, 15%, 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X_processed, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [24]:
# Train the Logistic Regression model
model = LogisticRegression(learning_rate=0.01, num_iterations=1000, C=1.0)
model.fit(X_train, y_train, num_classes=len(np.unique(y)))

y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

print("Validation Set Performance:")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

print("\nTest Set Performance:")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

Validation Set Performance:
[[ 0  1  0  0]
 [ 0  1  6  0]
 [ 0  3 14  0]
 [ 0  2 14  0]]
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         1
         2.0       0.14      0.14      0.14         7
         3.0       0.41      0.82      0.55        17
         4.0       0.00      0.00      0.00        16

    accuracy                           0.37        41
   macro avg       0.14      0.24      0.17        41
weighted avg       0.20      0.37      0.25        41

Validation Accuracy: 0.36585365853658536

Test Set Performance:
[[ 0  0  2  0]
 [ 0  2  9  0]
 [ 0  3 16  0]
 [ 0  0 10  0]]
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         2
         2.0       0.40      0.18      0.25        11
         3.0       0.43      0.84      0.57        19
         4.0       0.00      0.00      0.00        10

    accuracy                           0.43        42
   macro avg       0.21  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Initialize and train the KMeans model
kmeans = KMeans(k=4, max_iters=100)
kmeans.fit(X_train)

# Predict on validation and test sets
y_val_pred = kmeans.predict(X_val)
y_test_pred = kmeans.predict(X_test)

# Convert one-hot encoded labels back to original labels for evaluation
y_val = np.argmax(y_val, axis=1)
y_test = np.argmax(y_test, axis=1)

# Evaluate KMeans performance
print("KMeans Validation Set Performance:")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

print("\nKMeans Test Set Performance:")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

In [None]:
# Train the RandomForestClassifier model
rf = RandomForestClassifier(n_estimators=100, max_depth=None)
y_train_labels = np.argmax(y_train, axis=1)  # Convert one-hot to single labels
rf.fit(X_train, y_train_labels)

y_val_pred_rf = rf.predict(X_val)
y_test_pred_rf = rf.predict(X_test)

print("\nRandom Forest Validation Set Performance:")
print(confusion_matrix(y_val, y_val_pred_rf))
print(classification_report(y_val, y_val_pred_rf))
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred_rf))

print("\nRandom Forest Test Set Performance:")
print(confusion_matrix(y_test, y_test_pred_rf))
print(classification_report(y_test, y_test_pred_rf))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred_rf))

In [None]:
# Train the KMeans model
kmeans = KMeans(k=4, max_iters=100)
kmeans.fit(X_train)

y_val_pred = kmeans.predict(X_val)
y_test_pred = kmeans.predict(X_test)

y_val = np.argmax(y_val, axis=1)
y_test = np.argmax(y_test, axis=1)

print("KMeans Validation Set Performance:")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

print("\nKMeans Test Set Performance:")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))