In [111]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
import joblib

In [112]:
file_path = "../datasets/HepatitisC.csv"
data = pd.read_csv(file_path)
data = data.dropna()
data = data.drop(columns=["ID"])

In [113]:
X = data.drop(['Category'], axis=1)
y = data['Category']

# Split the data into training, validation, and test sets (70%, 15%, 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

Index(['Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA',
       'GGT', 'PROT'],
      dtype='object')


In [114]:
# Define TensorFlow/Keras models
def create_tf_model(hidden_layer_sizes=[100], activation='relu', learning_rate=0.001):
    model = Sequential()
    for units in hidden_layer_sizes:
        model.add(Dense(units, activation=activation))
    model.add(Dense(4, activation='softmax'))  # Output layer for multi-class classification
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [115]:
# Define parameter grids for hyperparameter tuning
param_grid_log_reg = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs']
}

param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

In [116]:
# Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Create pipelines for models
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(multi_class='ovr', max_iter=1000))
])

pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [117]:
# Grid Search for Logistic Regression
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_log_reg, cv=cv, scoring='accuracy')

# Grid Search for Random Forest
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=cv, scoring='accuracy')

# Fit models
pipeline_lr.fit(X_train, y_train)
pipeline_rf.fit(X_train, y_train)
grid_search_lr.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)

# Get best models
best_lr = grid_search_lr.best_estimator_
best_rf = grid_search_rf.best_estimator_

# Prepare data for TensorFlow model
X_train_nn = preprocessor.fit_transform(X_train)
X_val_nn = preprocessor.transform(X_val)
X_test_nn = preprocessor.transform(X_test)



KeyboardInterrupt: 

In [None]:
# Define and train TensorFlow model
def train_tf_model(X_train_nn, y_train):
    model = create_tf_model(hidden_layer_sizes=[100], activation='relu', learning_rate=0.001)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    model.fit(X_train_nn, y_train, validation_split=0.2, epochs=100, batch_size=10, verbose=0, callbacks=[early_stopping])
    return model

tf_model = train_tf_model(X_train_nn, y_train)

# Evaluate TensorFlow model
def evaluate_tf_model(model, X_val_nn, y_val):
    y_pred = model.predict(X_val_nn).argmax(axis=1)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

nn_accuracy = evaluate_tf_model(tf_model, X_val_nn, y_val)
print(f"Neural Network Validation Accuracy: {nn_accuracy}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Neural Network Validation Accuracy: 0.9659090909090909


In [None]:
# Evaluate TensorFlow model
def evaluate_tf_model(model, X_val_nn, y_val):
    y_pred = model.predict(X_val_nn).argmax(axis=1)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

nn_accuracy = evaluate_tf_model(tf_model, X_val_nn, y_val)
print(f"Neural Network Validation Accuracy: {nn_accuracy}")

# Define ensemble models
models = [
    ('Logistic Regression', best_lr),
    ('Random Forest', best_rf)
]

stacking_clf = StackingClassifier(
    estimators=[('Logistic Regression', best_lr), ('Random Forest', best_rf)],
    final_estimator=LogisticRegression()
)

voting_clf = VotingClassifier(
    estimators=[('Logistic Regression', best_lr), ('Random Forest', best_rf)],
    voting='soft'
)

# Fit ensemble models
stacking_clf.fit(X_train, y_train)
voting_clf.fit(X_train, y_train)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Neural Network Validation Accuracy: 0.9659090909090909




In [None]:
# Determine the best model
best_model_name, best_model = None, None
best_accuracy = -float('inf')

for name, model in models + [('Stacking Classifier', stacking_clf), ('Voting Classifier', voting_clf)]:
    y_pred = model.predict(X_val)
    current_accuracy = accuracy_score(y_val, y_pred)
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_model_name = name
        best_model = model

print(f"Best Model: {best_model_name}")

# Test set evaluation
y_test_pred = best_model.predict(X_test)
print(f"Best Model ({best_model_name}) Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Best Model ({best_model_name}) Test Classification Report:\n{classification_report(y_test, y_test_pred)}")

# Save the best model and preprocessor
joblib.dump(best_model, 'best_model.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')

# Load the best model and preprocessor
best_model_loaded = joblib.load('best_model.joblib')
preprocessor_loaded = joblib.load('preprocessor.joblib')

# Function to predict category based on new input
def predict_category(input_data):
    input_df = pd.DataFrame([input_data])
    input_processed = preprocessor_loaded.transform(input_df)
    prediction = best_model_loaded.predict(input_processed)
    return prediction[0]

# Example usage
input_data = {
    'Age': 30,
    'Sex': 'm',
    'ALB': 38.5,
    'ALP': 52.5,
    'ALT': 7.7,
    'AST': 22.1,
    'BIL': 7.5,
    'CHE': 6.93,
    'CHOL': 3.23,
    'CREA': 106,
    'GGT': 12.1,
    'PROT': 76.8
}

predicted_category = predict_category(input_data)
print(f"Predicted Category: {predicted_category}")

Best Model: Logistic Regression
Best Model (Logistic Regression) Test Accuracy: 0.9662921348314607
Best Model (Logistic Regression) Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        80
           1       1.00      0.67      0.80         3
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00         4

    accuracy                           0.97        89
   macro avg       0.74      0.67      0.70        89
weighted avg       0.95      0.97      0.95        89



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ValueError: X has 13 features, but ColumnTransformer is expecting 12 features as input.