# ChE 297 - Intro to AI and Machine Learning for Chemical Engineers
## Machine Problem 2
##### Iliad Oleriana
##### MS Chemical Engineering

## 1. Early Stage Diabetes Risk Prediction

In [548]:
import pandas as pd
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

### Loading the dataset into a DataFrame (df)

In [549]:
df = pd.read_csv('diabetes_data_upload.csv')

In [550]:
display(df)
df.info()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

### Define features and target, encode the target variable, and defining categorical and numerical features

In [551]:
X = df.drop(columns=['class'])
y = df['class']

In [552]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [553]:
categorical_features = ['Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity']
numerical_features = ['Age']

In [554]:
numerical_transformer = StandardScaler()

In [555]:
categorical_transformer = OneHotEncoder(drop='first')

In [556]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

### Split into training and Test Data

In [557]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Optuna: Combined Model Selection and Hyper-parameter Tuning

In [558]:
# Define the objective function for Optuna
def objective(trial):
    # Define the model and its hyperparameters
    classifier_name = trial.suggest_categorical('classifier', ['MLP', 'RandomForest', 'XGBoost', 'LogisticRegression', 'NaiveBayes', 'SVM', 'kNN'])
    
    if classifier_name == 'MLP':
        hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50, 50)])
        alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
        learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-5, 1e-1)
        model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, alpha=alpha, learning_rate_init=learning_rate_init, max_iter=1000)

    elif classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        max_depth = trial.suggest_int('max_depth', 1, 20)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    
    elif classifier_name == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        max_depth = trial.suggest_int('max_depth', 1, 20)
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
        model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    
    elif classifier_name == 'LogisticRegression':
        C = trial.suggest_loguniform('C', 1e-5, 1e1)
        model = LogisticRegression(C=C, max_iter=1000)
    
    elif classifier_name == 'NaiveBayes':
        model = GaussianNB()
    
    elif classifier_name == 'SVM':
        C = trial.suggest_loguniform('C', 1e-5, 1e1)
        kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        model = SVC(C=C, kernel=kernel)
    
    elif classifier_name == 'kNN':
        n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Create a pipeline
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    
    # 10-fold cross-validation
    score = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    return np.mean(score)

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=100)

# Print the best trial
print('Best trial:', study.best_trial.params)

# Train the best model on the entire training set
best_params = study.best_trial.params
classifier_name = best_params.pop('classifier')

if classifier_name == 'MLP':
    model = MLPClassifier(**best_params, max_iter=1000)
elif classifier_name == 'RandomForest':
    model = RandomForestClassifier(**best_params)
elif classifier_name == 'XGBoost':
    model = XGBClassifier(**best_params)
elif classifier_name == 'LogisticRegression':
    model = LogisticRegression(**best_params, max_iter=1000)
elif classifier_name == 'NaiveBayes':
    model = GaussianNB()
elif classifier_name == 'SVM':
    model = SVC(**best_params)
elif classifier_name == 'kNN':
    model = KNeighborsClassifier(**best_params)

clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')

[I 2024-06-04 04:21:14,137] A new study created in memory with name: no-name-f84fe471-9bde-4761-95a7-05d517d2a1a2
[I 2024-06-04 04:21:14,334] Trial 0 finished with value: 0.8807692307692309 and parameters: {'classifier': 'NaiveBayes'}. Best is trial 0 with value: 0.8807692307692309.
[I 2024-06-04 04:21:14,518] Trial 1 finished with value: 0.8807692307692309 and parameters: {'classifier': 'NaiveBayes'}. Best is trial 0 with value: 0.8807692307692309.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
[I 2024-06-04 04:21:15,280] Trial 2 finished with value: 0.6153846153846153 and parameters: {'classifier': 'XGBoost', 'n_estimators': 45, 'max_depth': 9, 'learning_rate': 0.0008247811350541594}. Best is trial 0 with value: 0.8807692307692309.
[I 2024-06-04 04:21:15,515] Trial 3 finished with value: 0.8807692307692309 and parameters: {'classifier': 'NaiveBayes'}. Best is trial 0 with value: 0.8807692307692309.
  learning_rate = trial.suggest_loguniform('learning_rate', 1

Best trial: {'classifier': 'RandomForest', 'n_estimators': 45, 'max_depth': 19}
Accuracy: 0.9903846153846154
F1-score: 0.9929078014184397


### Finding the better Random Forest Model with higher F1 Score

In [560]:
# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameters for Random Forest
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.2, 0.5, 0.8])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Create a pipeline
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    
    # 10-fold cross-validation
    score = cross_val_score(clf, X, y, cv=10, scoring='f1_weighted')
    return np.mean(score)

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=200)

# Print the best trial
print('Best trial:', study.best_trial.params)

# Train the best model on the entire training set
best_params = study.best_trial.params

model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42
)

clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')

[I 2024-06-04 04:31:08,913] A new study created in memory with name: no-name-4c4204d1-f73b-481e-be49-2f7c12022c58
[I 2024-06-04 04:31:11,056] Trial 0 finished with value: 0.9262346493638578 and parameters: {'n_estimators': 149, 'max_depth': 28, 'min_samples_split': 6, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 0.9262346493638578.
[I 2024-06-04 04:31:14,866] Trial 1 finished with value: 0.93401265929622 and parameters: {'n_estimators': 264, 'max_depth': 27, 'min_samples_split': 17, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 1 with value: 0.93401265929622.
[I 2024-06-04 04:31:18,775] Trial 2 finished with value: 0.944136176108622 and parameters: {'n_estimators': 294, 'max_depth': 15, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_features': 0.2}. Best is trial 2 with value: 0.944136176108622.
[I 2024-06-04 04:31:19,643] Trial 3 finished with value: 0.9104515365614894 and parameters: {'n_estimators': 53, 'max_depth': 15, 'min_samples_

Best trial: {'n_estimators': 212, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 0.2}
Accuracy: 0.9903846153846154
F1-score: 0.9904222748776574
