Last things I did:

- Implement SMOTE
- Use SMOTE on SVM
- Try to build a Neural Network with Tensorflow. Watch this: https://youtu.be/VtRLrQ3Ev-U?si=RIMFdXbsnMavwTxH&t=3093

## Importing Libraries

In [1]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense



## NHANES data from 2017 - 2020

In [None]:
# Load the modelling dataset from the CSV file.
df = pd.read_csv('/Users/tobiasmadsen/Documents/UMich/MDST/NHANES/data_files/eda_data.csv')

# The modelling cells below read the data under the name `df_merged`; bind that
# name to the same frame so a top-to-bottom run does not stop with a NameError.
df_merged = df

## Prep

### Train Test Split

In [None]:
# Separate the target column (y) from the predictor columns (X)
y = df_merged['Doctor_Told_Diabetes']
X = df_merged.drop(columns='Doctor_Told_Diabetes')

# Hold out 20% of the rows as a test set; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Scaling the data

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


### SMOTE

In [None]:
# Oversample the scaled training split with SMOTE (fixed seed).
# Only the training data is resampled; the test split is left as it is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train_scaled, y=y_train)

## Logistic Regression

In [None]:
# Train a logistic regression classifier on the SMOTE-resampled training data
# (fit() returns the estimator itself, so it can be chained)
logreg = LogisticRegression(max_iter=10000).fit(X_train_smote, y_train_smote)

# Score it on the scaled test split
y_pred = logreg.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Table of features with their fitted coefficients, largest coefficient first
features = X.columns
coefficients = logreg.coef_[0]
coeff_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)
print(sorted_coeff_df)

# Confusion matrix: first as raw counts ...
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# ... then as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='g',
    cmap='Reds',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
)
plt.show()


## Decision Tree

In [None]:
# Fit a decision tree on the training split as produced by train_test_split
# (this model uses neither the scaled nor the SMOTE-resampled data)
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the test split and report accuracy plus per-class metrics
y_pred_tree = dtree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_tree)
print(f"Decision Tree Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred_tree))

# Optional: draw the top levels of the fitted tree
# plt.figure(figsize=(30,15))
# plot_tree(dtree, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'], max_depth=3)
# plt.show()

# Confusion matrix rendered as a heatmap
cm = confusion_matrix(y_test, y_pred_tree)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='g',
    cmap='Reds',
    xticklabels=['Predicted Negative', 'Predicted Positive'],
    yticklabels=['Actual Negative', 'Actual Positive'],
)
plt.title('Decision Tree Confusion Matrix')
plt.show()

## Random Forest

In [None]:
# Random forest with 100 trees, trained on the SMOTE-resampled training data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_smote, y_train_smote)

# Predict the scaled test split and report accuracy plus per-class metrics
y_pred_rf = rf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix rendered as a heatmap
cm = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
)
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier trained on the SMOTE-resampled training data
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_smote, y_train_smote)

# Predict the scaled test split and report accuracy plus per-class metrics
y_pred_knn = knn.predict(X_test_scaled)
print("KNN Model Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

# Confusion matrix: raw counts first ...
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("KNN Model Confusion Matrix:")
print(cm_knn)

# ... then as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_knn,
    annot=True,
    fmt='g',
    cmap='Greens',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
)
plt.title("Confusion Matrix for KNN Model")
plt.show()


### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for KNN: neighbourhood size, vote weighting and distance.
# 'minkowski' is left out on purpose: with the default p=2 it is the same
# distance as 'euclidean', so it would only repeat the euclidean fits.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

# Unfitted template estimator for the search. It deliberately does NOT reuse the
# name `knn`: GridSearchCV only fits clones of the estimator it is given, so
# rebinding `knn` here would replace the fitted KNN model with an unfitted one
# and break the later cells that call `knn.predict_proba(...)`.
knn_search_template = KNeighborsClassifier()

# 5-fold cross-validated grid search, scored on accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=knn_search_template,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
)

# Run the search on the scaled (not SMOTE-resampled) training split
grid_search.fit(X_train_scaled, y_train)

# Best parameter combination and its mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Predict the test split with the refitted best estimator
best_knn = grid_search.best_estimator_
y_pred_best_knn = best_knn.predict(X_test_scaled)

# Evaluate the best model
print("Accuracy of Best KNN Model:", accuracy_score(y_test, y_pred_best_knn))
print(classification_report(y_test, y_pred_best_knn))


## SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM with balanced class weights, trained on the SMOTE-resampled data
svm_model = SVC(kernel="rbf", class_weight="balanced")
svm_model.fit(X_train_smote, y_train_smote)

# Predict the scaled test split and report accuracy plus per-class metrics
y_pred_svm = svm_model.predict(X_test_scaled)
print("SVM Model Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# Confusion matrix: raw counts first ...
cm_svm = confusion_matrix(y_test, y_pred_svm)
print("SVM Model Confusion Matrix:")
print(cm_svm)

# ... then as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_svm,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
)
plt.title("Confusion Matrix for SVM Model")
plt.show()


## Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping


In [None]:

# Feed-forward binary classifier: four ReLU hidden layers, each followed by
# 30% dropout, and a single sigmoid output unit.
model = Sequential()

# Input layer
model.add(Dense(64, activation='relu', input_shape=(X_train_smote.shape[1],)))
model.add(Dropout(0.3))  # 30% dropout

# Hidden layer 1
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))  # 30% dropout

# Hidden layer 2
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))  # 30% dropout

# Hidden layer 3
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))  # 30% dropout

# Output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Explicit validation hold-out instead of `validation_split`.
# Keras takes `validation_split` from the LAST rows of the arrays, before any
# shuffling, and the SMOTE-resampled arrays are not shuffled (the synthetic rows
# typically sit at the end). A tail slice is therefore not representative of the
# training data, which distorts `val_loss` and with it the early stopping.
# A shuffled, stratified split keeps the class ratio the same in both parts.
X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(
    X_train_smote,
    np.asarray(y_train_smote),
    test_size=0.2,
    stratify=np.asarray(y_train_smote),
    random_state=42,
)

# Early stopping: stop after 5 epochs without val_loss improvement and roll the
# model back to the best epoch (by default the last epoch's weights are kept).
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

history = model.fit(
    X_nn_train, y_nn_train,
    epochs=50,  # upper bound; early stopping usually ends training sooner
    batch_size=64,
    validation_data=(X_nn_val, y_nn_val),
    callbacks=[early_stopping],
)

# Final check on the untouched test split
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Predicted probabilities for the test split, thresholded at 0.5 into 0/1 labels
y_pred_proba = model.predict(X_test_scaled)
y_pred_class = (y_pred_proba > 0.5).astype("int32")

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_class))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_class)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Negative (0)', 'Positive (1)'],
    yticklabels=['Negative (0)', 'Positive (1)'],
)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


## GaussianNB

In [None]:

from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes trained on the SMOTE-resampled training data
gnb = GaussianNB()
gnb.fit(X_train_smote, y_train_smote)

# Predict the scaled test split
y_pred_gnb = gnb.predict(X_test_scaled)

# Report per-class metrics followed by the raw confusion matrix
print("Classification Report for Gaussian Naive Bayes:\n")
print(classification_report(y_test, y_pred_gnb))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred_gnb))


## Stacked model

In [None]:
# Base-model outputs on the resampled training data; these become the input
# features of the meta-model. Positive-class probability is used where the model
# offers predict_proba; the SVM contributes its decision-function score instead.
# NOTE(review): these are in-sample outputs of models fitted on this same data;
# stacking usually uses out-of-fold predictions - confirm this is intended.
train_pred_logreg = logreg.predict_proba(X_train_smote)[:, 1]
train_pred_knn = knn.predict_proba(X_train_smote)[:, 1]
train_pred_svm = svm_model.decision_function(X_train_smote)
train_pred_rf = rf.predict_proba(X_train_smote)[:, 1]

# Neural-network output, flattened to a 1-D vector
train_pred_nn = model.predict(X_train_smote).ravel()

# One column per base model
stacked_train_predictions = np.column_stack([
    train_pred_logreg,
    train_pred_knn,
    train_pred_svm,
    train_pred_rf,
    train_pred_nn,
])


In [None]:
# Meta-model: logistic regression trained on the stacked base-model outputs
# (fit() returns the estimator itself, so it can be chained)
meta_model = LogisticRegression(max_iter=10000).fit(
    stacked_train_predictions,
    y_train_smote,
)


In [None]:
# Base-model outputs on the scaled test split, built the same way and in the
# same column order as the training features of the meta-model
test_pred_logreg = logreg.predict_proba(X_test_scaled)[:, 1]
test_pred_knn = knn.predict_proba(X_test_scaled)[:, 1]
test_pred_svm = svm_model.decision_function(X_test_scaled)
test_pred_rf = rf.predict_proba(X_test_scaled)[:, 1]

# Neural-network output, flattened to a 1-D vector
test_pred_nn = model.predict(X_test_scaled).ravel()

# One column per base model
stacked_test_predictions = np.column_stack([
    test_pred_logreg,
    test_pred_knn,
    test_pred_svm,
    test_pred_rf,
    test_pred_nn,
])

# Class labels predicted by the meta-model
final_predictions = meta_model.predict(stacked_test_predictions)



In [None]:
# Per-class metrics of the stacked (meta) model on the test split
print("Classification Report for Meta Model:\n")
print(classification_report(y_test, final_predictions))

# Confusion matrix of the meta-model, rendered as an annotated heatmap
conf_matrix = confusion_matrix(y_test, final_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix for Meta Model')
plt.show()

## 👨‍💻 ML MODEL FOR FIRST STREAMLIT VERSION

Below I will try to make a dumb machine learning model with only gender, hip circumference and moderate work activity as predictors.

Why? 
To make a basic Streamlit model that we can work more on when we have decided on the predictor values we will ask the user.
- this includes taking a string input and converting it into one of the numbers our model is using


In [None]:
#columns_for_dumb_model = ["Doctor_Told_Diabetes", "Gender", "Hip Circumference (cm)", "Vigorous recreational activities"]
#df_3 = df_merged[columns_for_dumb_model]

#df_3.info()

In [None]:
#for column in df_3.columns:
    #print(f"Unique values in {column}:")
    #print(df_3[column].unique())
    #print("-" * 50)  # Just to separate the output for better visibility


In [None]:
# Split the data into X (features) and y (target)
#X = df_3.drop('Doctor_Told_Diabetes', axis=1)
#y = df_3['Doctor_Told_Diabetes']

# Split data into training and test sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)

# Initialize logistic regression model
#logreg = LogisticRegression(max_iter=10000)

# Fit the model to the training data
#logreg.fit(X_train, y_train)

# Predict on test data
#y_pred = logreg.predict(X_test)

# Evaluate the model
#print("Accuracy:", accuracy_score(y_test, y_pred))
#print(classification_report(y_test, y_pred))

#coefficients = logreg.coef_[0]
#features = X.columns

#coeff_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})

#sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)


#print(sorted_coeff_df)

In [None]:
#X

In [None]:
#X1 = np.array([[2., 172.8, 2.]])
#X1

In [None]:
#y_pred = logreg.predict(X1)
#y_pred

In [None]:
from joblib import dump

# Save the logistic regression model and scaler
#dump(logreg, 'logreg_model.joblib')
#dump(scaler, 'scaler.joblib')