In [1]:
import sys
sys.path.append('src')
from sklearn.svm import SVC
from datetime import datetime
from src.Printer import Printer
from src.DataHelper import DataHelper
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline

In [2]:
printer = Printer(enabled=False)
dataHelper = DataHelper(show_tables=False, printer=printer)

In [4]:
df = pd.read_csv("data/diabetes_prediction_dataset.csv")

dataHelper.showInitData(df)

In [5]:
df_encoded = dataHelper.format_obj_col(df)

In [6]:
X_balanced, y_balanced = dataHelper.smote_resample(df_encoded)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [8]:
X_train_clean, y_train_clean = dataHelper.IQR(X_train, y_train)

# Models

## SVM Model

In [23]:
current_time = datetime.now()
print("Starting SVM linear", current_time.strftime("%Y-%m-%d %H:%M:%S"))
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train_clean.head(10000), y_train_clean.head(10000))

Starting SVM linear 2024-03-17 02:40:13


SVC(kernel='linear')

### Testing svm linear

In [25]:
svm_linear_predictions = svm_linear.predict(X_test.head(10000))
svm_linear_accuracy = accuracy_score(y_test.head(10000), svm_linear_predictions)
print(f'SVM Linear Kernel Accuracy: {svm_linear_accuracy:.2f}')

SVM Linear Kernel Accuracy: 0.89


### Testing svm nonlinear

In [30]:
current_time = datetime.now()
print(" Starting SVM nonlinear", current_time.strftime("%Y-%m-%d %H:%M:%S"))
svm_rbf = SVC(kernel='rbf', gamma='auto', C=1)
svm_rbf.fit(X_train_clean.head(10000), y_train_clean.head(10000))

 Starting SVM nonlinear 2024-03-17 02:44:16


SVC(C=1, gamma='auto')

In [33]:
svm_rbf_predictions = svm_rbf.predict(X_test)
svm_rbf_accuracy = accuracy_score(y_test, svm_rbf_predictions)
print(f'SVM RBF Kernel Accuracy: {svm_rbf_accuracy:.2f}')

SVM RBF Kernel Accuracy: 0.85


## MLP Model

## Creating layers

In [34]:
current_time = datetime.now()
print(" Starting MLP Model", current_time.strftime("%Y-%m-%d %H:%M:%S"))
mlp_model = Sequential()
mlp_model.add(Dense(32, activation='relu', input_shape=(X_train_clean.shape[1],)))
mlp_model.add(Dense(16, activation='relu'))
mlp_model.add(Dense(1, activation='sigmoid'))

mlp_model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

 Starting MLP Model 2024-03-17 02:47:57


## Fitting model

In [35]:
mlp_history = mlp_model.fit(X_train_clean, y_train_clean, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

## Testing

In [37]:
mlp_scores = mlp_model.evaluate(X_test, y_test)
print(f'MLP Accuracy: {mlp_scores[1]:.2f}')

MLP Accuracy: 0.90


## RBF Model

In [51]:
# Define the RBFSampler
rbf_feature = RBFSampler(gamma=0.1, random_state=1)

# Define the SVM classifier
svm_linear_for_rbf = SVC(kernel='linear', C=1)

# Create a pipeline
rbf_model = make_pipeline(rbf_feature, svm_linear_for_rbf)

# Fit the model to the training data
rbf_model.fit(X_train_clean.head(10000), y_train_clean.head(10000))

Pipeline(steps=[('rbfsampler', RBFSampler(gamma=0.1, random_state=1)),
                ('svc', SVC(C=1, kernel='linear'))])

## Testing

In [52]:
# Predict on the test set
rbf_predictions = rbf_model.predict(X_test)

# Evaluate the model
rbf_accuracy = accuracy_score(y_test, rbf_predictions)
rbf_report = classification_report(y_test, rbf_predictions)

print("RBF Model Accuracy:", rbf_accuracy)
print("RBF Model Classification Report:\n", rbf_report)

RBF Model Accuracy: 0.593688524590164
RBF Model Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.55      0.57     18293
           1       0.59      0.64      0.61     18307

    accuracy                           0.59     36600
   macro avg       0.59      0.59      0.59     36600
weighted avg       0.59      0.59      0.59     36600



## RBF Model

In [50]:
# Define the RBFSampler
rbf_feature = RBFSampler(random_state=1)

# Define the SVM classifier
svm_linear_for_rbf = SVC(kernel='linear')

# Create a pipeline
rbf_model = make_pipeline(rbf_feature, svm_linear_for_rbf)

# Define the parameter grid
param_grid = {
    'rbfsampler__gamma': [0.1, 1, 10],  # Adjust the gamma values as needed
    'svc__C': [0.1, 1, 10],  # Adjust the C values as needed
}

# Create the GridSearchCV object
grid_search = GridSearchCV(rbf_model, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the training data
grid_search.fit(X_train_clean.head(10000), y_train_clean.head(10000))

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Predict on the test set using the best estimator
y_pred = best_estimator.predict(X_test.head(10000))

# Evaluate the model
accuracy = accuracy_score(y_test.head(10000), y_pred)
report = classification_report(y_test.head(10000), y_pred)

print("Model Accuracy:", accuracy)
print("Classification Report:\n", report)

Best Parameters: {'rbfsampler__gamma': 0.1, 'svc__C': 1}
Model Accuracy: 0.5895
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.55      0.57      5023
           1       0.58      0.63      0.61      4977

    accuracy                           0.59     10000
   macro avg       0.59      0.59      0.59     10000
weighted avg       0.59      0.59      0.59     10000



In [30]:
print("RBF Model Accuracy:", rbf_accuracy)
print(f'MLP Accuracy: {mlp_scores[1]:.2f}')
print(f'SVM RBF Kernel Accuracy: {svm_rbf_accuracy:.2f}')
print(f'SVM Linear Kernel Accuracy: {svm_linear_accuracy:.2f}')

RBF Model Accuracy: 0.593688524590164
MLP Accuracy: 0.89
SVM RBF Kernel Accuracy: 0.85
SVM Linear Kernel Accuracy: 0.89


In [37]:
type(X_test.head(1))


pandas.core.frame.DataFrame

In [39]:
svm_linear_predictions

array([1, 0, 0, ..., 1, 1, 0], dtype=int64)

In [52]:
# Define the features for the individual case with all columns specified
data = {
    'gender': [0],
    'age': [54.0],
    'hypertension': [0],
    'heart_disease': [0],
    'smoking_history': [0],
    'bmi': [27.32],
    'HbA1c_level': [6.6],
    'blood_glucose_level': [80]
}

# Create a DataFrame for the individual case with the same columns as the training data
individual_case = pd.DataFrame(data, columns=X_train.columns)

# Use svm_linear.predict on this individual_case
prediction = svm_linear.predict(individual_case)

# Print the prediction
print("Prediction:", prediction)

Prediction: [0]
