In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import calendar as cal
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the diabetes dataset (path is relative to the notebook's working directory)
# and preview the first five rows.
filepath = "diabetes.csv"
data_pool = pd.read_csv(filepath)

data_pool.head()

In [None]:
# Column names of the loaded frame, as a NumPy array.
data_pool.columns.values

In [None]:
# (rows, columns) of the loaded frame.
data_pool.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data_pool.describe()

In [None]:
# Per-column dtypes; shows which columns, if any, are non-numeric.
data_pool.dtypes

In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): this repeats an identical describe() call from an earlier cell — confirm whether it is still needed.
data_pool.describe()

In [None]:
# Number of missing (NaN/None) values in each column.
data_pool.isnull().sum()

In [None]:
# Convert string data to numbers using LabelEncoder.
# Only non-numeric columns are encoded: label-encoding a numeric column would
# replace its real measurements with rank codes and discard the spacing between
# values. If the dataset has no such columns, `encoders` simply stays empty.
categorical_columns = data_pool.select_dtypes(exclude=["number", "bool"]).columns
encoders = dict()
for col in categorical_columns:
    # Keep one fitted encoder per column so the mapping can be reused or inverted later.
    encoders[col] = LabelEncoder()
    data_pool[col] = encoders[col].fit_transform(data_pool[col])

In [None]:
# Preview the first five rows after the encoding step.
data_pool.head(5)

In [None]:
# Feature selection: pick the predictor columns and the label column.
features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
target = ["Outcome"]

X = data_pool[features]
# Selecting with a one-element list yields a single-column DataFrame;
# squeeze() collapses it to a Series.
y = data_pool[target].squeeze()

In [None]:
# Split the data into training and testing sets.
# 10% of the rows are held out for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=99,
)

In [None]:
# Inspect one training row by position. A label lookup (X_train.loc[50]) raises
# KeyError whenever the shuffled split sends index label 50 to the test set;
# positional access does not depend on which labels ended up in the training set.
X_train.iloc[50]

In [None]:
# Show the container type of the training features returned by the split.
type(X_train)

In [None]:
# Logistic Regression: fit on the training split, predict the held-out split,
# and report the mean accuracy on that held-out split.
logistic_model = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)

# Predictions are kept for the confusion matrix plotted later.
y_pred_lm = logistic_model.predict(X_test)

lm_accuracy = logistic_model.score(X_test, y_test)
print("Accuracy:", lm_accuracy)

In [None]:
# SVM: linear-kernel support vector classifier with regularization strength C=1.
from sklearn import svm

svm_model = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

# Predictions are kept for the confusion matrix plotted later.
y_pred_svm = svm_model.predict(X_test)

svm_accuracy = svm_model.score(X_test, y_test)
print("Accuracy:", svm_accuracy)

In [None]:
# Plot styling and imports for the confusion matrices
plt.style.use('ggplot')
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Confusion matrix for the logistic regression model (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred_lm, labels=logistic_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logistic_model.classes_)
disp.plot()
# The 'ggplot' style draws grid lines across the heatmap cells; turn them off.
disp.ax_.grid(False)
# Title the figure so it can be told apart from the other model's matrix.
disp.ax_.set_title("Confusion matrix - Logistic Regression")
plt.show()

In [None]:
# Confusion matrix for the SVM model (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred_svm, labels=svm_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=svm_model.classes_)
disp.plot()
# The 'ggplot' style draws grid lines across the heatmap cells; turn them off.
disp.ax_.grid(False)
# Title the figure so it can be told apart from the other model's matrix.
disp.ax_.set_title("Confusion matrix - SVM")
plt.show()

We decided to go with the logistic regression model.

In [None]:
import joblib

# Write all three artifacts to the same relative directory. The model used to
# be written to 'C:data-files/...' while the other two went to 'data-files/...',
# which only resolve to the same place on Windows with the working directory on C:.
ARTIFACT_DIR = 'data-files'
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Serialize the trained model
joblib.dump(logistic_model, os.path.join(ARTIFACT_DIR, 'diabetes_model.pkl'))
print("Model dumped!")

# Serialize the fitted label encoders (column -> encoder)
joblib.dump(encoders, os.path.join(ARTIFACT_DIR, 'diabetes_le.pkl'))
print("Encoders dumped!")

# Serialize the model's input columns, taken from the training frame so the
# saved list always matches what the model was actually fitted on (the previous
# hard-coded list named columns that are not in this dataset's feature set).
features_decode = list(X_train.columns)
joblib.dump(features_decode, os.path.join(ARTIFACT_DIR, 'diabetes_model_columns.pkl'))
print("Models columns dumped!")