Importing the Dependencies

In [703]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

Data Collection and Analysis

PIMA Diabetes Dataset

In [704]:
# Read the PIMA diabetes CSV into a DataFrame.  Column labels are supplied
# here, so the file's first line is read as data (header=None is what pandas
# already does when `names` is given; it is only spelled out for the reader).
feature_names = [
    'pregnancies', 'glucose', 'bloodpressure', 'skinthickness',
    'insulin', 'bmi', 'dpf', 'age', 'outcome',
]
diabetes_dataset = pd.read_csv('diabetes.csv', header=None, names=feature_names)

In [705]:
# Preview the first five rows; as the cell's last expression the frame is
# rendered by the notebook rather than printed.
diabetes_dataset.head()

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [706]:
# (rows, columns) of the loaded frame: the recorded run shows 768 records and
# 9 columns (8 features plus the `outcome` label).
diabetes_dataset.shape

(768, 9)

In [707]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
# NOTE(review): the recorded minimum is 0 for glucose, bloodpressure,
# skinthickness, insulin and bmi; zeros there presumably stand in for missing
# measurements -- TODO confirm and decide whether to impute them.
diabetes_dataset.describe()

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,dpf,age,outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [708]:
# Class balance of the label: the recorded run shows 500 rows with outcome 0
# and 268 with outcome 1.  The train/test split further down stratifies on y,
# which keeps this ratio in both parts.
diabetes_dataset['outcome'].value_counts()

0    500
1    268
Name: outcome, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [709]:
# Mean of every feature within each outcome class (one row per class), for a
# quick look at how the two groups differ.
diabetes_dataset.groupby('outcome').mean()

Unnamed: 0_level_0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,dpf,age
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [710]:
# Split the frame into the label vector and the predictor matrix
# (every column except `outcome`).
y = diabetes_dataset['outcome']
X = diabetes_dataset.drop(columns=['outcome'])

In [711]:
# Rebind `feature_names` to the predictor columns only; the list used when
# loading the CSV also contained the 'outcome' label.
feature_names = list(X.columns)

In [712]:
# Plain-text dump of the predictor frame (pandas elides the middle rows).
print(X)

     pregnancies  glucose  bloodpressure  skinthickness  insulin   bmi    dpf  \
0              6      148             72             35        0  33.6  0.627   
1              1       85             66             29        0  26.6  0.351   
2              8      183             64              0        0  23.3  0.672   
3              1       89             66             23       94  28.1  0.167   
4              0      137             40             35      168  43.1  2.288   
..           ...      ...            ...            ...      ...   ...    ...   
763           10      101             76             48      180  32.9  0.171   
764            2      122             70             27        0  36.8  0.340   
765            5      121             72             23      112  26.2  0.245   
766            1      126             60              0        0  30.1  0.349   
767            1       93             70             31        0  30.4  0.315   

     age  
0     50  
1    

In [713]:
# Plain-text dump of the label Series (0 = non-diabetic, 1 = diabetic).
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: outcome, Length: 768, dtype: int64


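Data Standardization (currently disabled: the scaler cells below are commented out, so the models are trained on the raw, unscaled features)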

In [714]:
# scaler = StandardScaler()

In [715]:
# scaler.fit(X)

In [716]:
# standardized_data = scaler.transform(X)

In [717]:
# print(standardized_data)

In [718]:
# Standardization is switched off: the assignment that would replace X with
# the scaled array is commented out, so X remains the raw feature frame.
# X = standardized_data
# y is re-derived from the frame; this is the same column selected when X and
# y were first separated.
y = diabetes_dataset['outcome']

In [719]:
# Re-print features and labels to confirm they are unchanged after the
# (disabled) standardization section.
print(X)
print(y)

     pregnancies  glucose  bloodpressure  skinthickness  insulin   bmi    dpf  \
0              6      148             72             35        0  33.6  0.627   
1              1       85             66             29        0  26.6  0.351   
2              8      183             64              0        0  23.3  0.672   
3              1       89             66             23       94  28.1  0.167   
4              0      137             40             35      168  43.1  2.288   
..           ...      ...            ...            ...      ...   ...    ...   
763           10      101             76             48      180  32.9  0.171   
764            2      122             70             27        0  36.8  0.340   
765            5      121             72             23      112  26.2  0.245   
766            1      126             60              0        0  30.1  0.349   
767            1       93             70             31        0  30.4  0.315   

     age  
0     50  
1    

Train Test Split

In [720]:
# Stratified hold-out split: 9% of the rows become the test set (70 rows
# according to the classification reports below), the class ratio of y is
# preserved in both parts, and the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.09, stratify=y, random_state=42)

In [721]:
# Get the feature names
# feature_names = X.columns.tolist()

In [722]:
# # Initialize and fit the scaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [723]:
# print(X.shape, X_train.shape, X_test.shape)

Training the Model

In [724]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Baseline classifier: logistic regression with default hyper-parameters.
# NOTE(review): the features are unscaled (the standardization cells are
# commented out) and UserWarning is silenced in the imports cell, so a solver
# convergence warning would not be visible here -- confirm the fit converges.
lr_model = LogisticRegression()

# 5-fold cross-validation restricted to the training split.
cv_scores = cross_val_score(estimator=lr_model, X=X_train, y=y_train, cv=5)
print("Cross-validated accuracy scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())

# Fit on the whole training split, then gather every metric before reporting.
lr_model.fit(X_train, y_train)
training_data_accuracy_lr = lr_model.score(X_train, y_train)
test_data_accuracy_lr = lr_model.score(X_test, y_test)
y_pred_lr = lr_model.predict(X_test)
conf_matrix_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
class_report_lr = classification_report(y_true=y_test, y_pred=y_pred_lr)

# Report: train accuracy, held-out accuracy, confusion matrix, per-class report.
print('Accuracy score of the training data:', training_data_accuracy_lr)
print('Accuracy score of the test data:', test_data_accuracy_lr)
print('Confusion Matrix:\n', conf_matrix_lr)
print('Classification Report:\n', class_report_lr)


Cross-validated accuracy scores: [0.75       0.8        0.71428571 0.8057554  0.76258993]
Mean accuracy: 0.7665262076053443
Accuracy score of the training data: 0.7822349570200573
Accuracy score of the test data: 0.8
Confusion Matrix:
 [[38  8]
 [ 6 18]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.83      0.84        46
           1       0.69      0.75      0.72        24

    accuracy                           0.80        70
   macro avg       0.78      0.79      0.78        70
weighted avg       0.80      0.80      0.80        70



In [725]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default (unconstrained) hyper-parameters.  The seed is
# pinned to 42, like the train/test split and the random forest, so that the
# tree and every score printed below are identical on each
# "Restart & Run All" instead of changing from run to run.
# NOTE(review): the previously recorded run scored 1.0 on the training split
# but only ~0.66 in cross-validation, i.e. the unconstrained tree overfits;
# consider limiting max_depth / min_samples_leaf.
dt_model = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validation restricted to the training split.
cv_scores = cross_val_score(dt_model, X_train, y_train, cv=5)
print("Cross-validated accuracy scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())

# Fit on the whole training split and score the data it was fitted on.
dt_model.fit(X_train, y_train)
training_data_accuracy_dt = dt_model.score(X_train, y_train)
print('Accuracy score of the training data:', training_data_accuracy_dt)

# Accuracy on the held-out test split.
test_data_accuracy_dt = dt_model.score(X_test, y_test)
print('Accuracy score of the test data:', test_data_accuracy_dt)

# Confusion matrix (rows: true class, columns: predicted class).
y_pred_dt = dt_model.predict(X_test)
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
print('Confusion Matrix:\n', conf_matrix_dt)

# Per-class precision / recall / F1 on the test split.
class_report_dt = classification_report(y_test, y_pred_dt)
print('Classification Report:\n', class_report_dt)


Cross-validated accuracy scores: [0.6        0.68571429 0.62857143 0.71942446 0.69064748]
Mean accuracy: 0.6648715313463515
Accuracy score of the training data: 1.0
Accuracy score of the test data: 0.7
Confusion Matrix:
 [[35 11]
 [10 14]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.76      0.77        46
           1       0.56      0.58      0.57        24

    accuracy                           0.70        70
   macro avg       0.67      0.67      0.67        70
weighted avg       0.70      0.70      0.70        70



In [726]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed for repeatable results.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# 5-fold cross-validation restricted to the training split.
cv_scores = cross_val_score(estimator=rf_model, X=X_train, y=y_train, cv=5)
print("Cross-validated accuracy scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())

# Fit on the whole training split, then gather every metric before reporting.
rf_model.fit(X_train, y_train)
accuracy_rf = rf_model.score(X_train, y_train)  # accuracy on the training split itself
test_data_accuracy_rf = rf_model.score(X_test, y_test)
y_pred_rf = rf_model.predict(X_test)
conf_matrix_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
class_report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)

# Report: train accuracy, held-out accuracy, confusion matrix, per-class report.
print(f'rf Accuracy: {accuracy_rf}')
print('Accuracy score of the test data:', test_data_accuracy_rf)
print('Confusion Matrix:\n', conf_matrix_rf)
print('Classification Report:\n', class_report_rf)

Cross-validated accuracy scores: [0.74285714 0.8        0.66428571 0.79136691 0.75539568]
Mean accuracy: 0.750781089414183
rf Accuracy: 1.0
Accuracy score of the test data: 0.8142857142857143
Confusion Matrix:
 [[38  8]
 [ 5 19]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.85        46
           1       0.70      0.79      0.75        24

    accuracy                           0.81        70
   macro avg       0.79      0.81      0.80        70
weighted avg       0.82      0.81      0.82        70



In [727]:
import xgboost as xgb

# Gradient-boosted trees via XGBoost's scikit-learn wrapper, default settings.
xgb_model = xgb.XGBClassifier()

# 5-fold cross-validation restricted to the training split.
cv_scores = cross_val_score(estimator=xgb_model, X=X_train, y=y_train, cv=5)
print("Cross-validated accuracy scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())

# Fit on the whole training split, then gather every metric before reporting.
xgb_model.fit(X_train, y_train)
accuracy_xgb = xgb_model.score(X_train, y_train)  # accuracy on the training split itself
test_data_accuracy_xgb = xgb_model.score(X_test, y_test)
y_pred_xgb = xgb_model.predict(X_test)
conf_matrix_xgb = confusion_matrix(y_true=y_test, y_pred=y_pred_xgb)
class_report_xgb = classification_report(y_true=y_test, y_pred=y_pred_xgb)

# Report: train accuracy, held-out accuracy, confusion matrix, per-class report.
print(f'XGBoost Accuracy: {accuracy_xgb}')
print('Accuracy score of the test data:', test_data_accuracy_xgb)
print('Confusion Matrix:\n', conf_matrix_xgb)
print('Classification Report:\n', class_report_xgb)

Cross-validated accuracy scores: [0.70714286 0.76428571 0.67142857 0.77697842 0.75539568]
Mean accuracy: 0.7350462487153134
XGBoost Accuracy: 1.0
Accuracy score of the test data: 0.8142857142857143
Confusion Matrix:
 [[38  8]
 [ 5 19]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.85        46
           1       0.70      0.79      0.75        24

    accuracy                           0.81        70
   macro avg       0.79      0.81      0.80        70
weighted avg       0.82      0.81      0.82        70



In [728]:
from sklearn.svm import SVC

# Support-vector classifier with default hyper-parameters.
# NOTE(review): the features are unscaled (the standardization cells are
# commented out); kernel SVMs are generally sensitive to feature scale, so
# consider wrapping this in a Pipeline with StandardScaler.
svm_model = SVC()

# 5-fold cross-validation restricted to the training split.
cv_scores = cross_val_score(estimator=svm_model, X=X_train, y=y_train, cv=5)
print("Cross-validated accuracy scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())

# Fit on the whole training split, then gather every metric before reporting.
svm_model.fit(X_train, y_train)
accuracy_svm = svm_model.score(X_train, y_train)  # accuracy on the training split itself
test_data_accuracy_svm = svm_model.score(X_test, y_test)
y_pred_svm = svm_model.predict(X_test)
conf_matrix_svm = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)
class_report_svm = classification_report(y_true=y_test, y_pred=y_pred_svm)

# Report: train accuracy, held-out accuracy, confusion matrix, per-class report.
print(f'SVM Accuracy: {accuracy_svm}')
print('Accuracy score of the test data:', test_data_accuracy_svm)
print('Confusion Matrix:\n', conf_matrix_svm)
print('Classification Report:\n', class_report_svm)

Cross-validated accuracy scores: [0.75       0.77857143 0.7        0.79856115 0.75539568]
Mean accuracy: 0.7565056526207605
SVM Accuracy: 0.7693409742120344
Accuracy score of the test data: 0.7428571428571429
Confusion Matrix:
 [[39  7]
 [11 13]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.85      0.81        46
           1       0.65      0.54      0.59        24

    accuracy                           0.74        70
   macro avg       0.72      0.69      0.70        70
weighted avg       0.74      0.74      0.74        70



Making a Predictive System

In [729]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyper-parameters.
# NOTE(review): the features are unscaled (the standardization cells are
# commented out); distance-based models are generally sensitive to feature
# scale, so consider wrapping this in a Pipeline with StandardScaler.
knn_model = KNeighborsClassifier()

# 5-fold cross-validation restricted to the training split.
cv_scores = cross_val_score(estimator=knn_model, X=X_train, y=y_train, cv=5)
print("Cross-validated accuracy scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())

# Fit on the whole training split, then gather every metric before reporting.
knn_model.fit(X_train, y_train)
accuracy_knn = knn_model.score(X_train, y_train)  # accuracy on the training split itself
test_data_accuracy_knn = knn_model.score(X_test, y_test)
y_pred_knn = knn_model.predict(X_test)
conf_matrix_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
class_report_knn = classification_report(y_true=y_test, y_pred=y_pred_knn)

# Report: train accuracy, held-out accuracy, confusion matrix, per-class report.
print(f'KNN Accuracy: {accuracy_knn}')
print('Accuracy score of the test data:', test_data_accuracy_knn)
print('Confusion Matrix:\n', conf_matrix_knn)
print('Classification Report:\n', class_report_knn)

Cross-validated accuracy scores: [0.71428571 0.75       0.64285714 0.72661871 0.70503597]
Mean accuracy: 0.70775950668037
KNN Accuracy: 0.8051575931232091
Accuracy score of the test data: 0.6857142857142857
Confusion Matrix:
 [[36 10]
 [12 12]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.78      0.77        46
           1       0.55      0.50      0.52        24

    accuracy                           0.69        70
   macro avg       0.65      0.64      0.64        70
weighted avg       0.68      0.69      0.68        70



In [736]:
# NOTE(review): `ScalarType` is not used by any live code in this cell; it
# looks like an accidental auto-import (presumably the `scaler` object was
# meant).  Kept only because later cells are not visible from here -- safe to
# delete once confirmed unused.
from numpy import ScalarType


# One patient record, in the same column order as the training features:
# pregnancies, glucose, bloodpressure, skinthickness, insulin, bmi, dpf, age.
input_data = (0,198,66,32,274,41.3,0.502,28)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance (1 row, n features)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The forest was fitted on a DataFrame, so hand it a DataFrame carrying the
# same column names.  Passing the bare ndarray loses the feature names and
# makes scikit-learn emit a "X does not have valid feature names" warning
# (currently hidden by the global UserWarning filter).  A wrong number of
# input values fails here with a ValueError.
input_frame = pd.DataFrame(input_data_reshaped, columns=rf_model.feature_names_in_)

# No scaling is applied: the models in this notebook are trained on the raw,
# unstandardized features, so the input must stay raw as well.
prediction = rf_model.predict(input_frame)
print(prediction)

# Class 0 = non-diabetic, class 1 = diabetic.
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[1]
The person is diabetic


In [738]:
import joblib

# Persist the fitted random-forest model to disk.  joblib.dump returns the
# list of file names it wrote, which is what the cell displays.
# No scaler is saved because the standardization step is commented out;
# re-enable the line below if a fitted scaler is introduced.
# joblib.dump(scaler, 'diabetes_scaler.pkl' )
joblib.dump(value=rf_model, filename='diabetes.pkl')

['diabetes.pkl']