In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

In [2]:
data = pd.read_csv("diabetes75pc_100_times.csv")

In [3]:
data["Outcome"].value_counts()

0    50500
1    27068
Name: Outcome, dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77568 entries, 0 to 77567
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               77568 non-null  int64  
 1   Glucose                   77568 non-null  int64  
 2   BloodPressure             77568 non-null  int64  
 3   SkinThickness             77568 non-null  int64  
 4   Insulin                   77568 non-null  int64  
 5   BMI                       77568 non-null  float64
 6   DiabetesPedigreeFunction  77568 non-null  float64
 7   Age                       77568 non-null  int64  
 8   Outcome                   77568 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 5.3 MB


In [5]:
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [7]:
X_train.shape

(62054, 8)

In [8]:
columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]

In [9]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X_train)

In [10]:
X_train = pd.DataFrame(scaled_data, columns=columns)

In [11]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.338571,0.092918,0.147859,0.154129,0.278164,-0.617184,-0.696638,-0.276543
1,-0.253827,-1.082823,0.147859,0.717886,-0.691528,0.591661,-0.638609,-0.446725
2,0.338571,0.635567,0.665322,0.342048,1.775991,0.067504,-0.087333,2.106009
3,-0.846226,-3.765922,0.251352,-0.033790,-0.492395,-0.470031,-0.522551,-1.042363
4,-0.253827,-1.806355,0.665322,0.467328,-0.691528,0.369107,-0.696638,1.084916
...,...,...,...,...,...,...,...,...
62049,-1.142425,0.876745,1.079292,1.594842,-0.691528,1.231353,-0.290435,-1.042363
62050,-1.142425,-0.510026,0.044366,1.031085,0.589850,0.965017,0.376899,-0.957272
62051,0.634771,-0.962234,-0.369604,0.717886,0.399375,-0.136808,-1.160870,1.084916
62052,0.634771,2.173074,1.182785,-1.286584,-0.691528,0.319245,-0.609594,2.786739


In [12]:
scaled_test = scaler.transform(X_test)
X_test = pd.DataFrame(scaled_test, columns=columns)

In [13]:
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-1.142425,-0.479879,-0.473097,0.279409,-0.691528,-0.701098,-1.015797,-0.872181
1,-0.253827,1.419394,-0.266111,1.031085,1.256513,0.167228,-0.377478,-0.276543
2,-0.846226,-0.962234,0.044366,0.655247,-0.691528,-0.122214,-0.493536,-0.872181
3,-0.550026,0.243654,-0.576589,0.216769,1.689411,-0.423818,3.481451,-0.701999
4,-0.253827,-1.384294,0.044366,-1.286584,-0.691528,-0.073569,-0.580580,0.489278
...,...,...,...,...,...,...,...,...
15509,0.930970,0.575273,1.079292,0.216769,3.464293,-0.286394,-1.044812,0.829642
15510,1.523368,0.997333,0.872307,0.467328,0.650456,0.434779,2.204813,0.744551
15511,0.634771,-0.087965,1.389770,-1.286584,-0.691528,-0.409224,-0.957768,-0.276543
15512,1.227169,1.690719,1.079292,0.843166,1.905860,0.283977,-0.029304,2.106009


In [14]:
tree = DecisionTreeClassifier(random_state=10)

In [15]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=10)

In [16]:
pred = tree.predict(X_test)

In [17]:
tree.score(X_train, y_train)

1.0

In [18]:
accuracy_score(y_test, pred)

0.9998710841820291

In [19]:
confusion_matrix(y_test, pred)

array([[10143,     0],
       [    2,  5369]])

In [20]:
forest = RandomForestClassifier(random_state=10)

In [21]:
forest.fit(X_train, y_train)

RandomForestClassifier(random_state=10)

In [22]:
pred_f = forest.predict(X_test)

In [23]:
accuracy_score(y_test, pred_f)

1.0

In [24]:
confusion_matrix(y_test, pred_f)

array([[10143,     0],
       [    0,  5371]])

In [None]:
svm_clf = SVC(random_state=10)
svm_clf.fit(X_train, y_train)

In [None]:
pred_s = svm_clf.predict(X_test)

In [None]:
accuracy_score(y_test, pred_s)

In [None]:
confusion_matrix(y_test, pred_s)

In [None]:
# the numbers before smote
num_before = dict(Counter(y))

#perform smoting

# define pipeline
over = SMOTE(sampling_strategy=0.8)
under = RandomUnderSampler(sampling_strategy=0.8)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X_smote, y_smote = pipeline.fit_resample(X, y)


#the numbers after smote
num_after =dict(Counter(y_smote))

In [None]:
num_before

In [None]:
num_after

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [None]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X_train)

In [None]:
X_train = pd.DataFrame(scaled_data, columns=columns)

In [None]:
scaled_test = scaler.transform(X_test)
X_test = pd.DataFrame(scaled_test, columns=columns)

In [None]:
forest_s = RandomForestClassifier(random_state=5)

In [None]:
forest_s.fit(X_train, y_train)
pred_s = forest_s.predict(X_test)

In [None]:
accuracy_score(y_test, pred_s)

In [None]:
confusion_matrix(y_test, pred_s)

### Neural Network

In [25]:
import pandas
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [32]:
model = Sequential([
    Dense(8, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [33]:
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [34]:
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f97280e6400>

In [35]:
p = model.predict(X_test)

In [36]:
pred = np.round(p)

In [37]:
accuracy_score(y_test, pred)

0.9708650251385845

In [39]:
confusion_matrix(y_test, pred)

array([[9943,  200],
       [ 252, 5119]])

In [38]:
model.save("NN.h5")

In [40]:
model.save_weights("NN_Weight.h5")

### Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
log_reg = LogisticRegression(random_state=10)

In [None]:
log_reg.fit(X_train, y_train)
pred = log_reg.predict(X_test)

In [None]:
accuracy_score(y_test, pred)

### Ensemble All

In [41]:
from sklearn.ensemble import VotingClassifier

In [45]:
voting_clf = VotingClassifier(
estimators=[('lr', log_reg), ('rf', forest),('dt', tree)], voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=10)),
                             ('rf', RandomForestClassifier(random_state=10)),
                             ('dt', DecisionTreeClassifier(random_state=10))])

In [46]:
pred = voting_clf.predict(X_test)

In [47]:
accuracy_score(y_test, pred)

0.9998710841820291

In [48]:
confusion_matrix(y_test, pred)

array([[10143,     0],
       [    2,  5369]])