Importing the Dependencies

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection and Analysis

In [3]:
#loading the dataset into a pandas dataframe
# The default location is an absolute path on the original author's machine.
# Set the DIABETES_DATASET_PATH environment variable to load the CSV from
# another location without editing this cell.
import os

DIABETES_DATASET_PATH = os.environ.get(
    'DIABETES_DATASET_PATH',
    r'C:\Users\kenneth\OneDrive\Desktop\Disease Prediction\Disease-Prediction\datasets\diabetes_prediction_dataset.csv',
)
diabetes_dataset = pd.read_csv(DIABETES_DATASET_PATH)

In [4]:
# preview the first 5 rows to check that the CSV was parsed as expected
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [5]:
# preview the last 5 rows of the dataset
diabetes_dataset.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0
99999,Female,57.0,0,0,current,22.43,6.6,90,0


In [6]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
diabetes_dataset.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [7]:
# shape of the dataset as (number of rows, number of columns)
diabetes_dataset.shape

(100000, 9)

In [8]:
# class balance of the target: how many records are non-diabetic (0) and how many are diabetic (1)
diabetes_dataset['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [9]:
# column dtypes; the `object` columns hold text categories and must be encoded before modelling
diabetes_dataset.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

In [10]:
# converting the categorical values to numeric values
from sklearn.preprocessing import LabelEncoder
# public pandas API for dtype checks (pandas.core.* is a private module path)
from pandas.api.types import is_numeric_dtype

# One fitted encoder per categorical column, keyed by column name, so the
# category -> integer mapping of every column stays available afterwards
# (e.g. label_encoders['gender'].classes_, or .transform() on new inputs).
label_encoders = {}
label = LabelEncoder()  # stays defined even if no column needs encoding
for column in diabetes_dataset.columns:
    if is_numeric_dtype(diabetes_dataset[column]):
        continue  # already numeric, nothing to encode
    # a fresh encoder per column: refitting a shared one would discard the
    # mapping learned for the previous column
    label = LabelEncoder()
    diabetes_dataset[column] = label.fit_transform(diabetes_dataset[column])
    label_encoders[column] = label

In [11]:
# separate the target vector (Y) from the feature matrix (X)
Y = diabetes_dataset['diabetes']
X = diabetes_dataset.drop(columns=['diabetes'])

In [12]:
# inspect the feature matrix: every column except the 'diabetes' target
print(X)

       gender   age  hypertension  heart_disease  smoking_history    bmi  \
0           0  80.0             0              1                4  25.19   
1           0  54.0             0              0                0  27.32   
2           1  28.0             0              0                4  27.32   
3           0  36.0             0              0                1  23.45   
4           1  76.0             1              1                1  20.14   
...       ...   ...           ...            ...              ...    ...   
99995       0  80.0             0              0                0  27.32   
99996       0   2.0             0              0                0  17.37   
99997       1  66.0             0              0                3  27.83   
99998       0  24.0             0              0                4  35.42   
99999       0  57.0             0              0                1  22.43   

       HbA1c_level  blood_glucose_level  
0              6.6                  140  
1  

In [13]:
# inspect the target vector (the 'diabetes' column)
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64


In [14]:
# number of missing values in each column
diabetes_dataset.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Data Standardisation


In [15]:
# scaler used to standardize the features; it is reused later to scale new inputs before prediction
scaler = StandardScaler()

In [16]:
# fit the scaler on the feature matrix, then rescale the features with it
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so statistics of the test rows influence the scaling;
# consider fitting on the training split only.
scaler.fit(X)
standardized_data = scaler.transform(X)

In [17]:
# from here on X refers to the standardized NumPy array; the DataFrame (and its column names) is no longer bound to X
X = standardized_data

In [18]:
# inspect the standardized feature array
print(X)

[[-8.41046744e-01  1.69270354e+00 -2.84439447e-01 ... -3.21055792e-01
   1.00170572e+00  4.77042159e-02]
 [-8.41046744e-01  5.38006427e-01 -2.84439447e-01 ... -1.15583678e-04
   1.00170572e+00 -1.42620999e+00]
 [ 1.18723364e+00 -6.16690686e-01 -2.84439447e-01 ... -1.15583678e-04
   1.61108022e-01  4.89878478e-01]
 ...
 [ 1.18723364e+00  1.07094356e+00 -2.84439447e-01 ...  7.67292549e-02
   1.61108022e-01  4.16182767e-01]
 [-8.41046744e-01 -7.94336396e-01 -2.84439447e-01 ...  1.22036126e+00
  -1.42668764e+00 -9.34905254e-01]
 [-8.41046744e-01  6.71240710e-01 -2.84439447e-01 ... -7.36921977e-01
   1.00170572e+00 -1.18055762e+00]]


In [19]:
# the target vector is left unscaled; only the features were standardized
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64


Splitting the data into training and testing data

In [20]:
# hold out 20% of the rows for testing; stratifying on Y keeps the class ratio
# the same in both splits, and the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [21]:
# sanity check: shapes of the training and testing features and targets
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(80000, 8) (20000, 8) (80000,) (20000,)


Training the model

In [22]:
# support vector classifier with a linear kernel, fitted on the training split
model = svm.SVC(kernel = 'linear')
model.fit(X_train,Y_train)

Model Evaluation

In [23]:
# accuracy on the training split
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the conventional order avoids mistakes if this call is
# later swapped for an asymmetric metric such as precision or recall
training_accuracy = accuracy_score(Y_train, X_train_prediction)
print("The accuracy score of the model is: ", training_accuracy)

The accuracy score of the model is:  0.960475


In [24]:
# accuracy on the held-out testing split
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred)
testing_accuracy = accuracy_score(Y_test, X_test_prediction)
# original (misleading) name kept as an alias in case other cells still read it
testing_prediction = testing_accuracy
# NOTE(review): the classes are imbalanced (91,500 non-diabetic vs 8,500
# diabetic records), so always predicting 0 would already score about 0.915;
# consider also reporting precision/recall or a confusion matrix.
print("The accuracy score of the model using testing data is: ", testing_accuracy)

The accuracy score of the model using testing data is:  0.96195


In [25]:
#testing the model with some data from the dataset
input_data = (0,59.0,0,0,4,33.86,5.7,85)

#changing the input_data to numpyarray
input_data_as_numpy_array = np.asarray(input_data)

#reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

#predicting the desired output
prediction = model.predict(std_data)

print(prediction)

[[-0.84104674  0.76006356 -0.28443945 -0.20257766  0.96332671  0.98530646
   0.16110802 -1.30338381]]
[0]




In [26]:
# prediction holds one label per input row; compare the single label with the
# scalar 0 (not a list) so the condition is a plain boolean
if prediction[0] == 0:
    print("Not diabetic")
else:
    print("Diabetic")

Not diabetic
