### Import the dependecies 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### Data collection and pre-processing 

In [2]:
data= pd.read_csv("data/diabetes.csv")

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
data.shape

print("There are", 768 ,"rows")
print("There are", 9 ,"columns")

There are 768 rows
There are 9 columns


### Get the statistical data

In [6]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
data["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

0 -->Diabetic 

1 -->Non diabetic 

### separating the feature and targets 


In [9]:
X= data.drop(columns="Outcome")
Y= data["Outcome"]

### Data Standardization

In [10]:
scaler= StandardScaler()

In [11]:
scaler.fit(X)

In [12]:
standardized_data= scaler.transform(X)

In [13]:
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [14]:
X= standardized_data

In [15]:
Xtrain, Xtest, Ytrain, Ytest= train_test_split(X,Y, test_size=0.2 , stratify=Y,random_state=2)

**Stratify ensures the training and test sets have the same proportion of the feature of interest as in the original dataset. Doing this with the target variable ensures that the cross-validation result is a close approximation of generalization error.**

In [16]:
print(X.shape)
print(Xtrain.shape)
print(Xtest.shape)

(768, 8)
(614, 8)
(154, 8)


### Training the Model using the SVM 

In [17]:
from sklearn import svm

In [18]:
classifier= svm.SVC(kernel="linear")

In [19]:
classifier.fit(Xtrain,Ytrain)

### Model evaluation 

**Accuracy score**

In [20]:
Xtrain_predictions = classifier.predict(Xtrain)

In [21]:
## training data accuracy on the train data
accuracy_measure= accuracy_score(Xtrain_predictions,Ytrain)

In [22]:
print("The accuracy score of the training data", accuracy_measure)
print("it is beyond 75%, meaning out of 100 predictions its getting 79% correct")

The accuracy score of the training data 0.7866449511400652
it is beyond 75%, meaning out of 100 predictions its getting 79% correct


In [23]:
## accuracy test on the test data
Xtest_predictions= classifier.predict(Xtest)
accuracy_test= accuracy_score(Xtest_predictions,Ytest)

In [24]:
print("The accuracy score of the test data", accuracy_test)
print("it is beyond 75%, meaning out of 100 predictions its getting 77% correct")

The accuracy score of the test data 0.7727272727272727
it is beyond 75%, meaning out of 100 predictions its getting 77% correct


In [25]:
print(classification_report(Xtest_predictions, Ytest))

              precision    recall  f1-score   support

           0       0.91      0.78      0.84       117
           1       0.52      0.76      0.62        37

    accuracy                           0.77       154
   macro avg       0.71      0.77      0.73       154
weighted avg       0.82      0.77      0.79       154



### Logisitc Regression

In [26]:
from sklearn.linear_model import LogisticRegression
LogReg= LogisticRegression()

In [27]:
LogReg.fit(Xtrain, Ytrain)

In [28]:
Log_Xtrain_pred= LogReg.predict(Xtrain)

In [29]:
accuracy_measure1= accuracy_score(Log_Xtrain_pred,Ytrain)
print("The accuracy score of the training data", accuracy_measure1)
print("it is beyond 75%, meaning out of 100 predictions its getting 79% correct")

The accuracy score of the training data 0.7850162866449512
it is beyond 75%, meaning out of 100 predictions its getting 79% correct


**We still observe that the accuracy between SVM model and Logistic model is the same. We have to find a way to deal with the class imbalance in the data**

### Dealing with class imbalance 

In [30]:
# !pip install imblearn



In [31]:
from imblearn.over_sampling import SMOTE

In [38]:
# Use SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(Xtrain, Ytrain)


In [40]:
classifier.fit(X_train_resampled, y_train_resampled)

In [43]:
# Make predictions on the test data
y_pred = classifier.predict(Xtest)

In [44]:
# Print the classification report
print(classification_report(Ytest, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       100
           1       0.64      0.65      0.64        54

    accuracy                           0.75       154
   macro avg       0.72      0.72      0.72       154
weighted avg       0.75      0.75      0.75       154



## Making a predictive system

In [32]:
# we want to see if our system will predict correctly 
input_data= (1,89,66,23,94,28.1,0.167,21)

## we need to convert the input_data to a numpy array 

input_data_as_numpy_array= np.asarray(input_data)

## reshaping the instance for one cause we are only doing it for one instance not a whole dataset as we are predicting
input_data_reshape= input_data_as_numpy_array.reshape(1,-1)

## we still need to standardize the input data
std_data= scaler.transform(input_data_reshape)

print(std_data)

[[-0.84488505 -0.99820778 -0.16054575  0.15453319  0.12330164 -0.49404308
  -0.92076261 -1.04154944]]




**Lets make a prediction**

In [33]:
prediction= classifier.predict(std_data)
print(prediction)

if prediction[0]== 0:
    print("The person is Non-diabetic")
else:
    print("The person is Diabetic")

[0]
The person is Non-diabetic


**WE CAN SEE OUR MODEL HAS PREDICTED CORRECTLY**

In [34]:
## Trying another input 
# we want to see if our system will predict correctly 
input_data= (2,197,70,45,543,30.5,0.158,53)

## we need to convert the input_data to a numpy array 

input_data_as_numpy_array= np.asarray(input_data)

## reshaping the instance for one cause we are only doing it for one instance not a whole dataset as we are predicting
input_data_reshape= input_data_as_numpy_array.reshape(1,-1)

## we still need to standardize the input data
std_data= scaler.transform(input_data_reshape)

print(std_data)

prediction= classifier.predict(std_data)
print(prediction)

if prediction[0]== 0:
    print("The person is Non-diabetic")
else:
    print("The person is Diabetic")

[[-0.54791859  2.38188392  0.04624525  1.53455054  4.02192191 -0.18943689
  -0.94794368  1.68125866]]
[1]
The person is Diabetic




# We are going to create a web 

In [35]:
import pickle

In [36]:
filename= "trained_model.sav"
pickle.dump(classifier, open(filename, "wb"))

## wb- write binary 

In [37]:
## loading the saved model 

loaded_model= pickle.load(open("trained_model.sav", "rb"))