<a href="https://colab.research.google.com/github/H3nr7M/Save_ML_model/blob/main/SaveMLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Importing the Dependencies

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

##Data Collection and Analysis


In [12]:
# loading the diabetes dataset to a pandas DataFrame
!git clone https://github.com/H3nr7M/SaveMLmodel.git

fatal: destination path 'SaveMLmodel' already exists and is not an empty directory.


In [13]:
# read the CSV from the cloned repository into a pandas DataFrame
dataset_csv_path = '/content/SaveMLmodel/diabetes.csv'
diabetes_dataset = pd.read_csv(dataset_csv_path)

##Preprocessing

###Manage missing values

In [14]:
# number of missing entries in each column
diabetes_dataset.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

###Label encoding

In [15]:
# preview the first five rows
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


###Handle imbalanced datasets

In [16]:
# how many rows belong to each class of the Outcome label
diabetes_dataset.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [17]:
# split the rows by label: Outcome 0 (non-diabetic) and Outcome 1 (diabetic)
fine = diabetes_dataset.loc[diabetes_dataset['Outcome'] == 0]
wrong = diabetes_dataset.loc[diabetes_dataset['Outcome'] == 1]

In [18]:
# Undersample the Outcome == 0 rows down to the number of Outcome == 1 rows.
# The sample size is read from `wrong` instead of being hard-coded, and
# random_state is fixed so a fresh Restart & Run All draws the same rows.
fine_sample = fine.sample(n=len(wrong), random_state=2)

In [19]:
# stack the sampled Outcome == 0 rows on top of the Outcome == 1 rows
new_dataset = pd.concat(objs=[fine_sample, wrong])
# show the class counts of the combined frame
new_dataset.Outcome.value_counts()

0    268
1    268
Name: Outcome, dtype: int64

In [20]:
# from here on `diabetes_dataset` refers to the rebalanced frame built above
diabetes_dataset = new_dataset

###Split our dataset

0 --> Non-Diabetic

1 --> Diabetic

In [21]:
# features: every column except the label; target: the Outcome column
X = diabetes_dataset.drop(columns=['Outcome'])
Y = diabetes_dataset.Outcome

Train Test Split

In [22]:
# keep 20% of the rows for testing; stratify on Y so both splits follow its class ratio
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

###Standardization of the data

In [23]:
# standard deviation of every column before scaling
column_spread = diabetes_dataset.std()
print(column_spread)

Pregnancies                   3.443325
Glucose                      32.664122
BloodPressure                19.831531
SkinThickness                16.238633
Insulin                     122.480638
BMI                           8.042247
DiabetesPedigreeFunction      0.346011
Age                          11.612377
Outcome                       0.500467
dtype: float64


In [24]:
# scaler used below to standardize the feature matrices
scaler = StandardScaler()

In [25]:
# fit the scaler on the training split only
scaler.fit(X=X_train)

StandardScaler()

In [26]:
# run both splits through the scaler fitted on the training data
X_train_standardized, X_test_standardized = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [27]:
# standard deviation of each scaled split, training first, then test
for scaled_split in (X_train_standardized, X_test_standardized):
    print(scaled_split.std())

1.0
1.0139575334832214


In [28]:
# from here on X_train / X_test refer to the standardized versions
X_train, X_test = X_train_standardized, X_test_standardized

##Training the Model

In [29]:
# support vector classifier with a linear kernel
classifier = svm.SVC(kernel='linear')

In [30]:
# fit the support vector classifier on the training features and labels
classifier.fit(X=X_train, y=Y_train)

SVC(kernel='linear')

Model Evaluation

Accuracy Score

In [31]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [32]:
# report the accuracy measured on the training split
training_label = 'Accuracy score of the training data : '
print(training_label, training_data_accuracy)

Accuracy score of the training data :  0.7733644859813084


In [33]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [34]:
# report the accuracy measured on the held-out test split
test_label = 'Accuracy score of the test data : '
print(test_label, test_data_accuracy)

Accuracy score of the test data :  0.7129629629629629


Making a Predictive System

In [35]:
# one raw (unscaled) sample: eight feature values in the same order as the columns of X
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The classifier was fitted on features transformed by `scaler`, so the raw
# values must go through the same transform before calling predict.
input_data_standardized = scaler.transform(input_data_reshaped)

prediction = classifier.predict(input_data_standardized)
print(prediction)

# 0 --> Non-Diabetic, 1 --> Diabetic
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[1]
The person is diabetic


###Saving the trained model

In [36]:
import pickle

In [37]:
filename = 'trained_model.sav'

# Serialize the fitted classifier; the `with` block closes the file handle
# (the previous bare open() call never closed it).
# NOTE(review): only the classifier is stored here. It was trained on features
# transformed by `scaler`, so any consumer of this file outside the notebook
# also needs that fitted scaler to prepare its inputs.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [38]:
# loading the saved model; the `with` block closes the file handle after reading
# NOTE: pickle.load executes code embedded in the file, so only load files that
# this notebook (or another trusted source) produced.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [39]:
# one raw (unscaled) sample: eight feature values in the same order as the columns of X
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The pickled classifier was fitted on features transformed by `scaler`, so the
# raw values must go through the same transform before calling predict.
input_data_standardized = scaler.transform(input_data_reshaped)

prediction = loaded_model.predict(input_data_standardized)
print(prediction)

# 0 --> Non-Diabetic, 1 --> Diabetic
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[1]
The person is diabetic
