<a href="https://colab.research.google.com/github/Helazr/coursera-test/blob/main/Diabetes_patients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing the libraries**


In [3]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Convolution2D
from keras.layers.convolutional import MaxPooling2D
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from keras import backend as K
import tensorflow as tf
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
#plt.style.use('ggplot')
#ggplot is R based visualisation package that provides better graphics with higher level of abstraction

# **Import dataset**

In [4]:
import os

import pandas as pd

# Path of the input CSV. "/content" is presumably the Colab session directory,
# so the file has to be uploaded there before this cell runs -- TODO confirm.
DATA_PATH = "/content/diabetes.csv"

# Fail early with an actionable message instead of a bare FileNotFoundError
# raised from deep inside pandas.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}; upload diabetes.csv or update DATA_PATH."
    )

# Load the whole CSV into a DataFrame and display it.
data = pd.read_csv(DATA_PATH)
data

FileNotFoundError: ignored

# **Perform initial analysis**








In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Preview the first rows to check column names and typical values.
data.head()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Find the duplicates: count rows that exactly repeat an earlier row.

data.duplicated().sum()

In [None]:
# Find null values: number of missing (None/NaN) entries in each column.
data.isnull().sum()

**number of people with diabetes**

In [None]:
## null count analysis
# Bar chart from missingno summarising the completeness of each column of `data`.
import missingno as msno
p=msno.bar(data)

In [None]:
# Outcome distribution
# Check how balanced the classes are by counting the rows per Outcome value.
color_wheel = {1: "#0392cf", 2: "#7bc043"}
# One colour per row, looked up under the key Outcome + 1.
colors = data["Outcome"].map(lambda outcome: color_wheel.get(outcome + 1))

# Count once, then print the numbers and draw them as a bar chart.
outcome_counts = data["Outcome"].value_counts()
print(outcome_counts)
p = outcome_counts.plot(kind="bar")

In [None]:
# Correlation
# Kendall rank correlation between every pair of columns. Computed once: the
# original computed the matrix twice and threw the first result away.
kendall_corr = data.corr(method='kendall')

# Rank the columns by the absolute strength of their correlation with Outcome.
print(kendall_corr["Outcome"].abs().sort_values(ascending=False))

In [None]:
# Data scaling
from sklearn.preprocessing import StandardScaler

# Feature subset: every column except the label and the two excluded features.
x = data.drop(columns=['Outcome', 'DiabetesPedigreeFunction', 'BloodPressure'])
y = data['Outcome']

# Fit the scaler on the subset and standardize it in a single step.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(x)

# NOTE: X and Y are reassigned further down in the "Split data X and Y"
# cells, so these standardized values are not what the later models receive.
X = standardized_data
Y = data['Outcome']

**Scatter matrix of uncleaned data**

**Heatmap for unclean data**

In [None]:
# Correlation heatmap on a 12 x 10 inch canvas; annot=True writes each
# coefficient into its cell.
fig, ax = plt.subplots(figsize=(12, 10))
p = sns.heatmap(data.corr(), annot=True, cmap='RdYlGn', ax=ax)

**Heatmap for clean data**

In [None]:
# NOTE(review): `data` has not been modified since the previous heatmap, so
# this figure shows the same correlations -- confirm whether a cleaned frame
# was meant to be plotted here.
corr_matrix = data.corr()
plt.figure(figsize=(12, 10))  # 12 x 10 inch canvas
p = sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn')

In [None]:
# Keep only the rows whose Insulin value is present (not None/NaN).
has_insulin = data["Insulin"].notna()
df1 = data[has_insulin]
print(df1)

In [None]:
# Show a copy of the frame without any column that contains a missing value.
# The result is only displayed; `data` itself is not reassigned.
data.dropna(axis=1)

# **Data preprocessing**

## **Split data X and Y**

In [None]:
# Feature matrix: every column except the Outcome label.
X = data.drop(columns=['Outcome'])
X

In [None]:
# Label frame: a one-column DataFrame holding only Outcome.
# The original `data.drop(X, axis=1)` passed a whole DataFrame as the labels
# to drop, which works only because iterating a frame yields its column
# names. Selecting the column explicitly gives the same result.
Y = data[['Outcome']]
Y

## **Split data as train and test**


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=200,
)

In [None]:
# Inspect the training labels.
Y_train

In [None]:
# Inspect the test labels.
Y_test

In [None]:
# Inspect the training features.
X_train

In [None]:
# Inspect the test features.
X_test

**Model Building**

# **kNN**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (everything except Outcome) for kNN.
sc_X = StandardScaler()
features = data.drop(columns=["Outcome"])

# Take the column labels from the frame itself instead of a hard-coded list,
# so the scaled columns can never be mislabelled if the CSV layout differs.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the scaling -- confirm this is
# acceptable.
X2 = pd.DataFrame(sc_X.fit_transform(features), columns=features.columns)

In [None]:
# Preview the standardized feature frame.
X2.head()

In [None]:
# Label Series (the Outcome column) used for the kNN and SVM experiments.
Y2 = data.Outcome

In [None]:
# importing train_test_split
from sklearn.model_selection import train_test_split

# Stratified split of the standardized features: one third of the rows is
# held out, and stratify=Y2 keeps the class proportions equal in both parts.
# NOTE: this rebinds Y_train / Y_test, replacing the labels produced by the
# earlier split of X / Y.
X2_train, X2_test, Y_train, Y_test = train_test_split(
    X2,
    Y2,
    test_size=1/3,
    random_state=42,
    stratify=Y2,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Accuracy on the train and test parts for each neighbourhood size k = 1..14.
test_scores = []
train_scores = []

for k in range(1, 15):
    # After the loop `knn` refers to the last model fitted (k = 14).
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X2_train, Y_train)

    train_scores.append(knn.score(X2_train, Y_train))
    test_scores.append(knn.score(X2_test, Y_test))

In [None]:
## Best accuracy measured on the same rows the model was fitted on
max_train_score = max(train_scores)
# Zero-based positions of every score tied for the maximum.
train_scores_ind = [pos for pos, score in enumerate(train_scores) if score == max_train_score]
# Position p corresponds to k = p + 1 because the sweep started at k = 1.
best_train_k = [pos + 1 for pos in train_scores_ind]
print('Max train score {} % and k = {}'.format(max_train_score*100, best_train_k))

In [None]:
## Best accuracy measured on the rows that were held out for testing only
max_test_score = max(test_scores)
# Zero-based positions of every score tied for the maximum.
test_scores_ind = [pos for pos, score in enumerate(test_scores) if score == max_test_score]
# Position p corresponds to k = p + 1 because the sweep started at k = 1.
best_test_k = [pos + 1 for pos in test_scores_ind]
print('Max test score {} % and k = {}'.format(max_test_score*100, best_test_k))

**result visualisation** 

```
# This text is formatted as code
```



In [None]:
# Train vs. test accuracy for every k tried in the sweep (k = 1..14).
plt.figure(figsize=(12,5))
k_values = list(range(1,15))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the original positional call raises a TypeError there.
p = sns.lineplot(x=k_values, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=k_values, y=test_scores, marker='o', label='Test Score')
# Axis labels so the figure can be read on its own.
p.set(xlabel='k (number of neighbours)', ylabel='Accuracy');

**confusion matrix**

In [None]:
#import confusion_matrix
from sklearn.metrics import confusion_matrix

# `knn` is whatever model the k sweep left behind, i.e. the last one fitted
# (k = 14 when the cells are run in order), not necessarily the best k.
y_pred = knn.predict(X2_test)

# Print the raw matrix. As a bare expression in the middle of the cell its
# value was silently discarded, because only the last expression is displayed.
print(confusion_matrix(Y_test, y_pred))

# Labelled cross-tabulation of true vs. predicted classes, with row/column totals.
pd.crosstab(Y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
from sklearn import metrics

# Predictions of the current `knn` model on the held-out rows.
y_pred = knn.predict(X2_test)
cnf_matrix = metrics.confusion_matrix(Y_test, y_pred)

# Draw the matrix as an annotated heatmap and label it through the Axes object.
p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
p.set_title('Confusion matrix', y=1.1)
p.set_ylabel('Actual label')
p.set_xlabel('Predicted label')

**Classification Report**

**Precision Score**

In [None]:
#import classification_report
from sklearn.metrics import classification_report

# Text report of per-class metrics for the kNN predictions in `y_pred`.
report_text = classification_report(y_true=Y_test, y_pred=y_pred)
print(report_text)

# **SVM**

In [None]:
from sklearn import svm

# Support-vector classifier with scikit-learn's default settings, fitted on
# the standardized training features.
clf = svm.SVC()
clf.fit(X=X2_train, y=Y_train)

In [None]:
# Predicted labels for the training and the test part, in that order.
Y_train_pred, Y_test_pred = (clf.predict(part) for part in (X2_train, X2_test))
Y_train_pred

In [None]:
# Inspect the SVM predictions for the test part.
Y_test_pred

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows the SVM classified correctly.
accuracy_score(y_true=Y_test, y_pred=Y_test_pred)

# **Building the DFF model**

In [None]:
# Feed-forward binary classifier: 8 inputs -> 12 -> 8 -> 1.
model = Sequential()

# First hidden layer; input_shape=(8,) fixes the number of input features.
model.add(Dense(12, activation='relu', input_shape=(8,)))
# Drop 10% of the activations during training. The original built the Dropout
# layers as bare expressions and never added them to the model.
model.add(Dropout(0.1))

# Second hidden layer, again followed by 10% dropout.
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.1))

# Output layer. Sigmoid yields a probability in (0, 1) for the positive class.
# Softmax over a single unit normalises one value against itself and is always
# 1.0, so the network could never predict the negative class.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

## **compile the model**

In [None]:
# Adam optimizer with binary cross-entropy loss; accuracy is tracked per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## **Train the model**

In [None]:
# Y_train / Y_test were rebound by the kNN split of X2 / Y2 (different test
# size and seed), so they no longer line up row for row with X_train / X_test
# from the first split. Re-create that first split with the same parameters
# before handing the data to Keras.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=200)

# Convert all four parts to NumPy arrays.
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)
X_test = np.asarray(X_test)
Y_test = np.asarray(Y_test)

In [None]:
# Train for 50 epochs. validation_split holds out a fraction of the training
# rows so that `history.history` also contains the val_* metrics
# (e.g. 'val_accuracy') that the later accuracy plot reads. Without validation
# data that key does not exist.
history = model.fit(
    X_train,
    Y_train,
    epochs=50,
    validation_split=0.2,
)

## **Summarize history for accuracy**

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: training loss (left) and training accuracy (right).
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(20, 5))

loss_ax.set_title("Train and Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.plot(history.history['loss'], label="Train Loss")
loss_ax.legend()

acc_ax.set_title("Train and Accuracy")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.plot(history.history['accuracy'], label="Train Accuracy")
acc_ax.legend()

fig.tight_layout()

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'], label='train')
# 'val_accuracy' is recorded only when fit() was given validation data. Guard
# the lookup so the cell does not die with a KeyError otherwise.
if 'val_accuracy' in history.history:
    # This curve is validation accuracy, so it is labelled as such, not "test".
    plt.plot(history.history['val_accuracy'], label='validation')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()

## **Evaluate the model**

In [None]:
# Loss and accuracy of the trained network on the held-out rows.
# NOTE(review): X_test and Y_test must come from the same split; Y_test is
# also assigned by the kNN split cell, so verify which assignment is live.
model.evaluate(x=X_test, y=Y_test)