In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Data preparation, model building and accuracy checking libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the diabetes CSV from the Kaggle input directory and preview the first rows.
DIABETES_CSV = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
df_diabetes = pd.read_csv(DIABETES_CSV)
df_diabetes.head()

In [None]:
# Inspect the dimensions of the raw dataset as a (rows, columns) tuple.
df_diabetes.shape

In [None]:
# Counting zero values in the columns where a zero is not expected:
def check_zeroes(dataframe):
    """Print the number of zero entries in each column of ``dataframe``.

    The 'Pregnancies' and 'Outcome' columns are skipped (presumably because a
    zero is a legitimate value there). For every other column one line is
    printed in the form ``<column> : <count>``; a column without zeros
    prints a count of 0.

    Zeros are counted with an element-wise comparison rather than
    ``value_counts()[0]``. The latter depends on how pandas resolves the
    integer key ``0`` against the value-count index, which for non-numeric
    columns can be a positional lookup and report the count of the most
    frequent value instead of the number of zeros.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for column in dataframe:
        if column in ('Pregnancies', 'Outcome'):
            continue
        # Boolean mask summed -> number of entries equal to zero (NaN never matches).
        print(column, ":", (dataframe[column] == 0).sum())
            
# Report the zero counts for the raw dataset as loaded, before any cleaning.
check_zeroes(df_diabetes)

In [None]:
# Checking the percentage of zero values per column:
def check_zeroes_pct(dataframe):
    """Print the percentage of zero entries in each column of ``dataframe``.

    The 'Pregnancies' and 'Outcome' columns are skipped. For every other
    column one line is printed in the form ``<column> : <pct>%``, where the
    percentage is the number of zeros divided by the number of non-null
    entries, rounded to two decimals. A column without any zeros prints
    ``0%``.

    Zeros are counted with an element-wise comparison rather than
    ``value_counts()[0]``, whose integer key can be resolved positionally
    for non-numeric columns and then yields the share of the most frequent
    value instead of the share of zeros.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for column in dataframe:
        if column in ('Pregnancies', 'Outcome'):
            continue
        series = dataframe[column]
        zero_count = (series == 0).sum()
        if zero_count == 0:
            # Also covers empty / all-null columns, so the division below is never by zero.
            print(column, ": {}%".format(0))
            continue
        # count() ignores nulls, matching the denominator value_counts().sum() would give.
        non_null = series.count()
        print(column, ": {}%".format(round((zero_count / non_null) * 100, 2)))
            
# Report the share of zero values per column for the raw dataset as loaded.
check_zeroes_pct(df_diabetes)

**We notice a high percentage of zero values, which here stand in for missing measurements, in features that are important for diabetes prediction, such as insulin and skin thickness.
Since these features are usually important predictors of the presence of diabetes, we cannot drop the columns despite their high percentage of missing values.
Instead, we will remove the rows that contain the missing data.
This will significantly reduce the size of the dataset, yet it will likely improve the accuracy of the model.**

In [None]:
# Keep only the rows with a non-zero Insulin value.
# The filtered selection is copied so `diabetes` is an independent frame and
# `df_diabetes` keeps the raw data untouched.
has_insulin = df_diabetes['Insulin'] > 0
diabetes = df_diabetes.loc[has_insulin].copy()
diabetes.shape

In [None]:
# Re-run the zero-value report on the Insulin-filtered frame to see which columns still contain zeros:
check_zeroes(diabetes)

In [None]:
# Replace the zero placeholders in the Glucose and BMI columns with the column median.
# The median is taken over the non-zero readings only, so the placeholder zeros that are
# about to be replaced do not pull the imputed value down.
for column in ('Glucose', 'BMI'):
    valid_median = diabetes.loc[diabetes[column] != 0, column].median()
    diabetes[column] = diabetes[column].replace(0, valid_median)

In [None]:
# Re-run the zero-value report to confirm Glucose and BMI no longer contain zeros after the median replacement:
check_zeroes(diabetes)

In [None]:
# Histogram with a KDE overlay for every column, one column per panel of a 3x3 grid.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))
for panel, column in zip(axes.flatten(), diabetes.columns):
    sns.histplot(x=diabetes[column], kde=True, ax=panel)

fig.suptitle('Distributions of all features', fontsize=18)
fig.tight_layout()
plt.show()

In [None]:
# Box plots of every predictor feature grouped by the target variable (Outcome).
# The features are selected by name rather than by relying on 'Outcome' being the last
# column: with zip() truncating to the 8 available panels, a differently ordered frame
# would otherwise leave one panel empty and silently drop a real feature.
feature_columns = [column for column in diabetes.columns if column != 'Outcome']

fig, ax = plt.subplots(nrows = 4, ncols = 2, figsize = (15,20))
for column, subplot in zip(feature_columns, ax.flatten()):
    sns.boxplot(x = diabetes.Outcome, y = diabetes[column], ax = subplot)

# Title added so the figure stands alone, matching the histogram figure.
fig.suptitle('Predictor features by Outcome', fontsize = 18)
fig.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
fig, ax = plt.subplots(figsize=(12, 6))
corr = diabetes.corr()
sns.heatmap(
    corr,
    ax=ax,
    annot=True,
    cmap='Blues',
    linewidths=0.5,
)
plt.show()

In [None]:
# Separate the predictors (x) from the target column (y).
x = diabetes.drop(columns=['Outcome'])
y = diabetes['Outcome']

In [None]:
# Standardise the independent features.
# fit_transform() fits the scaler and transforms in one step, so the separate
# fit() call that used to precede it was redundant and has been removed.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the scaling; consider fitting on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(x)

In [None]:
# The dataset is small, so the accuracy depends noticeably on which rows land in the
# test set. Measure the test accuracy for a range of train/test split random states.

def logit_accuracy_for_seed(seed):
    """Split X/y 80/20 with ``seed``, fit a default LogisticRegression and return its test accuracy."""
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    model = LogisticRegression()
    model.fit(features_train, target_train)
    return accuracy_score(target_test, model.predict(features_test))


seeds = range(0,101)
accuracy_scores = [logit_accuracy_for_seed(seed) for seed in seeds]

In [None]:
# Line plot of the test accuracy obtained for each split random state.
fig, ax = plt.subplots()
ax.plot(seeds, accuracy_scores)
ax.set_title("Accuracy of the model for different random states")
ax.set_xlabel('Random State')
ax.set_ylabel('Accuracy Score')
plt.show()

In [None]:
# Find the highest accuracy and the random state that produced it.
# accuracy_scores.index() returns a list position, so it is mapped back through `seeds`
# to get the actual random_state; the two only coincide while `seeds` starts at 0 with step 1.
# NOTE(review): selecting the split by its test accuracy makes the reported score optimistic.
maxAcc = max(accuracy_scores)
maxAccState = seeds[accuracy_scores.index(maxAcc)]
print("The highest accuracy is:", maxAcc)
print("The random state that offers the highest accuracy is:", maxAccState)

In [None]:
# Re-create the 80/20 train/test split using the random state that scored the highest accuracy.
# NOTE(review): this split was picked because of its test accuracy, so metrics computed on it
# are an upper bound over the tried splits rather than an unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = maxAccState)

In [None]:
# Train a default logistic regression on the training split, then predict the test split.
Logit_Model = LogisticRegression().fit(X_train, y_train)
Logit_Prediction = Logit_Model.predict(X_test)

In [None]:
# Helper for plotting a binary confusion matrix:
def plot_confusion_matrix(y_test, y_pred, model_name):
    """Draw an annotated heatmap of the confusion matrix for a binary (0/1) classifier.

    Parameters
    ----------
    y_test : array-like
        True class labels (expected to be 0 or 1).
    y_pred : array-like
        Predicted class labels (expected to be 0 or 1).
    model_name : str
        Name appended to the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
    # y_test/y_pred; otherwise the matrix would be 1x1 and no longer match the
    # hard-coded row/column labels below.
    cm = confusion_matrix(y_test, y_pred, labels = [0, 1])
    conf_matrix = pd.DataFrame(data = cm,columns = ['Predicted:0','Predicted:1'], index = ['Actual:0','Actual:1'])
    sns.heatmap(conf_matrix, annot = True, fmt = 'd', cbar = False, linewidths = 0.1, annot_kws = {'size':25})
    plt.xticks(fontsize = 15)
    plt.yticks(fontsize = 15)
    plt.title("Confusion matrix for " + model_name, fontsize = 18)
    plt.show()

In [None]:
# Plot the confusion matrix of the logistic regression predictions on the test split:
plot_confusion_matrix(y_test, Logit_Prediction, "Logistic Regression")

In [None]:
# Compute the accuracy score and the classification report for the test-split predictions.
# Both are stored here and printed in the cells that follow.

Logit_Score = accuracy_score(y_test,Logit_Prediction)
Logit_Report = classification_report(y_test,Logit_Prediction)

In [None]:
# Print the test-split accuracy of the logistic regression model:
print("Logistic Regression Score:", Logit_Score)

In [None]:
# Print the classification report of the logistic regression model:
print("Logistic Regression Classification Report:\n\n", Logit_Report)