## Necessary Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm

# Switch on seaborn's default theme so the matplotlib figures drawn below use it.
sns.set()

## Loading the Data

In [None]:
# Read the diabetes dataset (expected next to the notebook) into a DataFrame
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')
data.head(n=5)

In [None]:
# Summary statistics for every column, whatever its dtype.
summary_stats = data.describe(include='all')
summary_stats

## Plotting each Feature with respect to the Outcome

In [None]:
# Draw one scatter plot per input feature against the 'Outcome' column so the
# relationship between each feature and the target can be inspected visually.
for column in data.columns:
    if column == 'Outcome':
        continue  # the target is the y-axis of every plot, not a feature
    # One single-axes figure per feature; `sharey` was dropped because it has
    # no effect when the figure contains only one subplot.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(data[column], data['Outcome'])
    ax.set_title('{} and Outcome'.format(column))
    plt.show()

## Setting the Target value and the Input features

In [None]:
# Separate the feature columns from the label column.
inputs = data.drop(columns=['Outcome'])
targets = data['Outcome']

## Test and Train Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_tr, x_te, y_tr, y_te = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=365,
)

## Obtaining the statistics of the data using statsmodels

In [None]:
# statsmodels does not add an intercept by itself, so a constant column is
# added to the features first.
# NOTE: despite its name, `input_scaled` is not rescaled in any way; it is the
# raw feature frame plus the constant column. The model here is fitted on the
# full dataset (`inputs`/`targets`), not on the training split.
input_scaled = sm.add_constant(inputs)
reg_log = sm.Logit(endog=targets, exog=input_scaled)
results_log = reg_log.fit()
results_log.summary()

## Performing the Regression using sklearn

In [None]:
# Fit scikit-learn's logistic regression on the training split only.
# max_iter is raised above the library default, presumably so the solver
# converges on the unscaled features (TODO confirm).
LogReg = LogisticRegression(max_iter=1000).fit(x_tr, y_tr)
LogReg  # echo the fitted estimator, as the original cell did

## Testing the Model

In [None]:
# Predict labels for the held-out rows and show the first 20 side by side
# with the true labels.
y_pred = LogReg.predict(x_te)
comparison = pd.DataFrame({'Y test': y_te, 'Y predicted': y_pred})
comparison.head(20)

## Obtaining the accuracy of the model

In [None]:
# Tabulate true vs. predicted labels on the test split and print the overall
# fraction of correct predictions.
cnf_mat = metrics.confusion_matrix(y_true=y_te, y_pred=y_pred)
print(metrics.accuracy_score(y_true=y_te, y_pred=y_pred))

In [None]:
# Render the confusion matrix as an annotated heatmap.
# `sns.heatmap` returns the Axes it drew on; binding it here makes the label
# tweaks below act on this figure instead of a stale `ax` left over from an
# earlier cell (or failing with NameError when that cell has not been run).
ax = sns.heatmap(pd.DataFrame(cnf_mat), annot=True, cmap='coolwarm', fmt='g')
ax.xaxis.set_label_position("top")
# The x label now sits above the matrix, so lift the title to keep them apart.
ax.set_title('Diabetes Confusion Matrix', y=1.1)
ax.set_ylabel('Real Values')
ax.set_xlabel('Predicted Values')
# Lay out last so the title and labels are included in the fit.
plt.tight_layout()

## Checking the Error Distribution

In [None]:
# Density of the prediction errors (true label minus predicted label).
# For 0/1 labels the error is 0 when correct, +1 for a missed positive and
# -1 for a false positive.
# `sns.distplot` is deprecated; with the histogram disabled it only drew a
# KDE curve, which `sns.kdeplot` draws directly.
sns.kdeplot(y_te - y_pred)