## Importing Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikitplot as skplt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Loading Data

In [None]:
# Location of the diabetes CSV. The original machine-specific path remains the
# default; set the DIABETES_CSV environment variable to load the file from
# another location without editing this cell.
reading_data = os.environ.get(
    "DIABETES_CSV", "C:/Users/ADMIN/Desktop/Datasets/diabetes.csv"
)
# Parse the CSV into the DataFrame that the following cells work on.
data = pd.read_csv(reading_data)

## Exploring data

In [None]:
# Preview the first five rows to check that the file parsed as expected.
data.head(n=5)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the columns.
data.describe()

In [None]:
# Print a concise summary of the DataFrame: column dtypes and non-null counts.
data.info()

In [None]:
# Count the missing values in each column (isnull is an alias of isna).
data.isna().sum()

In [None]:
# Stacked step histogram of BMI in 20 bins, split by Outcome
sns.histplot(
    data=data,
    x="BMI",
    hue="Outcome",
    bins=20,
    element="step",
    multiple="stack",
)

In [None]:
# Age vs BMI scatter plot, with points coloured by DiabetesPedigreeFunction
sns.set_theme()  # apply seaborn's default theme before drawing
sns.scatterplot(
    data=data,
    x="Age",
    y="BMI",
    hue="DiabetesPedigreeFunction",
)

In [None]:
# Distribution of BMI with a kernel density estimate drawn on top
sns.histplot(
    data=data,
    x="BMI",
    kde=True,
)

In [None]:
# Box plot of BMI, used to spot extreme values
sns.boxplot(
    data=data,
    x="BMI",
)

In [None]:
# Outlier cut-offs for BMI: the 0.02 and 0.99 quantiles of the column.
# NOTE(review): the lower (0.02) and upper (0.99) tails are asymmetric —
# confirm this is intended.
min_threshold, max_threshold = data["BMI"].quantile([0.02, 0.99])
# Display both cut-offs as the cell result.
min_threshold, max_threshold

In [None]:
# Inspect the rows whose BMI exceeds the upper cut-off.
data.loc[data["BMI"] > max_threshold]

In [None]:
# Keep only the rows whose BMI lies strictly between the two cut-offs.
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
data = data[(data["BMI"] > min_threshold) & (data["BMI"] < max_threshold)].copy()
# Rows and columns remaining after the filter
data.shape

In [None]:
# BMI distribution of the current `data` frame
sns.histplot(
    data=data,
    x="BMI",
)

In [None]:
# Count of rows per Outcome value
sns.countplot(
    data=data,
    x="Outcome",
)

## Train Data

In [None]:
# Flag roughly 75% of the rows as training data: draw one uniform number in
# [0, 1) per row and mark the row as a training row when its draw is <= 0.75.
# A dedicated seeded generator keeps the split (and every metric computed from
# it) reproducible across runs without touching NumPy's global random state.
split_rng = np.random.RandomState(0)
data["Is_train"] = split_rng.uniform(0, 1, len(data)) <= 0.75
# View the top 5 rows
data.head()

In [None]:
# Split into training and test dataframes using the boolean Is_train flag:
# flagged rows go to `train`, the remaining rows go to `test`.
train, test = data[data["Is_train"]], data[~data["Is_train"]]
# Show the number of observations for the test and training dataframes
print("Number of observations in the training data", len(train))
print("Number of observations in the test data", len(test))

In [None]:
# Total number of rows in `data` (training and test rows together)
data.shape[0]

In [None]:
# Feature columns are selected by position: the first eight columns of `data`.
# NOTE(review): assumes the label column comes after the first eight columns —
# confirm against the CSV layout.
features = data.columns[0:8]
# Labels of the training rows
y = train.loc[:, "Outcome"]
# Number of training labels
y.shape[0]

In [None]:
# Random forest with a fixed random_state, running on a single job
clf = RandomForestClassifier(random_state=0, n_jobs=1)
# Fit on the training rows: feature columns as inputs, Outcome as the target
clf.fit(X=train[features], y=train["Outcome"])

## Make Predictions

In [None]:
# Predict the Outcome of every test row with the trained classifier
preds = clf.predict(X=test[features])
# Display the predictions
preds

In [None]:
# Predicted Outcome for the first 10 test observations
preds[0:10]

In [None]:
# Actual Outcome of the first 10 rows of the test dataframe
test["Outcome"].head(n=10)


In [None]:
# Predicted class probabilities for the first 10 test observations
clf.predict_proba(X=test[features])[:10]

## Check Accuracy

In [None]:
# Accuracy on the test rows, expressed as a percentage
100 * accuracy_score(y_true=test["Outcome"], y_pred=preds)

## Confusion Matrix

In [None]:
# Plot the confusion matrix of true vs predicted Outcome on the test rows
skplt.metrics.plot_confusion_matrix(y_true=test["Outcome"], y_pred=preds)