# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
import warnings
warnings.filterwarnings('ignore')

# EDA

In [None]:
# Read CSV Data
# Load "diabetes.csv" (path relative to the working directory) into a DataFrame.
data = pd.read_csv("diabetes.csv")

In [None]:
# Preview the first 10 rows of the dataset.
data.head(10)

## Summary of the Diabetes Dataset

This dataset aims to aid in the diagnosis of diabetes by providing a collection of diagnostic measurements for 768 patients. Each row represents a single patient, and the features describe various health factors potentially related to diabetes.

**Attributes:**

* **Pregnancies:** Number of times a woman has been pregnant
* **Glucose:** Blood glucose level
* **BloodPressure:** Blood pressure measurement in mm Hg
* **SkinThickness:** Skin thickness (mm)
* **Insulin:** Blood insulin level (μU/mL)
* **BMI:** Body mass index (calculated from weight and height)
* **DiabetesPedigreeFunction:** Function score based on family history of diabetes
* **Age:** Patient's age (years)
* **Outcome:** Indicates presence of diabetes (1 - Yes, 0 - No)


In [None]:
# Report the dimensions of the dataset: number of rows and number of columns.
row, col = data.shape
print(f"Number of Row's in Data : {row}")
print(f"Number of Col's in Data : {col}")

In [None]:
# Remove exact duplicate rows (if there are any), keeping the first occurrence.
has_duplicates = data.duplicated().any()
if has_duplicates:
    print("Found duplicate rows. Removing them...")
    data = data.drop_duplicates()

In [None]:
# Check for missing values
# Count NaNs per column and report only the columns that actually contain some
# (listing every column name would make the message misleading).
# NOTE(review): this only detects NaN; zero values in columns such as Insulin or
# SkinThickness may be placeholders for missing measurements — confirm.
missing_values = data.isna().sum()
columns_with_missing = missing_values[missing_values > 0].index.tolist()
if columns_with_missing:
    print(f"Missing values found in {columns_with_missing}. Consider imputation techniques.")

In [None]:
# Print column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics per column, transposed so that each feature is a row.
data.describe().T

In [None]:
# Number of rows per Outcome class.
data["Outcome"].value_counts()

# Visualization

In [None]:
# Plotting For Diabetes Status
# `labels` is ordered by class code: index 0 -> Outcome 0, index 1 -> Outcome 1.
labels = ['Non-Diabetic', 'Diabetic']
plt.figure(figsize=(15, 5))

# Left panel: number of patients per Outcome class.
plt.subplot(1, 2, 1)
sns.countplot(data=data, x='Outcome', hue='Outcome', alpha=1)
plt.legend(labels)
plt.title('Histogram For Diabetes Status', size=15)

# Right panel: class share as a donut chart.
plt.subplot(1, 2, 2)
# value_counts() sorts by frequency, not by class code. Re-index by class code
# so the counts always line up with `labels`, even if class 1 were the majority
# or a class were absent.
y = data['Outcome'].value_counts().reindex([0, 1], fill_value=0)
explode = [0, 0.02]
plt.pie(y, labels=labels, explode=explode, autopct='%.2f%%')
plt.axis('equal')
plt.legend(labels)
# A white disc drawn over the centre turns the pie into a donut.
circle = plt.Circle(xy=(0, 0), radius=0.75, facecolor='white', edgecolor='black')
plt.gca().add_artist(circle)
plt.title('Pie Chart for Diabetes Status', size=15)
plt.show()

#### By observing the charts, we can confirm that the dataset contains a higher proportion of Non-Diabetic patients (500, corresponding to 65.10% as displayed on the chart) compared to Diabetic patients (268, corresponding to 34.90%). This initial exploration highlights a potential class imbalance within the data.

In [None]:
# Distribution of patient ages (plain histogram, no density curve).
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data['Age'], kde=False, color='skyblue', ax=ax)
ax.set_title('Histogram For Age', size=15)
plt.show()

#### By observing the histogram, we can see a right-skewed distribution. The x-axis represents age groups, and the y-axis indicates the number of patients in each age group. This skewness suggests a higher concentration of individuals in younger age groups, particularly between 21 and 28 years old. The number of patients seems to decrease in older age categories.

In [None]:
# Function for Stacked Histograms with Outcome Coloring
def plot_stacked_histograms(data, features, figsize=(15, 12)):
    """Draw one stacked histogram per feature, split by the ``Outcome`` column.

    The histograms are laid out on a grid three columns wide, filled row by
    row in the order given by ``features``.

    Args:
        data: DataFrame containing every column named in ``features`` plus an
            ``Outcome`` column used for the hue.
        features: Sequence of column names to plot.
        figsize: Size of the whole figure, passed to ``plt.figure``.
    """
    num_cols = 3
    # Ceiling division: just enough rows to hold every feature, with no spare
    # empty row when the feature count is an exact multiple of the column count.
    num_rows = (len(features) + num_cols - 1) // num_cols

    plt.figure(figsize=figsize)
    # Subplot positions are 1-based and run left-to-right, top-to-bottom.
    for position, feature in enumerate(features, start=1):
        plt.subplot(num_rows, num_cols, position)
        sns.histplot(data=data, x=feature, hue="Outcome", multiple="stack", kde=True)
        plt.title(f"{feature} by Outcome")

    plt.tight_layout()
    plt.show()

### Visualizing Feature Distributions by Outcome:

In [None]:
# Plot the outcome-coloured stacked histogram of every listed feature.
features = [
    'Age',
    'DiabetesPedigreeFunction',
    'BMI',
    'Insulin',
    'SkinThickness',
    'BloodPressure',
    'Glucose',
]
plot_stacked_histograms(data, features)

#### This section employs stacked histograms to visually represent the distribution of various features within the dataset. Each histogram is overlaid with different colors to distinguish between patients classified as diabetic and non-diabetic.

#### By analyzing these stacked histograms, we can observe how the distribution of each feature (e.g., 'Age', 'BMI') varies across the two outcome groups. This visualization technique allows us to identify potential patterns or relationships between feature values and the presence or absence of diabetes.

In [None]:
# Function for scatter plots of Age against each of the other features
def plot_scatter_age_distribution(data, features, figsize=(10, 30)):
    """Scatter-plot ``Age`` (y-axis) against each listed feature (x-axis).

    One panel is drawn per feature; the panels are stacked vertically in a
    single column, in the order given by ``features``.

    Args:
        data: DataFrame containing an ``Age`` column and every column named
            in ``features``.
        features: Sequence of column names to put on the x-axis.
        figsize: Size of the whole figure, passed to ``plt.figure``.
    """
    panel_count = len(features)

    plt.figure(figsize=figsize)
    # Subplot positions are 1-based.
    for panel, column in enumerate(features, start=1):
        plt.subplot(panel_count, 1, panel)
        sns.scatterplot(data=data, x=column, y="Age")
        plt.title(f"Age Distribution VS {column}")

    plt.tight_layout()
    plt.show()

### Visualizing Feature Distributions by Age:

In [None]:
# Scatter-plot Age against every other predictor column.
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
]
plot_scatter_age_distribution(data, features)

#### Here, we can observe the relationships between age and various features in the dataset using scatter plots. These plots allow us to visualize how age is distributed for different values of each feature. For instance, the scatter plot for 'BloodPressure' might reveal whether blood pressure tends to be higher or lower across different age groups.

#### By analyzing these scatter plots, we can gain insights into potential correlations or patterns between age and other health factors.

In [None]:
# Correlation Matrix
# Pairwise correlation between all columns, drawn as an annotated heatmap.
correlation_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Training And Testing Sets

In [None]:
# Separate the target column from the predictor columns.
y = data['Outcome']  # dependent variable (target)
x = data.drop(columns='Outcome')  # independent variables (features)

*  Splitting Data for Training and Testing:

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 makes the split reproducible.
# NOTE(review): the split is not stratified on y, so the class ratio may differ
# between the two splits — consider stratify=y given the class imbalance.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

# Feature Scaling

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to the test split, so no
# test-set statistics are used during fitting.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Report how many samples ended up in each split.
print(f'Length of Training Set : {len(x_train)}')
print(f'Length of Testing Set : {len(x_test)}')

# Model Execution:

### K-Nearest Neighbors (KNN)

#### Importance of Hyperparameter Tuning (K):

In K-Nearest Neighbors (KNN), the `n_neighbors` parameter (denoted by K) significantly impacts model performance, so choosing a good K value is crucial for achieving the best accuracy. It is tempting to try only a few K values manually, as I first did with 1, 3, and 5. However, this approach does not always lead to the best outcome: my initial guess of K = 5 resulted in an accuracy of only around 74%.

A more systematic approach, like hyperparameter tuning with cross-validation, can help identify a potentially better K value. This approach is especially beneficial for datasets where the optimal K might not be readily apparent.

* This section outlines the process of hyperparameter tuning for K using cross-validation:

In [None]:
# Evaluate K = 1..39 with 10-fold cross-validation and record the mean accuracy.
#
# Tuning runs on the scaled training split only:
#   * KNN is distance-based, so K must be chosen on the same standardized
#     features the final model is fitted on;
#   * the held-out test rows must not influence the choice of K, otherwise the
#     test score reported later is optimistically biased.
# NOTE(review): the scaler was fitted on the whole training split, so each CV
# fold still sees scaling statistics from its validation part; wrapping the
# scaler and classifier in a Pipeline would remove that residual leakage.
k_values = list(range(1, 40))
scores = []

for k in k_values:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn_model, x_train, y_train, cv=10)
    scores.append(np.mean(score))

In [None]:
# Visualize K vs Accuracy
# Mean cross-validated accuracy for every candidate K.
ax = sns.lineplot(x=k_values, y=scores, marker='o')
ax.set_xlabel("K Values")
ax.set_ylabel("Accuracy Score")
plt.show()

In [None]:
# Pick the K with the highest mean cross-validated accuracy
# (np.argmax returns the first maximum, i.e. the smallest such K on ties).
best_index = np.argmax(scores)
best_k = k_values[best_index]

# Fit a fresh classifier with that K on the training split.
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(x_train, y_train)

# Model evaluation

In [None]:
# Predict on the held-out test features and compute accuracy against the true labels.
y_pred = knn_model.predict(x_test) 
knn_accuracy = accuracy_score(y_test, y_pred)
knn_accuracy

In [None]:
# Evaluate The Model
# Accuracy on the data the model was fitted on versus the held-out test split;
# a large gap between the two suggests over-fitting.
train_predictions = knn_model.predict(x_train)
Training_score = accuracy_score(y_train, train_predictions)
Testing_score = accuracy_score(y_test, y_pred)

# Precision, Recall, F1-score (computed on the test split)
precision, recall, f1 = (
    metric(y_test, y_pred) for metric in (precision_score, recall_score, f1_score)
)

# Classification Report (dict form, reused later for plotting)
class_names = ['Non-Diabetic', 'Diabetic']
report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True)

# Print the relevant metrics
summary = (
    ('Training Score (Accuracy):', Training_score),
    ('Testing Score (Accuracy):', Testing_score),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1-Score:', f1),
)
for caption, value in summary:
    print(caption, value)

# Print detailed Classification Report
print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Turn the report dict into a table, drop its last metric row (presumably
# 'support', per classification_report's key order) and transpose so that each
# class / average becomes a row of the heatmap.
report_table = pd.DataFrame(report).iloc[:-1, :].T

plt.figure(figsize=(8, 6))
sns.heatmap(report_table, annot=True, cmap='Blues', linewidths=.5)
plt.title('Classification Report for KNN')
plt.show()

In [None]:
# Confusion matrix on the test split.
# The result is stored under its own name (`cm`): assigning it to
# `confusion_matrix` would overwrite the imported function and make this cell
# fail when it is executed a second time.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Non-Diabetic", "Diabetic"])
cm_display.plot()
plt.show()

In [None]:
# Scale raw samples and predict their outcome with the given model
def predict_diabetes(data, model):
    """Return the model's predictions for raw (unscaled) samples.

    The samples are first transformed with the module-level ``scaler`` and the
    scaled values are then passed to ``model.predict``.

    Args:
        data: 2-D array-like of raw feature rows accepted by ``scaler.transform``.
        model: Fitted estimator exposing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the scaled samples.
    """
    scaled_samples = scaler.transform(data)
    return model.predict(scaled_samples)

import joblib

# Create a sample data
# Five raw (unscaled) samples with eight feature values each.
sample_data = [
    [6, 148, 72, 35, 0, 33.6, 0.627, 50],
    [1, 85, 66, 29, 0, 26.6, 0.351, 31],
    [8, 183, 64, 0, 0, 23.3, 0.672, 32],
    [1, 89, 66, 23, 94, 28.1, 0.167, 21],
    [0, 137, 40, 35, 168, 43.1, 2.288, 33],
]

# Predict the outcome with the in-memory model
predictions = predict_diabetes(sample_data, knn_model)
for sample_no, prediction in enumerate(predictions, start=1):
    status = 'Diabetic' if prediction == 1 else 'Non-Diabetic'
    print(f"Prediction for Sample {sample_no}: {status}")

# Save the model
joblib.dump(knn_model, 'knn_diabetes_model.pkl')
print("Model Saved Successfully!")

# Load the model back from disk
loaded_model = joblib.load('knn_diabetes_model.pkl')

# Predict the outcome again, this time with the reloaded model
predictions = predict_diabetes(sample_data, loaded_model)
for sample_no, prediction in enumerate(predictions, start=1):
    status = 'Diabetic' if prediction == 1 else 'Non-Diabetic'
    print(f"Prediction for Sample {sample_no}: {status}")

# Save the scaler (needed to transform raw inputs before using the saved model)
joblib.dump(scaler, 'scaler.pkl')
print("Scaler Saved Successfully!")


In [50]:
import joblib

# Five raw (unscaled) samples with eight feature values each.
sample_data = [
    [6, 148, 72, 35, 0, 33.6, 0.627, 50],
    [1, 85, 66, 29, 0, 26.6, 0.351, 31],
    [8, 183, 64, 0, 0, 23.3, 0.672, 32],
    [1, 89, 66, 23, 94, 28.1, 0.167, 21],
    [0, 137, 40, 35, 168, 43.1, 2.288, 33],
]

# Reload the persisted model from disk.
loaded_model = joblib.load('knn_diabetes_model.pkl')

# predict_diabetes() scales the samples with the in-memory `scaler` before predicting.
predictions = predict_diabetes(sample_data, loaded_model)
for sample_no, prediction in enumerate(predictions, start=1):
    status = 'Diabetic' if prediction == 1 else 'Non-Diabetic'
    print(f"Prediction for Sample {sample_no}: {status}")

Prediction for Sample 1: Diabetic
Prediction for Sample 2: Non-Diabetic
Prediction for Sample 3: Diabetic
Prediction for Sample 4: Non-Diabetic
Prediction for Sample 5: Diabetic


In [52]:
import joblib
import numpy as np
import warnings

# Silence warnings whose message starts with "Trying to unpickle estimator"
# (presumably scikit-learn's version-mismatch warning raised when loading the
# saved model — confirm).
warnings.filterwarnings("ignore", message="Trying to unpickle estimator.*")

def predict_loan_approval(pregnancies: int, glucose: int, bloodPressure: int, skinthickness: int, insulin: int, diabetespedigreefunction: float, bmi: float, age: int) -> str:
    """Predict whether one patient is diabetic, using the saved KNN model.

    Despite its name, this function has nothing to do with loans; the name is
    kept only so that existing callers keep working.

    The persisted model was fitted on standardized features, so the raw
    measurements are first transformed with the persisted scaler
    ('scaler.pkl') and only then passed to the model.

    Args:
        pregnancies: Number of pregnancies.
        glucose: Glucose measurement.
        bloodPressure: Blood pressure measurement.
        skinthickness: Skin thickness measurement.
        insulin: Insulin measurement.
        diabetespedigreefunction: Diabetes pedigree function score.
        bmi: Body mass index.
        age: Age in years.

    Note:
        ``diabetespedigreefunction`` comes *before* ``bmi`` in this signature,
        which is the opposite of the column order in the dataset. Pass these
        two arguments by keyword to avoid swapping them.

    Returns:
        "Yes" if the model predicts class 1, otherwise "No".
    """
    # Only load pickle files you trust: unpickling can execute arbitrary code.
    loaded_model = joblib.load('knn_diabetes_model.pkl')
    loaded_scaler = joblib.load('scaler.pkl')

    # One row, arranged in the feature order used when the feature row is built
    # for the model (BMI before DiabetesPedigreeFunction).
    new_data = np.array([[pregnancies, glucose, bloodPressure, skinthickness, insulin, bmi, diabetespedigreefunction, age]])

    # Apply the training-time standardization before predicting; feeding raw
    # values to the model would compare them against scaled neighbours.
    prediction = loaded_model.predict(loaded_scaler.transform(new_data))

    # Return prediction as "Yes" or "No"
    return "Yes" if prediction[0] == 1 else "No"

# Call by keyword: the function's signature lists diabetespedigreefunction
# before bmi, so passing the values positionally in dataset order would swap
# the two.
predict_loan_approval(
    pregnancies=6,
    glucose=148,
    bloodPressure=72,
    skinthickness=35,
    insulin=0,
    bmi=33.6,
    diabetespedigreefunction=0.627,
    age=50,
)


'Yes'