In [1]:
import pandas as pd                                                                              
import numpy as np                                                                               
pd.set_option('display.width', 1000)                                                    
from sklearn.model_selection import train_test_split                                 
from sklearn.preprocessing import StandardScaler                                      
from sklearn.naive_bayes import GaussianNB                                        
from sklearn.tree import DecisionTreeClassifier                                             
from sklearn.neighbors import KNeighborsClassifier                                
from sklearn.svm import SVC                                                
from sklearn.linear_model import LogisticRegression                                    
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
                                                                              
                                                                                        
# Load the dataset from a CSV file in the current working directory.
data = pd.read_csv('diabetes.csv')

# Summarise the frame: row count, and column count minus one
# (presumably excluding the label column -- confirm against the schema).
print('Dataset used: Pima Indians Diabetes Dataset')
print('Number of instances in dataset:', data.shape[0])
print('Number of attributes in dataset:', data.shape[1] - 1)

# Plain-text preview of the first five rows.
print(data.head())

Dataset used: Pima Indians Diabetes Dataset
Number of instances in dataset: 768
Number of attributes in dataset: 8
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66             23       94  28.1                     0.167   21        0
4            0      137             40             35      168  43.1                     2.288   33        1


In [2]:
def clean_data(data, target='Outcome', lower_q=0.05, upper_q=0.95):
    """Split a frame into features/target, impute NaNs and clip outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing the feature columns and the ``target`` column.
        It is not modified.
    target : str, default 'Outcome'
        Name of the label column to separate from the features.
    lower_q, upper_q : float, default 0.05 and 0.95
        Quantiles (in ``[0, 1]``) used as per-column clipping bounds.

    Returns
    -------
    (X, y) : tuple of (pandas.DataFrame, pandas.Series)
        ``X`` holds the features with NaNs replaced by the column mean and
        every value limited to the column's ``[lower_q, upper_q]`` quantile
        range; ``y`` is the untouched ``target`` column.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy ``0 <= lower_q <= upper_q <= 1``.
    KeyError
        If ``target`` is not a column of ``data``.

    Notes
    -----
    * Means and quantiles are computed from the frame that is passed in, so
      calling this on a full dataset before a train/test split lets test rows
      influence the statistics.
    * Only NaN is imputed. NOTE(review): zero-coded measurements pass through
      unchanged -- confirm whether zeros denote missing values in this data.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )

    X = data.drop(columns=target)
    y = data[target]

    # Mean-impute without in-place mutation so the step stays idempotent.
    X = X.fillna(X.mean())

    # Bounds are taken after imputation, matching the original ordering.
    lower_limit = X.quantile(lower_q)
    upper_limit = X.quantile(upper_q)

    # axis=1 aligns the per-column bound Series with the frame's columns.
    X = X.clip(lower=lower_limit, upper=upper_limit, axis=1)

    return X, y

                                                                            
# Separate features from the label and apply the cleaning step.
X_cleaned, y_cleaned = clean_data(data)

# Hold out 20% of the rows for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned,
    y_cleaned,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
def evaluate_model(name, model, X_fit, X_eval, y_fit, y_eval):
    """Fit ``model``, predict on ``X_eval`` and print the evaluation metrics.

    Parameters
    ----------
    name : str
        Human-readable model name used in the report header.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_fit, y_fit
        Features and labels used for fitting.
    X_eval, y_eval
        Features and labels used for evaluation.

    Returns
    -------
    The predictions returned by ``model.predict(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    print(f"{name} Classification Report")
    print(classification_report(y_eval, y_pred))
    print("Accuracy:", accuracy_score(y_eval, y_pred))
    print("Precision:", precision_score(y_eval, y_pred))
    print("Recall:", recall_score(y_eval, y_pred))
    return y_pred


# Naive Bayes and the decision tree are trained on the unscaled features,
# as before.
nb_model = GaussianNB()
y_pred_nb = evaluate_model("Naive Bayes", nb_model,
                           X_train, X_test, y_train, y_test)

# Seed the tree so that a Restart & Run All reproduces the same metrics.
dt_model = DecisionTreeClassifier(random_state=42)
y_pred_dt = evaluate_model("Decision Tree", dt_model,
                           X_train, X_test, y_train, y_test)

# KNN (distance-based) and SVC (kernel-based) are sensitive to feature
# magnitudes, so they use the standardised matrices, like logistic regression.
knn_model = KNeighborsClassifier(n_neighbors=5)
y_pred_knn = evaluate_model("K-Nearest Neighbors", knn_model,
                            X_train_scaled, X_test_scaled, y_train, y_test)

svm_model = SVC()
y_pred_svm = evaluate_model("Support Vector Machine", svm_model,
                            X_train_scaled, X_test_scaled, y_train, y_test)

logreg_model = LogisticRegression(max_iter=1000)
y_pred_logreg = evaluate_model("Logistic Regression", logreg_model,
                               X_train_scaled, X_test_scaled, y_train, y_test)


Naive Bayes Classification Report
              precision    recall  f1-score   support

           0       0.81      0.75      0.78        99
           1       0.60      0.69      0.64        55

    accuracy                           0.73       154
   macro avg       0.71      0.72      0.71       154
weighted avg       0.74      0.73      0.73       154

Accuracy: 0.7272727272727273
Precision: 0.6031746031746031
Recall: 0.6909090909090909
Decision Tree Classification Report
              precision    recall  f1-score   support

           0       0.79      0.81      0.80        99
           1       0.64      0.62      0.63        55

    accuracy                           0.74       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154

Accuracy: 0.7402597402597403
Precision: 0.6415094339622641
Recall: 0.6181818181818182
K-Nearest Neighbors Classification Report
              precision    recall  f1-score   support

       