In [1]:
import pandas as pd
import numpy as np


### **Reading the data**

In [2]:
# Load the diabetes table from the CSV file (path relative to the working
# directory) and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="diabetes.csv")
dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## **Divide the data into features and target**

In [3]:
# Predictor columns fed to the models; "Outcome" is kept apart as the label.
diabetes_features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
diabetes_output = ["Outcome"]

X = dataset[diabetes_features]
# Selecting with a one-element list keeps y a single-column DataFrame
# rather than a Series.
y = dataset[diabetes_output]


**Features**


In [4]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


**Target (output)**

In [5]:
# Preview the first five rows of the target column.
y.head(n=5)

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


### **Split the data**

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

**Handling the missing values**

In [7]:
from sklearn.impute import SimpleImputer

# The imputer is fitted on the training split only and then reused, unchanged,
# on the test split.
my_imputer = SimpleImputer()

# The imputer output is wrapped back into DataFrames; the original column
# names are re-attached in the constructor.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(X_test),
    columns=X_test.columns,
)

# NOTE(review): SimpleImputer() with default arguments only fills NaN entries.
# The preview rows show 0 in Insulin / SkinThickness, which may be placeholders
# for missing readings -- confirm whether those zeros should be imputed too.

### **Logistic regression model**

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error

# Train on the imputed training features; the single-column label frame is
# flattened to a 1-D array for fit().
logistic_model = LogisticRegression(max_iter=180)
logistic_model.fit(imputed_X_train, y_train.to_numpy().ravel())

# Predicted labels for the held-out rows (reused by the metrics cell further down).
X_validation_logistic = logistic_model.predict(imputed_X_test)

# Mean absolute error between true and predicted labels; with 0/1 labels this
# equals the share of misclassified rows (i.e. 1 - accuracy).
logistic_error = mean_absolute_error(y_true=y_test, y_pred=X_validation_logistic)
print(logistic_error)

0.17532467532467533


### **KNN model**

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Feature scaling: the scaler's statistics come from the training split only
# and are then applied to both splits.
# NOTE: this rebinds imputed_X_train / imputed_X_test to the scaled arrays, so
# any cell re-run after this one sees scaled data instead of the DataFrames
# produced by the imputation step.
X_scale = StandardScaler().fit(imputed_X_train)
imputed_X_train = X_scale.transform(imputed_X_train)
imputed_X_test = X_scale.transform(imputed_X_test)

## **Confusion matrix for KNN**

In [10]:
# Fit a k-nearest-neighbours classifier (27 neighbours, Euclidean distance) on
# the training features and predict labels for the test split.
# y_train is a single-column DataFrame, so it is flattened to 1-D for fit().
KNN_model = KNeighborsClassifier(n_neighbors=27, p=2, metric='euclidean')
KNN_model.fit(imputed_X_train, y_train.values.ravel())

# Predicted labels for the held-out rows (used by the confusion-matrix and
# metrics cells below).
X_validation_KNN = KNN_model.predict(imputed_X_test)

In [11]:
# Confusion matrix for the KNN predictions: rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=X_validation_KNN)
print(cm)

[[99  8]
 [24 23]]


### **F1 score and accuracy for logistic regression and KNN**

In [12]:
# Report F1 and accuracy on the test split for each model, KNN first and then
# logistic regression.
for model_name, predictions in (
    ("KNN", X_validation_KNN),
    ("logistic", X_validation_logistic),
):
    print(f"the f1 score for {model_name} is {f1_score(y_test, predictions)}")
    print(f"the accuracy for {model_name} is {accuracy_score(y_test, predictions)} ")

the f1 score for KNN is 0.5897435897435898
the accuracy for KNN is 0.7922077922077922 
the f1 score for logistic is 0.6823529411764706
the accuracy for logistic is 0.8246753246753247 
