# Connect with Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive so the workbook under "My Drive" can be read through the Colab filesystem.
mount_point = '/content/drive'
drive.mount(mount_point)


Mounted at /content/drive


In [None]:
import pandas as pd

# Load the raw records from the named sheet of the workbook stored on the mounted drive.
dataset_path = '/content/drive/My Drive/diabetes_dataset.xlsx'
df = pd.read_excel(dataset_path, sheet_name='diabetes_prediction_dataset')

# Preview the first rows.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,Unnamed: 9
0,Female,80.0,0,1,never,25.19,6.6,140,0,0.085
1,Female,54.0,0,0,No Info,27.32,6.6,80,0,
2,Male,28.0,0,0,never,27.32,5.7,158,0,
3,Female,36.0,0,0,current,23.45,5.0,155,0,
4,Male,76.0,1,1,current,20.14,4.8,155,0,


# Process Categorical Columns

In [None]:
# One-hot encode the two categorical columns. drop_first=True omits the first level of each
# feature, so no gender_Female / smoking_history_No Info columns appear in the result.
categorical_cols = ['gender', 'smoking_history']
encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 'Unnamed: 9' is a stray spreadsheet column (almost entirely empty in the preview); discard it.
df = encoded.drop(columns='Unnamed: 9')
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,0,0,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,1,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,1,0,1,0,0,0,0


# Process Numerical Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous features that get standardised; the binary/indicator columns are left untouched.
numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# NOTE(review): the scaler statistics are computed from every row of df, including the rows
# that are later held out as the test set, so test data influences the preprocessing.
# Consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.692704,0,1,-0.321056,1.001706,0.047704,0,0,0,0,0,0,1,0
1,0.538006,0,0,-0.000116,1.001706,-1.42621,0,0,0,0,0,0,0,0
2,-0.616691,0,0,-0.000116,0.161108,0.489878,0,1,0,0,0,0,1,0
3,-0.261399,0,0,-0.583232,-0.49269,0.416183,0,0,0,1,0,0,0,0
4,1.515058,1,1,-1.08197,-0.67949,0.416183,0,1,0,1,0,0,0,0


In [None]:
# Preview the first five rows of the encoded and scaled frame.
df.head(n=5)

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.692704,0,1,-0.321056,1.001706,0.047704,0,0,0,0,0,0,1,0
1,0.538006,0,0,-0.000116,1.001706,-1.42621,0,0,0,0,0,0,0,0
2,-0.616691,0,0,-0.000116,0.161108,0.489878,0,1,0,0,0,0,1,0
3,-0.261399,0,0,-0.583232,-0.49269,0.416183,0,0,0,1,0,0,0,0
4,1.515058,1,1,-1.08197,-0.67949,0.416183,0,1,0,1,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split  # kept: later cells call train_test_split

# Separate the feature matrix from the target column.
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Train/Test Split with the first 10,000 examples as the test set.
# train_test_split(..., shuffle=False) would hold out the LAST `test_size` rows instead,
# so the split is done positionally to match the stated intent.
test_size = 10000
X_test, y_test = X.iloc[:test_size], y.iloc[:test_size]
X_train, y_train = X.iloc[test_size:], y.iloc[test_size:]

In [None]:
# Show which container class holds the training features.
x_train_type = type(X_train)
print("Type ", x_train_type)

Type  <class 'pandas.core.frame.DataFrame'>


In [None]:
# Show which container class holds the training labels.
y_train_type = type(y_train)
print(y_train_type)

<class 'pandas.core.series.Series'>


# Prepare the train/test split

In [None]:
# The first 10,000 rows form the held-out test set; every remaining row is used for training.
holdout_rows = 10000
test_set = df.iloc[:holdout_rows]
training_set = df.iloc[holdout_rows:]

# Split each partition into features and the 'diabetes' target.
X_test = test_set.drop(columns='diabetes')
y_test = test_set['diabetes']
X_train = training_set.drop(columns='diabetes')
y_train = training_set['diabetes']

In [None]:
# Report the row counts: test split first, then training split.
for split in (X_test, X_train):
    print(len(split))

10000
90000


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression classifier (library defaults) on the training split.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Training-set metrics: the model is scored on the same rows it was fitted on.
y_pred = lr_model.predict(X_train)

# Accuracy restricted to the rows of one true class. Each subset contains a single true
# label, so these values coincide with the per-class recall in the reports below.
diabetic_rows = y_train == 1
non_diabetic_rows = y_train == 0
accuracy_diabetes = accuracy_score(y_train[diabetic_rows], y_pred[diabetic_rows])
accuracy_non_diabetes = accuracy_score(y_train[non_diabetic_rows], y_pred[non_diabetic_rows])
accuracy = accuracy_score(y_train, y_pred)

# Precision, recall and f1-score as dictionaries, one report per class.
report_diabetes = classification_report(
    y_train, y_pred, labels=[1], target_names=['Diabetes'], output_dict=True
)
report_non_diabetes = classification_report(
    y_train, y_pred, labels=[0], target_names=['Non-Diabetes'], output_dict=True
)

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_train, y_pred)

print(accuracy_diabetes)
print(accuracy_non_diabetes)
print("total accuracy ", accuracy)
print("Confusion matrix is ", conf_matrix)
print(report_diabetes)
print(report_non_diabetes)

0.6280591545609213
0.9910028047936473
total accuracy  0.9601888888888889
Confusion matrix is  [[81618   741]
 [ 2842  4799]]
{'Diabetes': {'precision': 0.8662454873646209, 'recall': 0.6280591545609213, 'f1-score': 0.7281693346483574, 'support': 7641}, 'micro avg': {'precision': 0.8662454873646209, 'recall': 0.6280591545609213, 'f1-score': 0.7281693346483574, 'support': 7641}, 'macro avg': {'precision': 0.8662454873646209, 'recall': 0.6280591545609213, 'f1-score': 0.7281693346483574, 'support': 7641}, 'weighted avg': {'precision': 0.8662454873646209, 'recall': 0.6280591545609213, 'f1-score': 0.7281693346483574, 'support': 7641}}
{'Non-Diabetes': {'precision': 0.9663509353540137, 'recall': 0.9910028047936473, 'f1-score': 0.9785216312290567, 'support': 82359}, 'micro avg': {'precision': 0.9663509353540137, 'recall': 0.9910028047936473, 'f1-score': 0.9785216312290567, 'support': 82359}, 'macro avg': {'precision': 0.9663509353540137, 'recall': 0.9910028047936473, 'f1-score': 0.9785216312290

# Test Set evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the already-fitted logistic-regression model on the held-out test split.
y_pred = lr_model.predict(X_test)

# Boolean masks selecting the test rows of each true class.
positives = y_test == 1
negatives = y_test == 0

# Accuracy within one true class (equal to that class's recall), then overall accuracy.
accuracy_diabetes = accuracy_score(y_test[positives], y_pred[positives])
accuracy_non_diabetes = accuracy_score(y_test[negatives], y_pred[negatives])
accuracy = accuracy_score(y_test, y_pred)

# One precision/recall/f1 report per class, returned as dictionaries.
report_diabetes = classification_report(
    y_test, y_pred, labels=[1], target_names=['Diabetes'], output_dict=True
)
report_non_diabetes = classification_report(
    y_test, y_pred, labels=[0], target_names=['Non-Diabetes'], output_dict=True
)

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)

print(accuracy_diabetes)
print(accuracy_non_diabetes)
print("total accuracy ", accuracy)
print("Confusion matrix is ", conf_matrix)
print(report_diabetes)
print(report_non_diabetes)

0.6426076833527358
0.9917952084017067
total accuracy  0.9618
Confusion matrix is  [[9066   75]
 [ 307  552]]
{'Diabetes': {'precision': 0.8803827751196173, 'recall': 0.6426076833527358, 'f1-score': 0.7429340511440108, 'support': 859}, 'micro avg': {'precision': 0.8803827751196173, 'recall': 0.6426076833527358, 'f1-score': 0.7429340511440108, 'support': 859}, 'macro avg': {'precision': 0.8803827751196173, 'recall': 0.6426076833527358, 'f1-score': 0.7429340511440108, 'support': 859}, 'weighted avg': {'precision': 0.8803827751196173, 'recall': 0.6426076833527358, 'f1-score': 0.7429340511440107, 'support': 859}}
{'Non-Diabetes': {'precision': 0.9672463458871225, 'recall': 0.9917952084017067, 'f1-score': 0.9793669655395918, 'support': 9141}, 'micro avg': {'precision': 0.9672463458871225, 'recall': 0.9917952084017067, 'f1-score': 0.9793669655395918, 'support': 9141}, 'macro avg': {'precision': 0.9672463458871225, 'recall': 0.9917952084017067, 'f1-score': 0.9793669655395918, 'support': 9141},

# Use K-means clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reduce the features to 3 principal components. PCA is fitted on the training split only
# and the same projection is then applied to the test split.
pca = PCA(n_components=3)  # Choose an appropriate number of components
X_train_pca = pd.DataFrame(pca.fit_transform(X_train))
X_test_pca = pd.DataFrame(pca.transform(X_test))

# Two clusters, intended to correspond to the two classes (diabetes / non-diabetes).
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train_pca)

# K-means numbers its clusters arbitrarily, so cluster id 1 is not guaranteed to be the
# "diabetes" cluster. Align the ids with the class labels: the cluster with the higher share
# of diabetic training rows is reported as class 1. When the raw ids already agree with the
# labels this leaves the predictions unchanged.
train_clusters = kmeans.predict(X_train_pca)
diabetes_rate_cluster_0 = y_train[train_clusters == 0].mean()
diabetes_rate_cluster_1 = y_train[train_clusters == 1].mean()
flip_cluster_ids = diabetes_rate_cluster_0 > diabetes_rate_cluster_1
y_pred = 1 - train_clusters if flip_cluster_ids else train_clusters

# Accuracy within each true class (equal to per-class recall) and overall accuracy.
accuracy_diabetes = accuracy_score(y_train[y_train == 1], y_pred[y_train == 1])
accuracy_non_diabetes = accuracy_score(y_train[y_train == 0], y_pred[y_train == 0])
accuracy = accuracy_score(y_train, y_pred)
print(accuracy_diabetes)
print(accuracy_non_diabetes)
print("total accuracy ", accuracy)

# Calculate precision, recall, and f1-score separately for diabetes and non-diabetes classes
report_diabetes = classification_report(y_train, y_pred, labels=[1], target_names=['Diabetes'], output_dict=True)
report_non_diabetes = classification_report(y_train, y_pred, labels=[0], target_names=['Non-Diabetes'], output_dict=True)

# Calculate confusion matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_train, y_pred)
print("Confusion matrix is ", conf_matrix)
print(report_diabetes)
print(report_non_diabetes)



0.9789294594948306
0.5023373280394371
total accuracy  0.5428
Confusion matrix is  [[41372 40987]
 [  161  7480]]
{'Diabetes': {'precision': 0.15433181339880744, 'recall': 0.9789294594948306, 'f1-score': 0.26662864475654097, 'support': 7641}, 'micro avg': {'precision': 0.15433181339880744, 'recall': 0.9789294594948306, 'f1-score': 0.26662864475654097, 'support': 7641}, 'macro avg': {'precision': 0.15433181339880744, 'recall': 0.9789294594948306, 'f1-score': 0.26662864475654097, 'support': 7641}, 'weighted avg': {'precision': 0.15433181339880744, 'recall': 0.9789294594948306, 'f1-score': 0.26662864475654097, 'support': 7641}}
{'Non-Diabetes': {'precision': 0.9961235643945778, 'recall': 0.5023373280394371, 'f1-score': 0.6678720175636845, 'support': 82359}, 'micro avg': {'precision': 0.9961235643945778, 'recall': 0.5023373280394371, 'f1-score': 0.6678720175636845, 'support': 82359}, 'macro avg': {'precision': 0.9961235643945778, 'recall': 0.5023373280394371, 'f1-score': 0.6678720175636845,

# K-means clustering on test set

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# K-means cluster ids are arbitrary, so decide from the TRAINING data which cluster stands
# for "diabetes" (the one with the higher share of diabetic rows) and apply that same
# mapping to the test predictions. No test labels are used to choose the mapping.
train_clusters = kmeans.predict(X_train_pca)
flip_cluster_ids = y_train[train_clusters == 0].mean() > y_train[train_clusters == 1].mean()

test_clusters = kmeans.predict(X_test_pca)
y_pred = 1 - test_clusters if flip_cluster_ids else test_clusters

# Accuracy within each true class (equal to per-class recall) and overall accuracy.
accuracy_diabetes = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])
accuracy_non_diabetes = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
accuracy = accuracy_score(y_test, y_pred)
print(accuracy_diabetes)
print(accuracy_non_diabetes)
print("total accuracy ", accuracy)

# Calculate precision, recall, and f1-score separately for diabetes and non-diabetes classes
report_diabetes = classification_report(y_test, y_pred, labels=[1], target_names=['Diabetes'], output_dict=True)
report_non_diabetes = classification_report(y_test, y_pred, labels=[0], target_names=['Non-Diabetes'], output_dict=True)

# Calculate confusion matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion matrix is ", conf_matrix)
print(report_diabetes)
print(report_non_diabetes)

0.9778812572759022
0.5012580680450717
total accuracy  0.5422
Confusion matrix is  [[4582 4559]
 [  19  840]]
{'Diabetes': {'precision': 0.15558436747545842, 'recall': 0.9778812572759022, 'f1-score': 0.2684563758389262, 'support': 859}, 'micro avg': {'precision': 0.15558436747545842, 'recall': 0.9778812572759022, 'f1-score': 0.2684563758389262, 'support': 859}, 'macro avg': {'precision': 0.15558436747545842, 'recall': 0.9778812572759022, 'f1-score': 0.2684563758389262, 'support': 859}, 'weighted avg': {'precision': 0.15558436747545842, 'recall': 0.9778812572759022, 'f1-score': 0.2684563758389262, 'support': 859}}
{'Non-Diabetes': {'precision': 0.9958704629428385, 'recall': 0.5012580680450717, 'f1-score': 0.6668607189637609, 'support': 9141}, 'micro avg': {'precision': 0.9958704629428385, 'recall': 0.5012580680450717, 'f1-score': 0.6668607189637609, 'support': 9141}, 'macro avg': {'precision': 0.9958704629428385, 'recall': 0.5012580680450717, 'f1-score': 0.6668607189637609, 'support': 91

In [None]:
# Preview the first five training rows (the row labels start at 10000, right after the test rows).
training_set.head(n=5)

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
10000,-0.616691,0,0,0.420271,0.908306,1.521618,0,0,0,0,0,1,0,0
10001,-0.971982,0,0,-0.000116,-0.49269,0.514444,0,1,0,0,0,0,0,0
10002,-0.216988,0,0,2.252493,1.001706,-1.180558,0,0,0,0,0,0,0,1
10003,0.982121,0,0,0.51821,0.534707,0.489878,0,0,0,0,0,0,1,0
10004,-1.726976,0,0,-1.173883,0.628107,0.047704,0,0,0,0,0,0,0,0


In [None]:
# No randomization

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assume 'df' is your original DataFrame
# 1. Handle Categorical Variables
# Encode only the categorical columns that are still present, so the cell also works when
# df was already one-hot encoded by an earlier cell (re-encoding would raise a KeyError).
categorical_cols = [col for col in ['gender', 'smoking_history'] if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 2. Handle Missing Values (replace NaN with the column mean in this example)
# numeric_only=True keeps this working if any non-numeric column remains in the frame.
df = df.fillna(df.mean(numeric_only=True))

# 3. Prepare Target Variable
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# 4. Split the dataset into training and testing sets (shuffled 80/20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
import pandas as pd

workbook_path = '/content/drive/My Drive/upwork/diabetes_prediction_dataset.xlsx'

# (sheet name, message) pairs, processed in order. The messages are kept verbatim,
# including their differing spacing.
sheet_messages = [
    ('diabetes_prediction_dataset', "Total number of records in dataset "),
    ('diabetes_numbers', "Total number of records in dataset diabetes_numbers "),
    ('diabetes_clean', "Total number of records in dataset diabetes_clean "),
    ('test_data(logistic regression)', "Total number of records in dataset test_data(logistic regression)"),
    ('train_data(logistic regression)', "Total number of records in dataset  train_data(logistic regression)"),
]

# Each iteration rebinds df, so after the loop df holds the last sheet listed above.
for sheet, message in sheet_messages:
    df = pd.read_excel(workbook_path, sheet_name=sheet)
    print(message, len(df))


Total number of records in dataset  100000
Total number of records in dataset diabetes_numbers  100000
Total number of records in dataset diabetes_clean  100000
Total number of records in dataset test_data(logistic regression) 10002
Total number of records in dataset  train_data(logistic regression) 90002


In [None]:
# NOTE(review): `df1` is not assigned in any cell visible here, so this cell raises NameError
# on a fresh kernel. The recorded output has no gender_Female / smoking_history_No Info
# columns and still contains 'Unnamed: 9' -- presumably pd.get_dummies(..., drop_first=True)
# applied to the raw frame; confirm and restore the defining cell.
df1.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,Unnamed: 9,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,0.085,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,,0,0,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,,1,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,,1,0,1,0,0,0,0


In [None]:
# NOTE(review): `df2` is not assigned in any cell visible here, so this cell raises NameError
# on a fresh kernel. The recorded output keeps every dummy level (gender_Female,
# smoking_history_No Info) -- presumably pd.get_dummies without drop_first on the raw frame;
# confirm and restore the defining cell.
df2.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,Unnamed: 9,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,0.085,1,0,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,,1,0,0,1,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,,0,1,0,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,,1,0,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,,0,1,0,0,1,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Handle Missing Values (replace NaN with the column mean in this example)
# numeric_only=True is required when df still holds string columns such as 'gender':
# recent pandas versions raise a TypeError for df.mean() on non-numeric columns.
df = df.fillna(df.mean(numeric_only=True))

# 2. Prepare Target Variable
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# 3. Split the dataset into training and testing sets (shuffled 80/20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Scale Numerical Features
# The scaler is fitted on the training rows only and then applied to both splits, so no
# statistics from the test rows leak into preprocessing. df and X are left unscaled.
numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
scaler = StandardScaler()
X_train = X_train.copy()  # explicit copies: avoid writing into a view of X
X_test = X_test.copy()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [None]:
# List the column labels of the current frame. The recorded output shows the raw,
# un-encoded columns ('gender', 'smoking_history', 'Unnamed: 9').
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes', 'Unnamed: 9'],
      dtype='object')

In [None]:
# prompt: I have column gender with values male/female/other. convert it for logistic regression

# The dataset stores the values capitalised ('Male', 'Female', 'Other'), so the lower-case
# keys alone never matched; both spellings are mapped here.
gender_codes = {'Male': 0, 'male': 0, 'Female': 1, 'female': 1, 'Other': 2}

# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection, which is not guaranteed to update df itself.
df['gender'] = df['gender'].replace(gender_codes)


In [None]:
# prompt: read with pandas dataframe a particular sheet name

# Rebind df to the contents of one named sheet of the workbook on the mounted drive.
sheet1_path = '/content/drive/My Drive/diabetes_dataset.xlsx'
df = pd.read_excel(sheet1_path, sheet_name='Sheet1')


In [None]:
# List the column labels of the current frame. The recorded output shows the one-hot
# encoded columns, without 'gender', 'smoking_history' or 'Unnamed: 9'.
df.columns

Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes', 'gender_Male', 'gender_Other',
       'smoking_history_current', 'smoking_history_ever',
       'smoking_history_former', 'smoking_history_never',
       'smoking_history_not current'],
      dtype='object')