Diabetes Prediction

In [1]:
# Step 1: Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Step 2: Load the Dataset
# Absolute path of the CSV file, and the DataFrame parsed from it
file_path = "/content/diabetes_prediction_dataset.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Step 3: Explore the Dataset
# DataFrame.info() writes its summary straight to stdout and returns None, so
# passing its return value to print() emits the summary first and then the
# label followed by "None". Print the label, then call info() on its own.
print("Dataset Info:")
data.info()

# Preview the first rows to eyeball column values and types
print("\nFirst 5 Rows:\n", data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB
Dataset Info:
 None

First 5 Rows:
    gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0    

In [6]:
# Step 4: Preprocess the Data
# Separate the label column from the predictor columns.
# Note: 'gender' and 'smoking_history' are still un-encoded strings at this point.
y = data['diabetes']  # Target variable
X = data.drop('diabetes', axis=1)  # Features: every column except the target

# Sanity check: number of null entries in each column
missing_counts = data.isna().sum()
print("\nMissing Values:\n", missing_counts)



Missing Values:
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [10]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Step 5: Encode categorical features ('gender' and 'smoking_history')
# Use one encoder per column. A LabelEncoder only remembers the classes from its
# most recent fit, so sharing a single instance across both columns would discard
# the gender mapping and make it impossible to inverse_transform later.
gender_encoder = LabelEncoder()
smoking_encoder = LabelEncoder()

# Replace the string categories with integer codes. The exact code assigned to
# each category is available in `gender_encoder.classes_` /
# `smoking_encoder.classes_` (position in the array == integer code).
data['gender'] = gender_encoder.fit_transform(data['gender'])
data['smoking_history'] = smoking_encoder.fit_transform(data['smoking_history'].astype(str))

# Backward-compatible alias: previously a single `label_encoder` was used and it
# ended up fitted on 'smoking_history'.
label_encoder = smoking_encoder

# NOTE(review): integer codes impose an arbitrary ordering on nominal categories
# such as smoking_history; consider one-hot encoding if that ordering matters
# for the downstream linear model.

# Features and target
X = data.drop(columns=['diabetes'])  # Features (now fully numeric)
y = data['diabetes']  # Target variable


In [9]:
# Step 6: Scale the numerical features
# Standardise every column of X. The scaler's statistics are learned from all
# rows of X (no train/test separation happens inside this cell).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Show the first 5 standardised rows as a quick verification
print("Scaled Features:\n", X_scaled[:5])


Scaled Features:
 [[-8.41046744e-01  1.69270354e+00 -2.84439447e-01  4.93637859e+00
   9.63326710e-01 -3.21055792e-01  1.00170572e+00  4.77042159e-02]
 [-8.41046744e-01  5.38006427e-01 -2.84439447e-01 -2.02577655e-01
  -1.15346777e+00 -1.15583678e-04  1.00170572e+00 -1.42620999e+00]
 [ 1.18723364e+00 -6.16690686e-01 -2.84439447e-01 -2.02577655e-01
   9.63326710e-01 -1.15583678e-04  1.61108022e-01  4.89878478e-01]
 [-8.41046744e-01 -2.61399267e-01 -2.84439447e-01 -2.02577655e-01
  -6.24269153e-01 -5.83232300e-01 -4.92690191e-01  4.16182767e-01]
 [ 1.18723364e+00  1.51505783e+00  3.51568677e+00  4.93637859e+00
  -6.24269153e-01 -1.08197037e+00 -6.79489680e-01  4.16182767e-01]]


In [11]:
# Step 7: Split the Data into Training and Testing Sets
from sklearn.model_selection import train_test_split

# Split the *unscaled* features so the scaler never sees the test rows.
# Standardising the full dataset before splitting leaks the test set's
# mean/variance into the training features (data leakage).
# 30% of the rows are held out; random_state keeps the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [12]:
# Step 8: Train the Model (Logistic Regression)
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 1000 to give the solver more iterations to converge
classifier = LogisticRegression(max_iter=1000)

# Fit on the training partition
classifier.fit(X=X_train, y=y_train)


In [13]:
# Step 9: Make Predictions
# Predicted class label for every row of the test partition
y_pred = classifier.predict(X=X_test)

In [14]:
# Step 10: Evaluate the Model
from sklearn.metrics import accuracy_score, classification_report

# Overall fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy Score:", accuracy)

# Per-class precision / recall / F1 / support breakdown
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


Accuracy Score: 0.9589

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     27453
           1       0.86      0.61      0.72      2547

    accuracy                           0.96     30000
   macro avg       0.91      0.80      0.85     30000
weighted avg       0.96      0.96      0.96     30000

