In [1]:

import numpy as np                                                 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
# Uncomment the following if you want to use cross-validation
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

In [10]:

# Load dataset from CSV file
df = pd.read_csv("C:/Users/ASUS/Downloads/diabetes.csv")

In [11]:
# Exploratory Data Analysis
print("First 5 rows of the dataset:\n", df.head())
df.tail(10)
df.sample(10)
df.shape
df.dtypes
df.info()
print("\nDataset statistics:\n", df.describe())

First 5 rows of the dataset:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1  

In [12]:

# Data Cleaning
df.drop_duplicates()
df.isnull().sum()
df.isna().sum()
     

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [13]:

# Check number of zeroes in a specific column (if applicable)
# For example:
print("No of zero values in Glucose:", df[df['Glucose'] == 0].shape[0])

# Replace zero values with mean in specific columns (if necessary)
# df['Glucose'].replace(0, df['Glucose'].mean(), inplace=True)

# Encode categorical variables (if any, adjust as needed)
# Example: df['Gender'] = label_encoder.fit_transform(df['Gender'])

No of zero values in Glucose: 5


In [14]:

# Separate target variable and features
target_name = 'Outcome'  # Adjust target variable name as per dataset
target = df[target_name]
data = df.drop(columns=[target_name])  # Drop target variable

In [15]:
# Scale numerical features (optional, depending on the dataset)
scaler = StandardScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [16]:

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [17]:

# Implement Decision Tree algorithm
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [18]:
# Making Predictions
dt_pred = dt.predict(X_test)
dt_pred.shape

(154,)

In [19]:

# Model evaluation
print("Model Training Complete")
print("Train set accuracy:", dt.score(X_train, y_train))
print("Test set accuracy:", dt.score(X_test, y_test))

Model Training Complete
Train set accuracy: 1.0
Test set accuracy: 0.7467532467532467


In [20]:

# Confusion matrix
conf_matrix = confusion_matrix(y_test, dt_pred)
print("Confusion Matrix:\n", conf_matrix)
report = classification_report(y_test, dt_pred)
print("Classification Report:\n", report)

Confusion Matrix:
 [[75 24]
 [15 40]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79        99
           1       0.62      0.73      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [21]:
# Testing on a random sample from test data
random_sample = X_test.sample(1, random_state=42)
random_sample_prediction = dt.predict(random_sample)
print("Random sample prediction (1 means diabetic, 0 means not):", random_sample_prediction)

Random sample prediction (1 means diabetic, 0 means not): [0]
