In [None]:
You are a data scientist working for a healthcare company, and you have been tasked with creating a
decision tree to help identify patients with diabetes based on a set of clinical variables. You have been
given a dataset (diabetes.csv) with the following variables:
1. Pregnancies: Number of times pregnant (integer)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Display the first few rows of the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Visualize the distribution of the target variable
sns.countplot(x='Outcome', data=data)
plt.title('Distribution of Diabetes Cases')
plt.show()

# Pairplot to see relationships between features
sns.pairplot(data, hue='Outcome')
plt.show()

# Define features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split BEFORE scaling so that the held-out rows never contribute to the
# scaler's mean/variance (fitting the scaler on all rows leaks test data).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# The tree was fit on standardized inputs, so the split thresholds drawn below
# are in standardized units, not the original clinical units.
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12,8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()

In [None]:
2. Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test (integer)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Summary statistics for the dataset
print(data.describe())

# Visualize the distribution of the Glucose variable
sns.histplot(data['Glucose'], bins=30, kde=True)
plt.title('Distribution of Plasma Glucose Concentration')
plt.xlabel('Glucose Level')
plt.ylabel('Frequency')
plt.show()

# Check for missing values in the Glucose column.
# NOTE(review): isnull() only counts NaN; if the CSV encodes a missing
# measurement as 0 (TODO confirm), those rows are not imputed below.
print(data['Glucose'].isnull().sum())

# Impute missing values (if any) with the mean. The result is assigned back to
# the column: fillna(..., inplace=True) on a `data[...]` selection is chained
# assignment and does not update `data` under pandas Copy-on-Write.
data['Glucose'] = data['Glucose'].fillna(data['Glucose'].mean())

# Define features and target variable (including Glucose)
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Thresholds in the plot are in standardized units (the tree saw scaled data).
# feature_names is a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12,8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()



In [None]:
3. BloodPressure: Diastolic blood pressure (mm Hg) (integer)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Check the summary statistics for BloodPressure
print(data['BloodPressure'].describe())

# Visualize the distribution of the BloodPressure variable
sns.histplot(data['BloodPressure'], bins=30, kde=True)
plt.title('Distribution of Diastolic Blood Pressure')
plt.xlabel('Blood Pressure (mm Hg)')
plt.ylabel('Frequency')
plt.show()

# Check for missing values in the BloodPressure column.
# NOTE(review): isnull() only counts NaN; if the CSV encodes a missing
# measurement as 0 (TODO confirm), those rows are not imputed below.
print(data['BloodPressure'].isnull().sum())

# Impute missing values (if any) with the median. The result is assigned back
# to the column: fillna(..., inplace=True) on a `data[...]` selection is
# chained assignment and does not update `data` under pandas Copy-on-Write.
data['BloodPressure'] = data['BloodPressure'].fillna(data['BloodPressure'].median())

# Define features and target variable (including BloodPressure)
X = data.drop('Outcome', axis=1)  # Ensure all relevant features are included
y = data['Outcome']

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Thresholds in the plot are in standardized units (the tree saw scaled data).
# feature_names is a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()



In [None]:
4. SkinThickness: Triceps skin fold thickness (mm) (integer)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Check the summary statistics for SkinThickness
print(data['SkinThickness'].describe())

# Visualize the distribution of the SkinThickness variable
sns.histplot(data['SkinThickness'], bins=30, kde=True)
plt.title('Distribution of Triceps Skin Fold Thickness')
plt.xlabel('Skin Thickness (mm)')
plt.ylabel('Frequency')
plt.show()

# Check for missing values in the SkinThickness column.
# NOTE(review): isnull() only counts NaN; if the CSV encodes a missing
# measurement as 0 (TODO confirm), those rows are not imputed below.
print(data['SkinThickness'].isnull().sum())

# Impute missing values (if any) with the median. The result is assigned back
# to the column: fillna(..., inplace=True) on a `data[...]` selection is
# chained assignment and does not update `data` under pandas Copy-on-Write.
data['SkinThickness'] = data['SkinThickness'].fillna(data['SkinThickness'].median())

# Define features and target variable (including SkinThickness)
X = data.drop('Outcome', axis=1)  # Ensure all relevant features are included
y = data['Outcome']

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Thresholds in the plot are in standardized units (the tree saw scaled data).
# feature_names is a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()



In [None]:
5. Insulin: 2-Hour serum insulin (mu U/ml) (integer)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Check the summary statistics for Insulin
print(data['Insulin'].describe())

# Visualize the distribution of the Insulin variable
sns.histplot(data['Insulin'], bins=30, kde=True)
plt.title('Distribution of 2-Hour Serum Insulin Levels')
plt.xlabel('Insulin (µU/ml)')
plt.ylabel('Frequency')
plt.show()

# Check for missing values in the Insulin column.
# NOTE(review): isnull() only counts NaN; if the CSV encodes a missing
# measurement as 0 (TODO confirm), those rows are not imputed below.
print(data['Insulin'].isnull().sum())

# Impute missing values (if any) with the median. The result is assigned back
# to the column: fillna(..., inplace=True) on a `data[...]` selection is
# chained assignment and does not update `data` under pandas Copy-on-Write.
data['Insulin'] = data['Insulin'].fillna(data['Insulin'].median())

# Define features and target variable (including Insulin)
X = data.drop('Outcome', axis=1)  # Ensure all relevant features are included
y = data['Outcome']

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Thresholds in the plot are in standardized units (the tree saw scaled data).
# feature_names is a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()


In [None]:
6. BMI: Body mass index (weight in kg/(height in m)^2) (float)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Check the summary statistics for BMI
print(data['BMI'].describe())

# Visualize the distribution of the BMI variable
sns.histplot(data['BMI'], bins=30, kde=True)
plt.title('Distribution of Body Mass Index (BMI)')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.show()

# Check for missing values in the BMI column.
# NOTE(review): isnull() only counts NaN; if the CSV encodes a missing
# measurement as 0 (TODO confirm), those rows are not imputed below.
print(data['BMI'].isnull().sum())

# Impute missing values (if any) with the median. The result is assigned back
# to the column: fillna(..., inplace=True) on a `data[...]` selection is
# chained assignment and does not update `data` under pandas Copy-on-Write.
data['BMI'] = data['BMI'].fillna(data['BMI'].median())

# Define features and target variable (including BMI)
X = data.drop('Outcome', axis=1)  # Ensure all relevant features are included
y = data['Outcome']

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Thresholds in the plot are in standardized units (the tree saw scaled data).
# feature_names is a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()

In [None]:
7. DiabetesPedigreeFunction: Diabetes pedigree function (a function which scores likelihood of diabetes
based on family history) (float)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Check the summary statistics for Diabetes Pedigree Function
print(data['DiabetesPedigreeFunction'].describe())

# Visualize the distribution of the Diabetes Pedigree Function variable
sns.histplot(data['DiabetesPedigreeFunction'], bins=30, kde=True)
plt.title('Distribution of Diabetes Pedigree Function')
plt.xlabel('Diabetes Pedigree Function Score')
plt.ylabel('Frequency')
plt.show()

# Check for missing values in the DiabetesPedigreeFunction column
print(data['DiabetesPedigreeFunction'].isnull().sum())

# Impute missing values (if any) with the mean. The result is assigned back to
# the column: fillna(..., inplace=True) on a `data[...]` selection is chained
# assignment and does not update `data` under pandas Copy-on-Write.
data['DiabetesPedigreeFunction'] = data['DiabetesPedigreeFunction'].fillna(
    data['DiabetesPedigreeFunction'].mean()
)

# Define features and target variable (including Diabetes Pedigree Function)
X = data.drop('Outcome', axis=1)  # Ensure all relevant features are included
y = data['Outcome']

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Thresholds in the plot are in standardized units (the tree saw scaled data).
# feature_names is a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()



In [None]:
8. Age: Age in years (integer)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Check the summary statistics for Age
print(data['Age'].describe())

# Visualize the distribution of the Age variable
sns.histplot(data['Age'], bins=30, kde=True)
plt.title('Distribution of Age')
plt.xlabel('Age (years)')
plt.ylabel('Frequency')
plt.show()

# Check for missing values in the Age column
print(data['Age'].isnull().sum())

# Impute missing values (if any) with the median. The result is assigned back
# to the column: fillna(..., inplace=True) on a `data[...]` selection is
# chained assignment and does not update `data` under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())

# Define features and target variable (including Age)
X = data.drop('Outcome', axis=1)  # Ensure all relevant features are included
y = data['Outcome']

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Optional) Scale the features: statistics come from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name `X_scaled` is still defined after this cell runs.
X_scaled = scaler.transform(X)

# Initialize the decision tree classifier and fit it to the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Thresholds in the plot are in standardized units (the tree saw scaled data).
# feature_names is a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier, filled=True, feature_names=list(X.columns), class_names=['No Diabetes', 'Diabetes'])
plt.title('Decision Tree for Diabetes Classification')
plt.show()



In [None]:
9. Outcome: Class variable (0 if non-diabetic, 1 if diabetic) (integer)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Class balance of the target, first as counts and then as a bar chart
print(data['Outcome'].value_counts())

outcome_ax = sns.countplot(x='Outcome', data=data)
outcome_ax.set_title('Distribution of Diabetes Outcomes')
outcome_ax.set_xlabel('Outcome (0: Non-diabetic, 1: Diabetic)')
outcome_ax.set_ylabel('Count')
plt.show()

# Target is the Outcome column; every other column is a predictor
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train an unconstrained tree on the unscaled features and score the hold-out rows
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)

# Report the confusion matrix followed by per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
Your goal is to create a decision tree to predict whether a patient has diabetes based on the other
variables. Here are the steps you can follow:
Q1. Import the dataset and examine the variables. Use descriptive statistics and visualizations to
understand the distribution and relationships between the variables.

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset. Fail early with the working directory in the message so a
# missing file is easy to diagnose.
DATA_PATH = Path('diabetes.csv')
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"{DATA_PATH} not found (working directory: {Path.cwd()}); "
        "place the CSV next to the notebook or update DATA_PATH."
    )
data = pd.read_csv(DATA_PATH)

# Display the first few rows of the dataset
print(data.head())
# Get information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
data.info()

# Check for missing values
print(data.isnull().sum())
# Get summary statistics for numerical features
print(data.describe())
# Set the style for the plots
sns.set(style="whitegrid")

# Plot distributions of features.
# Select the predictors by name rather than by position so the cell does not
# depend on 'Outcome' being the last column.
features = data.columns.drop('Outcome')

# Size the subplot grid from the number of features instead of assuming 3x3
n_cols = 3
n_rows = -(-len(features) // n_cols)  # ceiling division

plt.figure(figsize=(15, 12))
for i, feature in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(data[feature], bins=30, kde=True)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

# Visualize the relationship between each feature and the outcome
plt.figure(figsize=(15, 12))
for i, feature in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(x='Outcome', y=feature, data=data)
    plt.title(f'{feature} vs Outcome')
plt.tight_layout()
plt.show()
# Compute the correlation matrix
correlation_matrix = data.corr()

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Draw the heatmap
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})
plt.title('Correlation Matrix')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

In [None]:
Q2. Preprocess the data by cleaning missing values, removing outliers, and transforming categorical
variables into dummy variables if necessary.

In [None]:
# Check for missing values
print(data.isnull().sum())

# Impute missing values with the median for numerical features.
# numeric_only=True keeps the median well-defined if a non-numeric column is
# ever present; assigning the result avoids in-place mutation of the frame.
data = data.fillna(data.median(numeric_only=True))
# Function to remove outliers based on IQR
def remove_outliers(df, columns=None, factor=1.5):
    """Drop rows that fall outside the IQR fences of any selected column.

    For each selected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``. All fences are computed once, from the frame as
    passed in, and a row is kept only if it lies within the fences of every
    selected column. Computing the fences up front makes the result
    independent of column order (filtering column by column would derive later
    columns' quartiles from data already trimmed by earlier ones).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of str, optional
        Columns to screen. Defaults to every numeric column of ``df``.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The retained rows with their original index labels. Rows holding NaN
        in a screened column are dropped, because the comparisons are False.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    elif isinstance(columns, str):
        columns = [columns]
    columns = list(columns)
    if not columns:
        # Nothing to screen: hand the frame back unchanged
        return df

    screened = df[columns]
    q1 = screened.quantile(0.25)
    q3 = screened.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Column-aligned comparison, then require every column to be in range
    in_range = screened.ge(lower_bound, axis=1) & screened.le(upper_bound, axis=1)
    return df[in_range.all(axis=1)]

from sklearn.preprocessing import StandardScaler

# Remove outliers from the dataset.
# Only the predictor columns are screened: running the IQR rule on the binary
# 'Outcome' label would delete every positive case whenever fewer than a
# quarter of the rows are positive (Q1 == Q3 == 0 gives fences of [0, 0]).
# NOTE: `data` is overwritten, so re-running this cell trims the frame again.
feature_columns = data.columns.drop('Outcome')
kept_index = remove_outliers(data[feature_columns]).index
data = data[data.index.isin(kept_index)]

# Check the shape of the data after outlier removal
print(data.shape)
# Example: If you had a categorical variable 'Gender'
# data = pd.get_dummies(data, columns=['Gender'], drop_first=True)

# Since our dataset doesn't contain categorical variables, this step can be skipped.

# Scale the features
# NOTE(review): the later cells visible in this notebook split and train on
# the unscaled `X`, so `X_scaled` appears unused; fit the scaler on the
# training split only if it is ever used for modelling.
scaler = StandardScaler()
X = data.drop('Outcome', axis=1)  # Features
y = data['Outcome']  # Target variable

X_scaled = scaler.fit_transform(X)


In [None]:
Q3. Split the dataset into a training set and a test set. Use a random seed to ensure reproducibility.

In [None]:
from sklearn.model_selection import train_test_split

# Target is the Outcome column; the remaining columns are the predictors
y = data['Outcome']
X = data.drop(columns='Outcome')

# Reserve 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows/columns landed in each partition
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")


In [None]:
Q4. Use a decision tree algorithm, such as ID3 or C4.5, to train a decision tree model on the training set. Use
cross-validation to optimize the hyperparameters and avoid overfitting.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Define the parameter grid for hyperparameter tuning.
# scikit-learn's trees are CART; searching over 'entropy' as well as 'gini'
# covers the information-gain criterion that ID3/C4.5 are built on.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Initialize the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=dt_classifier,
                           param_grid=param_grid,
                           scoring='accuracy',  # You can choose other metrics as needed
                           cv=5,                # 5-fold cross-validation
                           n_jobs=-1,          # Use all available cores
                           refit=True,         # Retrain the best setting on all of X_train
                           verbose=1)          # Show progress

# Fit the model to the training data
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validated score: {best_score:.4f}")

# With refit=True the search has already trained a classifier with the best
# parameters (and the same random_state) on the full training set, so reuse it
# instead of building and fitting a second, identical model.
best_dt_classifier = grid_search.best_estimator_


In [None]:
Q5. Evaluate the performance of the decision tree model on the test set using metrics such as accuracy,
precision, recall, and F1 score. Use confusion matrices and ROC curves to visualize the results.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

# Score the tuned tree on the held-out rows
y_pred = best_dt_classifier.predict(X_test)

# Headline accuracy, then the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
conf_matrix = confusion_matrix(y_test, y_pred)
class_labels = ['Non-diabetic (0)', 'Diabetic (1)']

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
plt.show()

# ROC analysis uses the predicted probability of the positive class (column 1)
y_prob = best_dt_classifier.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
# Dashed diagonal marks the chance-level classifier
roc_ax.plot([0, 1], [0, 1], color='red', linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()


In [None]:
Q6. Interpret the decision tree by examining the splits, branches, and leaves. Identify the most important
variables and their thresholds. Use domain knowledge and common sense to explain the patterns and
trends.

In [None]:
from sklearn.tree import plot_tree

# Column labels as a plain list: some scikit-learn releases reject a pandas
# Index for plot_tree's feature_names parameter.
feature_names = list(X.columns)

# Plot the decision tree
plt.figure(figsize=(12, 8))
plot_tree(best_dt_classifier, filled=True, feature_names=feature_names, class_names=['Non-diabetic (0)', 'Diabetic (1)'], rounded=True)
plt.title('Decision Tree for Diabetes Classification')
plt.show()
# Get feature importances
importances = best_dt_classifier.feature_importances_

# Create a DataFrame for visualization, most important feature first
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances for Diabetes Classification')
plt.show()


In [None]:
Q7. Validate the decision tree model by applying it to new data or testing its robustness to changes in the
dataset or the environment. Use sensitivity analysis and scenario testing to explore the uncertainty and
risks.

In [None]:
# --- Validation on new data -------------------------------------------------
# `new_data` is not created anywhere in the notebook cells shown here, so look
# it up defensively and skip this step when it has not been provided. When it
# exists it must hold the training feature columns plus an 'Outcome' column.
new_data = globals().get('new_data')
if new_data is not None:
    # Predict from the feature columns only (in training order); passing the
    # frame with its 'Outcome' column would not match what the model was fit on.
    new_data_predictions = best_dt_classifier.predict(new_data[list(X.columns)])

    # Evaluate performance metrics on new data
    new_accuracy = accuracy_score(new_data['Outcome'], new_data_predictions)
    print(f'New Data Accuracy: {new_accuracy:.4f}')
else:
    print('No `new_data` DataFrame is defined; skipping the new-data evaluation.')

# --- Robustness to measurement noise -----------------------------------------
# Seeded generator so the perturbation is reproducible. The noise is expressed
# as a fraction (10%) of each feature's own spread: a single absolute sigma
# would be negligible for wide-ranging columns and large for narrow ones.
rng = np.random.default_rng(42)
feature_spread = np.asarray(X_test).std(axis=0)
noise = rng.normal(0.0, 0.1, X_test.shape) * feature_spread  # Adjust the 0.1 fraction as needed
X_test_noisy = X_test + noise

# Make predictions on the noisy data
noisy_predictions = best_dt_classifier.predict(X_test_noisy)

# Evaluate the performance on noisy data
noisy_accuracy = accuracy_score(y_test, noisy_predictions)
print(f'Accuracy with Noise: {noisy_accuracy:.4f}')

# --- Scenario testing ---------------------------------------------------------
# Example scenario testing: Create hypothetical patients
scenarios = pd.DataFrame({
    'Pregnancies': [0, 3, 5],
    'Glucose': [70, 150, 200],
    'BloodPressure': [60, 80, 90],
    'SkinThickness': [10, 20, 30],
    'Insulin': [0, 100, 200],
    'BMI': [20.0, 30.0, 35.0],
    'DiabetesPedigreeFunction': [0.5, 1.0, 1.5],
    'Age': [25, 40, 60]
})

# Make predictions for these scenarios, with columns in the training order
scenario_predictions = best_dt_classifier.predict(scenarios[list(X.columns)])

# Print predictions for each scenario
for i, pred in enumerate(scenario_predictions):
    print(f'Scenario {i + 1}: Predicted Outcome = {pred}')
# Function to assess sensitivity of predictions
def sensitivity_analysis(feature, values, X_base=None, model=None):
    """Predict outcomes while forcing one feature to each value in turn.

    For every entry of ``values`` a copy of the baseline frame is made, the
    whole ``feature`` column is overwritten with that value, and the model's
    predictions for the modified frame are recorded.

    Parameters
    ----------
    feature : str
        Name of the column to override; must exist in the baseline frame.
    values : iterable
        Values to substitute for ``feature``, one scenario per value.
    X_base : pandas.DataFrame, optional
        Baseline rows. Defaults to the notebook-level ``X_test``.
    model : object with a ``predict`` method, optional
        Fitted estimator. Defaults to the notebook-level ``best_dt_classifier``.

    Returns
    -------
    list
        One ``model.predict`` result per entry of ``values``, in order.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of the baseline frame.
    """
    if X_base is None:
        X_base = X_test
    if model is None:
        model = best_dt_classifier

    # Fail fast on a misspelled name; plain assignment would silently add a
    # brand-new column instead of overriding an existing feature.
    if feature not in X_base.columns:
        raise KeyError(
            f"Unknown feature {feature!r}; expected one of {list(X_base.columns)}"
        )

    results = []
    for value in values:
        temp_data = X_base.copy()  # never touch the caller's frame
        temp_data[feature] = value  # Change only the specified feature
        results.append(model.predict(temp_data))
    return results

# Example: Sensitivity of predictions to changes in Glucose
# np.arange excludes its stop value, so 301 is used to cover 50..300 inclusive.
glucose_values = np.arange(50, 301, 10)
sensitivity_results = sensitivity_analysis('Glucose', glucose_values)

# Analyze results: report the share of test rows predicted diabetic at each
# glucose level rather than printing every per-row prediction array.
for value, preds in zip(glucose_values, sensitivity_results):
    print(f'Glucose = {value}: predicted diabetic share = {np.mean(preds):.1%}')
