Q1. Import the dataset and examine the variables. Use descriptive statistics and visualizations to
understand the distribution and relationships between the variables. The dataset provided is a diabetes dataset.



In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the diabetes dataset (replace 'your_dataset.csv' with the actual file path)
diabetes_data = pd.read_csv('your_dataset.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
diabetes_data.info()

# Display descriptive statistics
print(diabetes_data.describe())

# Visualize the distribution of variables using histograms
diabetes_data.hist(figsize=(10, 10))
plt.show()

# Visualize relationships between variables using pair plots
sns.pairplot(diabetes_data)
plt.show()

# Visualize correlations between variables using a heatmap.
# Only numeric columns are correlated: pandas >= 2.0 raises on DataFrame.corr()
# when the frame still contains non-numeric (e.g. string/categorical) columns.
correlation_matrix = diabetes_data.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


Q2. Preprocess the data by cleaning missing values, removing outliers, and transforming categorical
variables into dummy variables if necessary.



In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the diabetes dataset (replace 'your_dataset.csv' with the actual file path).
# Re-reading the raw file here keeps this cell idempotent: re-running it always
# starts from the unfiltered data instead of filtering an already-filtered frame.
diabetes_data = pd.read_csv('your_dataset.csv')

# Column inspected for outliers (replace 'feature_name' with the actual column name)
outlier_feature = 'feature_name'

# Categorical columns to one-hot encode, e.g. 'gender'.
# Names that are not present in the dataset are skipped (see below).
categorical_columns = ['gender']

# Display missing values
print("Missing values before preprocessing:")
print(diabetes_data.isnull().sum())

# Handle missing values: fill each numeric column with its own mean.
# numeric_only=True is required because pandas >= 2.0 raises when mean() meets
# non-numeric columns. Missing values in non-numeric columns are left untouched
# and will still show up in the "after preprocessing" report.
diabetes_data = diabetes_data.fillna(diabetes_data.mean(numeric_only=True))

# Display outliers using a box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x=diabetes_data[outlier_feature])
plt.title('Box Plot for Outliers')
plt.show()

# Remove outliers using the IQR (Interquartile Range) method:
# keep rows whose value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
Q1 = diabetes_data[outlier_feature].quantile(0.25)
Q3 = diabetes_data[outlier_feature].quantile(0.75)
IQR = Q3 - Q1
within_fences = diabetes_data[outlier_feature].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
diabetes_data = diabetes_data[within_fences]

# Display missing values after preprocessing
print("Missing values after preprocessing:")
print(diabetes_data.isnull().sum())

# Transform categorical variables into dummy variables (if necessary).
# Only columns that actually exist are encoded, so a dataset without e.g. 'gender'
# passes through unchanged instead of raising a KeyError.
present_categoricals = [col for col in categorical_columns if col in diabetes_data.columns]
if present_categoricals:
    diabetes_data = pd.get_dummies(diabetes_data, columns=present_categoricals, drop_first=True)

# Display the preprocessed dataset
print("Preprocessed dataset:")
print(diabetes_data.head())


Q3. Split the dataset into a training set and a test set. Use a random seed to ensure reproducibility.


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split

# Assuming 'diabetes_data' is your preprocessed DataFrame with features and target variable
# Replace 'target_column' with the actual column name of your target variable
X = diabetes_data.drop(columns='target_column')
y = diabetes_data['target_column']

# Set a random seed for reproducibility
random_seed = 42

# Split the dataset into training and test sets (80% training, 20% test).
# stratify=y keeps the class proportions of the target the same in both subsets,
# so the test metrics are not distorted by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
    stratify=y,
)

# Display the shapes of the training and test sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Q4. Use a decision tree algorithm, such as ID3 or C4.5, to train a decision tree model on the training set. Use
cross-validation to optimize the hyperparameters and avoid overfitting.



In [None]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Create a decision tree classifier (seeded with the same value used for the split)
dt_classifier = DecisionTreeClassifier(random_state=random_seed)

# Define the hyperparameter grid for optimization.
# max_depth / min_samples_split / min_samples_leaf limit tree growth, which is
# the main lever against overfitting for a single decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for hyperparameter optimization with 5-fold cross-validation
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# GridSearchCV refits the best parameter combination on the entire training set
# by default (refit=True), so the tuned model is already available; building and
# fitting a second, identical tree by hand is unnecessary.
best_dt_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_dt_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results. The mean cross-validated accuracy is shown next to the
# test accuracy: a large gap between the two is a sign of overfitting.
print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)
print("Accuracy on Test Set:", accuracy)
print("Classification Report:\n", classification_rep)


Q5. Evaluate the performance of the decision tree model on the test set using metrics such as accuracy,
precision, recall, and F1 score. Use confusion matrices and ROC curves to visualize the results.

In [None]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

# Class predictions of the tuned tree for the held-out test set
y_pred = best_dt_model.predict(X_test)

# Compute the four headline metrics in one pass over the scoring functions
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric under its label
metric_labels = ("Accuracy:", "Precision:", "Recall:", "F1 Score:")
for label, value in zip(metric_labels, (accuracy, precision, recall, f1)):
    print(label, value)

# Confusion matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)

# Draw the confusion matrix as an annotated heatmap
cm_fig, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# ROC curve, built from the predicted probability of the second class
# (column index 1 of predict_proba), and the area under it
y_proba = best_dt_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
# Diagonal reference line: the ROC of a classifier that guesses at random
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_title('ROC Curve')
roc_ax.set_xlabel('False Positive Rate (FPR)')
roc_ax.set_ylabel('True Positive Rate (TPR)')
roc_ax.legend()
plt.show()


Q6. Interpret the decision tree by examining the splits, branches, and leaves. Identify the most important
variables and their thresholds. Use domain knowledge and common sense to explain the patterns and
trends.

In [None]:
# Import necessary libraries
from sklearn.tree import plot_tree

# Visualize the decision tree.
# feature_names is passed as a plain list: some scikit-learn releases (1.2.x)
# validate this parameter strictly and reject a pandas Index such as X.columns.
# NOTE(review): class_names assumes a binary target whose sorted classes map to
# "Negative" then "Positive" — confirm against best_dt_model.classes_.
plt.figure(figsize=(15, 10))
plot_tree(
    best_dt_model,
    feature_names=list(X.columns),
    class_names=["Negative", "Positive"],
    filled=True,
    rounded=True,
)
plt.show()

# Rank the variables by their importance in the fitted tree (importances sum to 1;
# a value of 0 means the feature is never used in a split).
feature_importances = pd.Series(
    best_dt_model.feature_importances_, index=X.columns
).sort_values(ascending=False)
print("Feature importances:")
print(feature_importances)


Q7. Validate the decision tree model by applying it to new data or testing its robustness to changes in the
dataset or the environment. Use sensitivity analysis and scenario testing to explore the uncertainty and
risks.

In [None]:
# Assuming 'new_data' is your new dataset
new_data = pd.read_csv('new_data.csv')  # Replace with the actual file path
X_new = new_data.drop('target_column', axis=1)
y_new = new_data['target_column']

# Bring the new features into the exact layout the model was trained on.
# 1) One-hot encode any categorical columns, as was done for the training data.
# 2) Reindex to the training columns: extra columns (e.g. the dummy level that
#    drop_first removed during training) are discarded, and training columns
#    absent from the new data are created and filled with 0.
# NOTE(review): filling with 0 is only correct for dummy columns whose category
# does not occur in the new data — any column reported below that is a genuine
# numeric feature means the new file is missing data and should be fixed.
X_new = pd.get_dummies(X_new)
missing_columns = X.columns.difference(X_new.columns)
if len(missing_columns) > 0:
    print("Columns missing from new data (filled with 0):", list(missing_columns))
X_new = X_new.reindex(columns=X.columns, fill_value=0)

# Predict on new data
y_pred_new = best_dt_model.predict(X_new)

# Evaluate the model on new data
accuracy_new = accuracy_score(y_new, y_pred_new)
precision_new = precision_score(y_new, y_pred_new)
recall_new = recall_score(y_new, y_pred_new)
f1_new = f1_score(y_new, y_pred_new)

# Display metrics for new data
print("Validation on New Data:")
print("Accuracy:", accuracy_new)
print("Precision:", precision_new)
print("Recall:", recall_new)
print("F1 Score:", f1_new)


In [None]:
# Example sensitivity analysis for a specific feature ('feature_name')
feature_to_test = 'feature_name'
original_values = X_test[feature_to_test].copy()

# Perturb the feature and observe the impact on predictions.
# Each scenario is evaluated on a fresh copy produced by DataFrame.assign, so the
# shared X_test is never modified: re-running this cell, or any earlier
# evaluation cell afterwards, still sees the untouched test set.
# A factor of 1.0 reproduces the unperturbed baseline accuracy.
for perturbation in [0.1, 0.5, 1.0]:
    X_test_perturbed = X_test.assign(**{feature_to_test: original_values * perturbation})
    y_pred_perturbed = best_dt_model.predict(X_test_perturbed)
    accuracy_perturbed = accuracy_score(y_test, y_pred_perturbed)
    print(f"Perturbation factor: {perturbation}, Accuracy: {accuracy_perturbed}")
