In [None]:
#Question 1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline


# Load the dataset from its download link.
url = 'https://drive.google.com/uc?export=download&id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'
df = pd.read_csv(url)

# Preview the first rows.
print(df.head())

# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called bare: wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Number of missing values per column.
print(df.isnull().sum())

# Summary statistics of the numeric columns.
print(df.describe())

# Plot styling used by the figures below.
sns.set(style="whitegrid")

# One histogram per numeric column.
df.hist(bins=20, figsize=(20, 15))
plt.tight_layout()
plt.show()

# Correlate numeric columns only. Since pandas 2.0 a bare df.corr() raises on
# any non-numeric column instead of silently skipping it; selecting the
# numeric columns first behaves the same on old and new pandas versions.
corr_matrix = df.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Pairwise scatter plots, coloured by the 'Outcome' column.
sns.pairplot(df, hue='Outcome')
plt.show()





In [None]:
#Question 2

import pandas as pd


# Load the dataset for the outlier-removal step.
url = 'https://drive.google.com/uc?export=download&id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'
df = pd.read_csv(url)

# Print a preview of the data, then the per-column count of missing values.
for report in (df.head(), df.isnull().sum()):
    print(report)



def remove_outliers(df, column):
    """Keep only the rows whose ``column`` value lies inside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : str
        Label of the column whose values are screened.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``column`` value is within
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (both ends inclusive), with the
        original index labels preserved.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    # between() is inclusive on both ends by default, matching >= / <=.
    within_fences = values.between(
        first_quartile - 1.5 * spread,
        third_quartile + 1.5 * spread,
    )
    return df[within_fences]


# Feature columns screened for outliers.
outlier_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']

rows_before = len(df)

# The filters run one after another, so each column's IQR fences are computed
# on the rows that survived the filters of the columns before it.
for col in outlier_columns:
    df = remove_outliers(df, col)

# Report how much data the filtering discarded.
print(f'Rows removed as outliers: {rows_before - len(df)} of {rows_before}')

# Summary statistics and missing-value counts of the filtered data.
# (The summary is printed once; the original printed the same table twice.)
print(df.describe())

print(df.isnull().sum())


In [None]:
#Question 3

import pandas as pd
from sklearn.model_selection import train_test_split


# Load the dataset.
url = 'https://drive.google.com/uc?export=download&id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'
df = pd.read_csv(url)

print(df.head())

# Separate the target column ('Outcome') from the feature columns.
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified - consider stratify=y if the class
# proportions printed below differ noticeably between the two sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shapes of the four resulting pieces.
for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{label} shape: {part.shape}')

# Class proportions of the target in each set.
for heading, target in (('Training set:', y_train), ('Test set:', y_test)):
    print(heading)
    print(target.value_counts(normalize=True))


In [None]:
#Question 4


import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np


# Load the dataset.
url = 'https://drive.google.com/uc?export=download&id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'
df = pd.read_csv(url)

print(df.head())

# Target is the 'Outcome' column; every other column is a feature.
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{label} shape: {part.shape}')

# Base estimator whose hyper-parameters are tuned below.
clf = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid: 2 * 6 * 3 * 3 = 108 candidate settings.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search on the training set, scored by accuracy.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f'Best Parameters: {best_params}')

# Evaluate the selected model on the held-out test set.
y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value:.2f}')

# Confusion matrix as returned by scikit-learn (rows: actual, columns: predicted).
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)


In [None]:
#Question 5

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns


# Load the dataset.
url = 'https://drive.google.com/uc?export=download&id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'
df = pd.read_csv(url)

print(df.head())

# Target is the 'Outcome' column; every other column is a feature.
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{label} shape: {part.shape}')

# Base estimator whose hyper-parameters are tuned below.
clf = DecisionTreeClassifier(random_state=42)

param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search on the training set, scored by accuracy.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f'Best Parameters: {best_params}')

# Evaluate the selected model on the held-out test set.
y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Confusion matrix drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# ROC curve built from the predicted probability of the second class
# (column 1 of predict_proba).
y_prob = best_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
# Dashed diagonal: reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
#Question 6

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt


# Load the dataset.
url = 'https://drive.google.com/uc?export=download&id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'
df = pd.read_csv(url)

# Target is the 'Outcome' column; every other column is a feature.
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree with default hyper-parameters on the training set.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Draw the fitted tree. The cell imported plot_tree but never called it, so no
# visualization was produced. Only the top levels are drawn (max_depth limits
# the drawing, not the fitted model) because the default tree has no depth
# limit and a full plot would be unreadable.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    max_depth=3,
    feature_names=list(X.columns),
    # Class labels taken from the fitted model, in the order it stores them.
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title('Decision Tree (top levels)')
plt.show()




Interpretation of the Decision Tree
The decision tree visualization displays nodes, each representing a split on the most informative feature at that point in the tree and the threshold used for the split. The main elements to interpret are:

Root Node: This is the top node of the tree, representing the entire training set. It shows the initial split, made on the feature that best separates the classes (0 for non-diabetic, 1 for diabetic).

Internal Nodes: These nodes represent decisions based on feature thresholds. For example, a node might split the data based on whether Glucose levels are above a certain threshold.

Leaf Nodes: These are the terminal nodes where the decision tree makes predictions. Each leaf node predicts a class label (0 or 1 in our case): the majority class of the training samples that reach it.




In [None]:
#Question 7

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Load the dataset.
url = 'https://drive.google.com/uc?export=download&id=1Q4J8KS1wm4-_YTuc389enPh6O-eTNcx2'
df = pd.read_csv(url)

# Target is the 'Outcome' column; every other column is a feature.
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): despite its name, best_clf here is a tree with default
# hyper-parameters; no search is run in this cell.
best_clf = DecisionTreeClassifier(random_state=42)
best_clf.fit(X_train, y_train)

# Score the fitted tree on the held-out 20% split.
y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Validation Metrics:')
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value:.2f}')


