# 1.Data Preparation:

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the dataset from a CSV file
df = pd.read_excel(r"C:\Users\sai\OneDrive\Documents\heart_disease.xlsx")

# View the first few rows of the dataset
print(df.head())

# Check the shape of the dataset (number of rows and columns)
print(df.shape)

# Check for missing values in the dataset
print(df.isnull().sum())

# Get information about the dataset (data types, counts, means, etc.)
print(df.info())
print(df.describe())

        age                                       Age in years
0    Gender                       Gender ; Male - 1, Female -0
1        cp                                    Chest pain type
2  trestbps                             Resting blood pressure
3      chol                                cholesterol measure
4       fbs  (fasting blood sugar > 120 mg/dl) (1 = true; 0...
(12, 2)
age             0
Age in years    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   age           12 non-null     object
 1   Age in years  12 non-null     object
dtypes: object(2)
memory usage: 324.0+ bytes
None
           age                  Age in years
count       12                            12
unique      12                            12
top     Gender  Gender ; Male - 1, Female -0
freq         1                             1


# 2.Exploratory Data Analysis (EDA)

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics
print(df.describe())

# Check if there are any numerical columns
numerical_columns = df.select_dtypes(include=['number']).columns

if len(numerical_columns) > 0:
    # Histograms for numerical features
    df[numerical_columns].hist(bins=20, figsize=(10,10))
    plt.show()

    # Box plots to visualize outliers
    plt.figure(figsize=(12, 8))
    sns.boxplot(data=df[numerical_columns])
    plt.xticks(rotation=90)
    plt.show()

    # Correlation matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(df[numerical_columns].corr(), annot=True, cmap='coolwarm')
    plt.show()
else:
    print("No numerical columns found in the DataFrame.")


           age                  Age in years
count       12                            12
unique      12                            12
top     Gender  Gender ; Male - 1, Female -0
freq         1                             1
No numerical columns found in the DataFrame.


# Task 3: Feature Engineering

In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Display categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)

# Handle missing values in categorical columns by filling with the mode
for column in categorical_columns:
    mode_value = df[column].mode()[0]
    df[column].fillna(mode_value, inplace=True)

# Apply One-Hot Encoding to categorical columns
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)  # Updated parameter name
df_encoded = pd.DataFrame(one_hot_encoder.fit_transform(df[categorical_columns]), columns=one_hot_encoder.get_feature_names_out(categorical_columns))

# Drop original categorical columns
df = df.drop(columns=categorical_columns)

# Concatenate the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Check for missing values and handle them (e.g., impute with mean/median)
df.fillna(df.median(), inplace=True)

print("Dataframe after encoding and missing value handling:")
print(df.head())


Categorical columns: Index(['age', 'Age in years'], dtype='object')
Dataframe after encoding and missing value handling:
   age_chol  age_cp  age_exang  age_fbs  age_num  age_oldpeak  age_restecg  \
0       0.0     0.0        0.0      0.0      0.0          0.0          0.0   
1       0.0     1.0        0.0      0.0      0.0          0.0          0.0   
2       0.0     0.0        0.0      0.0      0.0          0.0          0.0   
3       1.0     0.0        0.0      0.0      0.0          0.0          0.0   
4       0.0     0.0        0.0      1.0      0.0          0.0          0.0   

   age_slope  age_thal  age_thalch  ...  \
0        0.0       0.0         0.0  ...   
1        0.0       0.0         0.0  ...   
2        0.0       0.0         0.0  ...   
3        0.0       0.0         0.0  ...   
4        0.0       0.0         0.0  ...   

   Age in years_Gender ; Male - 1, Female -0  \
0                                        1.0   
1                                        0.0   
2      

# Task 4: Decision Tree Classification


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = r"C:\Users\sai\OneDrive\Documents\heart_disease.xlsx"
df = pd.read_excel(file_path)

# Display the first few rows and columns of the dataset
print("First few rows of the dataset:")
print(df.head())
print("Columns in DataFrame:", df.columns)

# Define the target column (replace with the actual target column name)
target_column = df.columns[-1]  # Assuming the last column is the target column

# Split the dataset into features (X) and target (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Convert categorical data into numerical data
for column in X.columns:
    if X[column].dtype == 'object':
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])

# Split the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Decision Tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# ROC-AUC score requires probability estimates
try:
    if len(y.unique()) == 2:  # Binary classification
        roc_auc = roc_auc_score(y_test, y_score[:, 1])
    else:  # Multiclass classification
        roc_auc = roc_auc_score(y_test, y_score, multi_class='ovr')
except ValueError as e:
    print(f"Error calculating ROC-AUC: {e}")
    roc_auc = None

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
if roc_auc is not None:
    print(f"ROC-AUC: {roc_auc:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


First few rows of the dataset:
        age                                       Age in years
0    Gender                       Gender ; Male - 1, Female -0
1        cp                                    Chest pain type
2  trestbps                             Resting blood pressure
3      chol                                cholesterol measure
4       fbs  (fasting blood sugar > 120 mg/dl) (1 = true; 0...
Columns in DataFrame: Index(['age', 'Age in years'], dtype='object')
Error calculating ROC-AUC: Number of classes in y_true not equal to the number of columns in 'y_score'
Accuracy: 0.00
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
Confusion Matrix:
[[0 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 1 0 0]]
Classification Report:
                                                                                                                                                                                                                                    

# Task 5: Hyperparameter Tuning

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the dataset
file_path = r"C:\Users\sai\OneDrive\Documents\heart_disease.xlsx"
df = pd.read_excel(file_path)

# Display the first few rows and columns of the dataset
print("First few rows of the dataset:")
print(df.head())
print("Columns in DataFrame:", df.columns)

# Define the target column (replace with the actual target column name)
target_column = df.columns[-1]  # Assuming the last column is the target column

# Split the dataset into features (X) and target (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Convert categorical data into numerical data
for column in X.columns:
    if X[column].dtype == 'object':
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])

# Split the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Initialize the Decision Tree model
model = DecisionTreeClassifier(random_state=42)

# Use ShuffleSplit to handle small class sizes
shuffle_split = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Set up GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=shuffle_split, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the model with GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters found:")
print(best_params)

# Make predictions with the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# ROC-AUC score requires probability estimates
y_score = best_model.predict_proba(X_test)

# Check the number of unique classes in y
n_classes = len(np.unique(y))

if n_classes == 2:  # Binary classification
    roc_auc = roc_auc_score(y_test, y_score[:, 1])
else:  # Multiclass classification
    # Ensure that y_score contains probabilities for all classes
    if y_score.shape[1] == n_classes:
        roc_auc = roc_auc_score(y_test, y_score, multi_class='ovr')
    else:
        roc_auc = "ROC-AUC cannot be computed due to a mismatch in class probabilities."

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
if isinstance(roc_auc, str):
    print(f"ROC-AUC: {roc_auc}")
else:
    print(f"ROC-AUC: {roc_auc:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


First few rows of the dataset:
        age                                       Age in years
0    Gender                       Gender ; Male - 1, Female -0
1        cp                                    Chest pain type
2  trestbps                             Resting blood pressure
3      chol                                cholesterol measure
4       fbs  (fasting blood sugar > 120 mg/dl) (1 = true; 0...
Columns in DataFrame: Index(['age', 'Age in years'], dtype='object')
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best parameters found:
{'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 0.00
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
ROC-AUC: ROC-AUC cannot be computed due to a mismatch in class probabilities.
Confusion Matrix:
[[0 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 1 0 0]]
Classification Report:
                                                        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Task 6: Model Evaluation and Analysis


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Load the dataset
file_path = r"C:\Users\sai\OneDrive\Documents\heart_disease.xlsx"
df = pd.read_excel(file_path)

# Print column names and first few rows to verify
print(df.columns)
print(df.head())

# Update this with the actual target column name from the dataset
target_column = 'target'  # Replace with the correct column name

# Check if the target column exists
if target_column not in df.columns:
    raise KeyError(f"'{target_column}' not found in DataFrame columns.")

# Assume that the target variable is 'target'
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Decision Tree Classifier model
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

# Evaluate the model's performance
y_pred = dtc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))

# Visualize the decision tree structure
plt.figure(figsize=(10, 10))
plot_tree(dtc, feature_names=X.columns, class_names=[str(cls) for cls in y.unique()], filled=True)
plt.show()


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\sai\\OneDrive\\Documents\\heart_disease.xlsx'

In [None]:
Interview Questions

What are some common hyperparameters of decision tree models, and how do they affect the model's performance?
Common hyperparameters of decision tree models include:

max_depth: The maximum depth of the tree. Increasing this value can lead to overfitting, while decreasing it can lead to underfitting.
min_samples_split: The minimum number of samples required to split an internal node. Increasing this value can lead to underfitting, while decreasing it can lead to overfitting.
criterion: The criterion used to measure the quality of a split. Common criteria include Gini impurity and entropy.
These hyperparameters affect the model's performance by controlling the complexity of the tree and the quality of the splits.

What is the difference between Label encoding and One-hot encoding?
Label encoding and one-hot encoding are two common techniques used to encode categorical variables in machine learning.

Label encoding assigns a unique integer value to each category in a categorical variable. For example, if we have a categorical variable with three categories - 'A', 'B', and 'C' - label