In [9]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = r"C:\Users\ganes\Downloads\MLDaata.csv"
data = pd.read_csv(file_path)

# Handle missing values (if any)
data = data.dropna()

# Inspect column names
print(data.columns)

# Convert categorical variables into numerical ones
# Update the column names based on the actual column names in your dataset
data = pd.get_dummies(data, columns=['Gender', 'Smoking_Status', 'Alcohol_Consumption', 'Pre-existing_Conditions'])

# Define features and target
X = data.drop(columns=['Cholesterol_Level', 'Blood_Pressure', 'Heart_Rate', 'Has_disease'])
y = data['Has_disease']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Measure training time
start_time = time.time()
clf.fit(X_train, y_train)
training_time = time.time() - start_time

# Measure prediction time
start_time = time.time()
y_pred = clf.predict(X_test)
prediction_time = time.time() - start_time

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)

# Output results
results = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'training_time': training_time,
    'prediction_time': prediction_time,
    'confusion_matrix': conf_matrix,
    'classification_report': class_report
}

# Print the results
for key, value in results.items():
    print(f"{key}: {value}")

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Disease', 'Has Disease'], yticklabels=['No Disease', 'Has Disease'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Plot Feature Importance
feature_importance = clf.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance')
plt.show()


Index(['id', 'first_name', 'last_name', 'Gender', 'Blood_Pressure',
       'Cholesterol_Level', 'Blood_Sugar Level', 'Heart_Rate ', 'Has_Disease',
       'Length_of_Stay', 'BMI', 'Smoking_Status', 'Alcohol_Consumption',
       'Pre-existing_Conditions'],
      dtype='object')


KeyError: "['Has_disease'] not found in axis"