# In [None]:
# Step 1: Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

# Step 2: Load the Dataset
# Make sure the file "Dentistry.csv" is in the same directory or update the path accordingly
data = pd.read_csv("Dentistry.csv")

# Step 3: Data Preprocessing
# Drop identifier columns if present; errors='ignore' tolerates files that lack them.
data = data.drop(columns=['SampleID', 'SL No.'], errors='ignore')

# Rows without a target label cannot be used for supervised training. Remove them
# before encoding so that a missing 'Gender' value never reaches the label encoder
# (the numeric-only mean fill below would not touch a non-numeric target column).
data.dropna(subset=['Gender'], inplace=True)

# Handle missing values: fill gaps in numeric columns with that column's mean.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split, so test rows contribute to the imputed values.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Encode target variable. LabelEncoder assigns integer codes in sorted label order,
# so the mapping is Female = 0, Male = 1 only if those are the labels in the file
# (TODO confirm). The fitted encoder is kept so the mapping can be checked via
# gender_encoder.classes_ and predictions can be decoded with inverse_transform.
gender_encoder = LabelEncoder()
data['Gender'] = gender_encoder.fit_transform(data['Gender'])

# Split features and target
X = data.drop('Gender', axis=1)
y = data['Gender']

# Normalize feature data.
# Normalizer rescales each sample (row) to unit norm; it does not scale each
# feature (column) to a common range.
# NOTE(review): confirm row-wise normalization is intended rather than a
# per-feature scaler.
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)

# Step 4: Exploratory Data Analysis (EDA)
# Pairwise correlations between the normalized features, drawn as an annotated heatmap.
normalized_frame = pd.DataFrame(X_normalized, columns=X.columns)
feature_correlations = normalized_frame.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(feature_correlations, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

# Step 5: Feature Selection (optional based on heatmap)
# (Example: drop highly correlated features manually if needed)

# Step 6: Train-Test Split
# Hold out 20% of the samples for testing. stratify=y keeps the class proportions
# of the target the same in both partitions, so the reported metrics do not depend
# on how the shuffle happens to distribute the classes; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: Model Building and Evaluation
def evaluate_model(model, X_test, y_test):
    """Print test-set metrics for a fitted classifier and show its ROC curve.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels corresponding to ``X_test``.

    Prints the confusion matrix, the classification report and the ROC AUC
    score, then displays the ROC curve with matplotlib. Returns nothing.
    """
    predicted_labels = model.predict(X_test)
    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted_labels))
    print("Classification Report:\n", classification_report(y_test, predicted_labels))

    # Column 1 of predict_proba is used as the score for the ROC analysis.
    class_probabilities = model.predict_proba(X_test)
    positive_scores = class_probabilities[:, 1]

    false_positive_rate, true_positive_rate = roc_curve(y_test, positive_scores)[:2]
    area_under_curve = roc_auc_score(y_test, positive_scores)
    print("ROC AUC Score:", area_under_curve)

    # Draw on the current axes: the model's curve plus the dashed chance diagonal.
    axes = plt.gca()
    axes.plot(
        false_positive_rate,
        true_positive_rate,
        label=f'AUC = {area_under_curve:.2f}',
    )
    axes.plot([0, 1], [0, 1], 'k--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('ROC Curve')
    axes.legend()
    plt.show()

# Candidate classifiers. The randomized learners are seeded with the same
# random_state as the train/test split so that repeated runs report the same
# metrics; previously only the split was seeded.
lr_model = LogisticRegression()
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases; it is
# kept unchanged here — confirm against the installed version before removing it.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit every model on the training partition and report its test-set metrics.
# The module-level *_model names above stay available after the loop.
for model_title, classifier in [
    ("Logistic Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    print(f"--- {model_title} ---")
    classifier.fit(X_train, y_train)
    evaluate_model(classifier, X_test, y_test)
