In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Read the survey CSV into a DataFrame.
# NOTE(review): the path lives under /kaggle/input, so this presumably only
# resolves when the notebook runs on Kaggle with that dataset attached.
file_path = '/kaggle/input/home-health-care/HHCAHPS_Provider_Jul2024.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets Jupyter render it as a table instead of wrapped plain text from print().
data.head()

In [None]:
# Preprocess the data: fill missing values in the numeric columns with each
# column's median (adjust the strategy as needed). Non-numeric columns are not
# carried over into `data_imputed`.
numeric_data = data.select_dtypes(include=[float, int])

# SimpleImputer discards columns that contain no observed values, which would
# leave the transformed array with fewer columns than the labels we attach
# below (length-mismatch error). Remove such columns up front so the shapes
# always agree.
numeric_data = numeric_data.dropna(axis=1, how='all')

imputer = SimpleImputer(strategy='median')
data_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_data),
    columns=numeric_data.columns,
    index=numeric_data.index,  # keep rows aligned with `data`
)

In [None]:
# Assuming 'target_column' is the target variable (change to actual column name)
target_column = 'HHCAHPS Survey Summary Star Rating'

# `data_imputed` had its missing ratings replaced by the median, i.e. rows with
# no reported rating carry a made-up label. Keep only rows whose rating was
# actually observed in the raw data so the model is trained and scored on real
# labels. The mask is positional, so it relies on `data_imputed` having one row
# per row of `data`, in the same order.
has_rating = data[target_column].notna().to_numpy()
labelled = data_imputed.loc[has_rating]

X = labelled.drop(columns=[target_column])  # all remaining numeric columns
y = labelled[target_column]

In [None]:
# Put every feature on zero mean / unit variance before clustering so that no
# single column dominates the distance computation.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Agglomerative (hierarchical) clustering of the standardized features using
# Ward's minimum-variance criterion; `linked` is the SciPy linkage matrix.
linked = linkage(
    X_scaled,
    method='ward',
)

In [None]:
# Draw the full dendrogram for the linkage computed above.
# NOTE(review): no truncation is applied, so there is one leaf per row of
# X_scaled — confirm this stays readable for the dataset size.
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(
    linked,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
    ax=ax,
)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Sample index')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs. Note the unscaled features `X` are used here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# `fit` returns the estimator itself, so construction and training can be chained.
nb = GaussianNB().fit(X_train, y_train)
nb  # show the fitted estimator as the cell output

In [None]:
# Score the held-out rows: hard class predictions plus per-class probabilities.
y_pred = nb.predict(X_test)
class_probabilities = nb.predict_proba(X_test)
# Column 1 is the probability of the second entry in `nb.classes_`.
# NOTE(review): that is only "the positive class" when the target is binary —
# confirm against the actual label values.
y_proba = class_probabilities[:, 1]

In [None]:
# Confusion matrix of actual vs. predicted labels on the test split.
# Pin the row/column order to the classifier's own classes and use them as tick
# labels; without this the heatmap axes show positional indices 0..k-1, which
# are easy to misread as the label values themselves.
class_labels = nb.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# ROC Curve
# roc_curve only accepts a binary target, while the star rating takes several
# values. Collapse it to "positive" (rating >= 4, the same cut-off the outcome
# analysis below uses) vs. "negative", and score each test row by the total
# probability the model assigns to the positive ratings.
positive_threshold = 4
positive_classes = nb.classes_ >= positive_threshold  # boolean mask over predict_proba columns
y_test_binary = (y_test >= positive_threshold).astype(int)
y_score = nb.predict_proba(X_test)[:, positive_classes].sum(axis=1)

fpr, tpr, thresholds = roc_curve(y_test_binary, y_score)
roc_auc = auc(fpr, tpr)

In [None]:
# Plot the ROC curve against the chance diagonal, with the AUC in the legend.
fig, ax = plt.subplots()
ax.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # random-guess baseline
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Additional Positive/Negative Outcome Analysis
# Split the raw (un-imputed) rows by rating: >= 4 counts as positive, < 4 as
# negative (adjust the cut-off as needed). Rows with a missing rating satisfy
# neither comparison and therefore fall into neither group.
rating = data[target_column]
positive_outcomes = data.loc[rating >= 4]
negative_outcomes = data.loc[rating < 4]

In [None]:
# Side-by-side rating distributions for the positive and negative groups.
# Passing the whole DataFrame to histplot would overlay every numeric column on
# one shared axis, which is unreadable; plot the rating column only.
# NOTE(review): swap `target_column` for another column if a different
# variable was meant to be compared between the two groups.
plt.figure(figsize=(12, 5))  # wide enough for two panels

plt.subplot(1, 2, 1)
sns.histplot(positive_outcomes[target_column], kde=True)
plt.title('Positive Outcomes')

plt.subplot(1, 2, 2)
sns.histplot(negative_outcomes[target_column], kde=True)
plt.title('Negative Outcomes')

plt.tight_layout()  # keep the two panels' labels from overlapping
plt.show()