In [1]:
# Importing all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import linkage, dendrogram

# Importing additional libraries for advanced tasks
try:
    import plotly.express as px
    import imblearn
    from feature_engine import imputation
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
    import missingno as msno
    import statsmodels.api as sm
except ImportError:
    print("Optional libraries not installed. Install them if needed.")

# All libraries are now ready to use
print("Libraries imported successfully!")

# Importing the warnings library to ignore warnings
import warnings
warnings.filterwarnings('ignore')

print("Warnings will now be ignored!")

Libraries imported successfully!


In [2]:
def load_data(file_path):
    """
    Function to load a dataset from a specified file path.
    
    Parameters:
    - file_path (str): The file path of the dataset.
    
    Returns:
    - data (DataFrame): Loaded dataset as a pandas DataFrame.
    """
    try:
        data = pd.read_csv(file_path)
        print(f"Data loaded successfully! Shape: {data.shape}")
        return data
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
    except pd.errors.ParserError:
        print("Error: File could not be parsed. Please check the file format.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

#usage 
diabetes_dataset = load_data('cleaned_diabetic_data.csv')
IDS_mapping_dataset = load_data('cleaned_IDS_mapping.csv')

Data loaded successfully! Shape: (101766, 53)
Data loaded successfully! Shape: (58, 2)


In [3]:
# Replace missing values ("?") with NaN for easier handling
diabetes_dataset.replace("?", pd.NA, inplace=True)

# Check for missing values and their counts
missing_values = diabetes_dataset.isna().sum()

# Drop columns with excessive missing values if necessary
columns_to_drop = [col for col in diabetes_dataset.columns if diabetes_dataset[col].isna().mean() > 0.3]
diabetes_dataset_cleaned = diabetes_dataset.drop(columns=columns_to_drop, axis=1)

# Display the columns dropped and the updated missing value count
cleaned_missing_values = diabetes_dataset_cleaned.isna().sum()

missing_values 

encounter_id                      0
patient_nbr                       0
race                              0
gender                            0
age                               0
weight                            0
admission_type_id                 0
discharge_disposition_id          0
admission_source_id               0
time_in_hospital                  0
payer_code                        0
medical_specialty                 0
num_lab_procedures                0
num_procedures                    0
num_medications                   0
number_outpatient                 0
number_emergency                  0
number_inpatient                  0
diag_1                           21
diag_2                          358
diag_3                         1423
number_diagnoses                  0
max_glu_serum                 96420
A1Cresult                     84748
metformin                         0
repaglinide                       0
nateglinide                       0
chlorpropamide              

In [4]:
diabetes_dataset_cleaned.shape

(101766, 51)

In [5]:
columns_to_drop

['max_glu_serum', 'A1Cresult']

In [6]:
cleaned_missing_values

encounter_id                     0
patient_nbr                      0
race                             0
gender                           0
age                              0
weight                           0
admission_type_id                0
discharge_disposition_id         0
admission_source_id              0
time_in_hospital                 0
payer_code                       0
medical_specialty                0
num_lab_procedures               0
num_procedures                   0
num_medications                  0
number_outpatient                0
number_emergency                 0
number_inpatient                 0
diag_1                          21
diag_2                         358
diag_3                        1423
number_diagnoses                 0
metformin                        0
repaglinide                      0
nateglinide                      0
chlorpropamide                   0
glimepiride                      0
acetohexamide                    0
glipizide           

In [7]:
diabetes_dataset_cleaned.shape

(101766, 51)

In [8]:
data_dropped = diabetes_dataset_cleaned.dropna()

# Verify the shape and ensure no missing values remain
print("Shape After Dropping Missing Data:", data_dropped.shape)
print("Remaining Missing Values:", data_dropped.isnull().sum().sum())

Shape After Dropping Missing Data: (100244, 51)
Remaining Missing Values: 0


In [9]:
data_dropped.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_description,weight_flag,outlier_flag
1,149190,55629189,Caucasian,Female,15,Unknown,1,1,7,3,...,No,No,No,No,Change,Yes,Readmitted,Physician Referral,Unknown,False
2,64410,86047875,AfricanAmerican,Female,25,Unknown,1,1,7,2,...,No,No,No,No,No Change,Yes,Not Readmitted,Physician Referral,Unknown,False
3,500364,82442376,Caucasian,Male,35,Unknown,1,1,7,2,...,No,No,No,No,Change,Yes,Not Readmitted,Physician Referral,Unknown,False
4,16680,42519267,Caucasian,Male,45,Unknown,1,1,7,1,...,No,No,No,No,Change,Yes,Not Readmitted,Physician Referral,Unknown,False
5,35754,82637451,Caucasian,Male,55,Unknown,2,1,2,3,...,No,No,No,No,No Change,Yes,Readmitted,Clinic Referral,Unknown,False


In [10]:
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100244 entries, 1 to 101765
Data columns (total 51 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   encounter_id                100244 non-null  int64 
 1   patient_nbr                 100244 non-null  int64 
 2   race                        100244 non-null  object
 3   gender                      100244 non-null  object
 4   age                         100244 non-null  int64 
 5   weight                      100244 non-null  object
 6   admission_type_id           100244 non-null  int64 
 7   discharge_disposition_id    100244 non-null  int64 
 8   admission_source_id         100244 non-null  int64 
 9   time_in_hospital            100244 non-null  int64 
 10  payer_code                  100244 non-null  object
 11  medical_specialty           100244 non-null  object
 12  num_lab_procedures          100244 non-null  int64 
 13  num_procedures              100244

In [11]:
# Identify categorical columns
categorical_columns = data_dropped.select_dtypes(include=['object', 'category']).columns

# Factorize all categorical columns
for col in categorical_columns:
    data_dropped[col], _ = pd.factorize(data_dropped[col])

# Check the result
print("Factorized DataFrame:")
data_dropped.head()

Factorized DataFrame:


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_description,weight_flag,outlier_flag
1,149190,55629189,0,0,15,0,1,1,7,3,...,0,0,0,0,0,0,0,0,0,False
2,64410,86047875,1,0,25,0,1,1,7,2,...,0,0,0,0,1,0,1,0,0,False
3,500364,82442376,0,1,35,0,1,1,7,2,...,0,0,0,0,0,0,1,0,0,False
4,16680,42519267,0,1,45,0,1,1,7,1,...,0,0,0,0,0,0,1,0,0,False
5,35754,82637451,0,1,55,0,2,1,2,3,...,0,0,0,0,1,0,0,1,0,False


In [12]:
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100244 entries, 1 to 101765
Data columns (total 51 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   encounter_id                100244 non-null  int64
 1   patient_nbr                 100244 non-null  int64
 2   race                        100244 non-null  int64
 3   gender                      100244 non-null  int64
 4   age                         100244 non-null  int64
 5   weight                      100244 non-null  int64
 6   admission_type_id           100244 non-null  int64
 7   discharge_disposition_id    100244 non-null  int64
 8   admission_source_id         100244 non-null  int64
 9   time_in_hospital            100244 non-null  int64
 10  payer_code                  100244 non-null  int64
 11  medical_specialty           100244 non-null  int64
 12  num_lab_procedures          100244 non-null  int64
 13  num_procedures              100244 non-null  int6

In [14]:
# Summarize the dataset by examining distributions, data types, and missing values

# Data Types
data_types_summary = data_dropped.dtypes.value_counts()

# Missing Values Count and Percentage
missing_values_count = data_dropped.isnull().sum()
missing_values_percentage = (missing_values_count / len(data_dropped)) * 100

# Distributions: Summary statistics for numerical features
numerical_features = data_dropped.select_dtypes(include=['int64', 'float64']).columns
numerical_summary = data_dropped[numerical_features].describe()

# Distributions: Categorical features (value counts)
categorical_features = data_dropped.select_dtypes(include=['object', 'category']).columns
categorical_summary = {feature: diabetic_data[feature].value_counts().head() for feature in categorical_features}

# Relevant Variables
classification_target = "readmitted"  # Predicting readmission
clustering_features = [
    "race", "gender", "age", "num_lab_procedures", "num_medications",
    "time_in_hospital", "number_diagnoses", "number_inpatient",
    "number_outpatient", "number_emergency"
]

# Display results
exploration_results = {
    "Data Types Summary": data_types_summary,
    "Missing Values Count": missing_values_count,
    "Missing Values Percentage": missing_values_percentage,
    "Numerical Features Summary": numerical_summary,
    "Categorical Features Summary (Top 5 Values)": categorical_summary,
    "Classification Target": classification_target,
    "Clustering Features": clustering_features,
}

exploration_results

{'Data Types Summary': int64    50
 bool      1
 Name: count, dtype: int64,
 'Missing Values Count': encounter_id                  0
 patient_nbr                   0
 race                          0
 gender                        0
 age                           0
 weight                        0
 admission_type_id             0
 discharge_disposition_id      0
 admission_source_id           0
 time_in_hospital              0
 payer_code                    0
 medical_specialty             0
 num_lab_procedures            0
 num_procedures                0
 num_medications               0
 number_outpatient             0
 number_emergency              0
 number_inpatient              0
 diag_1                        0
 diag_2                        0
 diag_3                        0
 number_diagnoses              0
 metformin                     0
 repaglinide                   0
 nateglinide                   0
 chlorpropamide                0
 glimepiride                   0
 acetohex

In [18]:
# Detect and handle outliers using the IQR method for numerical features
numerical_features = data_dropped.select_dtypes(include=['int64', 'float64']).columns

# Calculate IQR for each numerical feature
Q1 = data_dropped[numerical_features].quantile(0.25)
Q3 = data_dropped[numerical_features].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Clip outliers to the bounds
data_dropped[numerical_features] = data_dropped[numerical_features].clip(
    lower=lower_bound, upper=upper_bound, axis=1
)

# Check the cleaned dataset
cleaned_data_info = {
    "Shape After Cleaning": data_dropped.shape,
    "Remaining Features": data_dropped.columns.tolist(),
    "Missing Values After Cleaning": data_dropped.isnull().sum()
}

cleaned_data_info

{'Shape After Cleaning': (100244, 51),
 'Remaining Features': ['encounter_id',
  'patient_nbr',
  'race',
  'gender',
  'age',
  'weight',
  'admission_type_id',
  'discharge_disposition_id',
  'admission_source_id',
  'time_in_hospital',
  'payer_code',
  'medical_specialty',
  'num_lab_procedures',
  'num_procedures',
  'num_medications',
  'number_outpatient',
  'number_emergency',
  'number_inpatient',
  'diag_1',
  'diag_2',
  'diag_3',
  'number_diagnoses',
  'metformin',
  'repaglinide',
  'nateglinide',
  'chlorpropamide',
  'glimepiride',
  'acetohexamide',
  'glipizide',
  'glyburide',
  'tolbutamide',
  'pioglitazone',
  'rosiglitazone',
  'acarbose',
  'miglitol',
  'troglitazone',
  'tolazamide',
  'examide',
  'citoglipton',
  'insulin',
  'glyburide-metformin',
  'glipizide-metformin',
  'glimepiride-pioglitazone',
  'metformin-rosiglitazone',
  'metformin-pioglitazone',
  'change',
  'diabetesMed',
  'readmitted',
  'admission_type_description',
  'weight_flag',
  'outl

In [20]:
# Map 'admission_type_id' to descriptions using the IDS mapping file
ids_mapping_dict = IDS_mapping_dataset.set_index('admission_type_id')['description'].to_dict()
data_dropped['admission_type_description'] = data_dropped['admission_type_id'].map(ids_mapping_dict)

# One-hot encode categorical features
categorical_features = ['race', 'gender', 'admission_type_description', 'change', 'diabetesMed', 'readmitted']
diabetic_data_encoded = pd.get_dummies(data_dropped, columns=categorical_features, drop_first=True)

# Normalize numerical features using Min-Max Scaling
numerical_features = diabetic_data_encoded.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
diabetic_data_encoded[numerical_features] = scaler.fit_transform(diabetic_data_encoded[numerical_features])

# Check the transformed dataset
feature_engineering_info = {
    "Shape After Encoding and Normalization": diabetic_data_encoded.shape,
    "Sample of Encoded Columns": diabetic_data_encoded.columns[:10].tolist(),
    "First Few Rows (Preview)": diabetic_data_encoded.head()
}

feature_engineering_info

{'Shape After Encoding and Normalization': (100244, 57),
 'Sample of Encoded Columns': ['encounter_id',
  'patient_nbr',
  'age',
  'weight',
  'admission_type_id',
  'discharge_disposition_id',
  'admission_source_id',
  'time_in_hospital',
  'payer_code',
  'medical_specialty'],
 'First Few Rows (Preview)':    encounter_id  patient_nbr       age  weight  admission_type_id  \
 1      0.000308     0.301886  0.000000     0.0                0.0   
 2      0.000117     0.466962  0.000000     0.0                0.0   
 3      0.001099     0.447395  0.142857     0.0                0.0   
 4      0.000009     0.230742  0.285714     0.0                0.0   
 5      0.000052     0.448454  0.428571     0.0                0.2   
 
    discharge_disposition_id  admission_source_id  time_in_hospital  \
 1                       0.0             0.400000          0.181818   
 2                       0.0             0.400000          0.090909   
 3                       0.0             0.400000      

In [22]:
diabetic_data_encoded

Unnamed: 0,encounter_id,patient_nbr,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,race_2.5,gender_1,gender_2,admission_type_description_ Transfer from a Skilled Nursing Facility (SNF),admission_type_description_ Transfer from another health care facility,admission_type_description_Clinic Referral,admission_type_description_HMO Referral,admission_type_description_Transfer from a hospital,change_1,readmitted_1
1,0.000308,0.301886,0.000000,0.0,0.0,0.000000,0.400000,0.181818,0.000000,0.000000,...,False,False,False,False,False,False,False,False,False,False
2,0.000117,0.466962,0.000000,0.0,0.0,0.000000,0.400000,0.090909,0.000000,0.000000,...,False,False,False,False,False,False,False,False,True,True
3,0.001099,0.447395,0.142857,0.0,0.0,0.000000,0.400000,0.090909,0.000000,0.000000,...,False,True,False,False,False,False,False,False,False,True
4,0.000009,0.230742,0.285714,0.0,0.0,0.000000,0.400000,0.000000,0.000000,0.000000,...,False,True,False,False,False,False,False,False,False,True
5,0.000052,0.448454,0.428571,0.0,0.2,0.000000,0.066667,0.181818,0.000000,0.000000,...,False,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,0.999956,0.543558,0.714286,0.0,0.0,0.266667,0.400000,0.181818,0.133333,0.000000,...,False,True,False,False,False,False,False,False,False,False
101762,0.999956,0.405348,0.857143,0.0,0.0,0.400000,0.266667,0.363636,0.133333,0.000000,...,False,False,False,False,False,False,False,False,True,True
101763,0.999971,0.222979,0.714286,0.0,0.0,0.000000,0.400000,0.000000,0.133333,0.000000,...,False,True,False,False,False,False,False,False,False,True
101764,0.999977,0.171994,0.857143,0.0,0.2,0.266667,0.400000,0.818182,0.133333,0.533333,...,False,False,False,False,False,True,False,False,False,True


In [24]:
# Save the transformed dataset to a CSV file
diabetic_data_encoded.to_csv('diabetic_data_transformed.csv', index=False)

print("Transformed dataset saved as 'diabetic_data_transformed.csv'.")

Transformed dataset saved as 'diabetic_data_transformed.csv'.


In [26]:
X = diabetic_data_encoded.drop(columns=["readmitted_1"])  
y = diabetic_data_encoded["readmitted_1"]  

In [28]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [30]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [32]:
print("Training Set Shape:", X_train.shape)
print("Validation Set Shape:", X_val.shape)
print("Testing Set Shape:", X_test.shape)

Training Set Shape: (70170, 56)
Validation Set Shape: (15037, 56)
Testing Set Shape: (15037, 56)


In [34]:
print("Training Class Balance:", y_train.value_counts(normalize=True))
print("Validation Class Balance:", y_val.value_counts(normalize=True))
print("Testing Class Balance:", y_test.value_counts(normalize=True))

Training Class Balance: readmitted_1
True     0.536896
False    0.463104
Name: proportion, dtype: float64
Validation Class Balance: readmitted_1
True     0.536942
False    0.463058
Name: proportion, dtype: float64
Testing Class Balance: readmitted_1
True     0.536876
False    0.463124
Name: proportion, dtype: float64


In [107]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Neural Network (MLP)": MLPClassifier(max_iter=500, random_state=42),
}

In [109]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Store results
results = {}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Evaluate metrics
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_prob) if y_val_prob is not None else None
    
    # Store results
    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "ROC-AUC": roc_auc,
    }

# Convert results to a DataFrame
results_df = pd.DataFrame(results).T
results_df

[LightGBM] [Info] Number of positive: 37674, number of negative: 32496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1396
[LightGBM] [Info] Number of data points in the train set: 70170, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.536896 -> initscore=0.147853
[LightGBM] [Info] Start training from score 0.147853
                        Accuracy  Precision    Recall  F1-Score   ROC-AUC
Logistic Regression     0.629181   0.627710  0.760342  0.687689  0.668497
Random Forest           0.649598   0.664016  0.703245  0.683068  0.704280
Gradient Boosting       0.658243   0.664758  0.733342  0.697368  0.708158
XGBoost                 0.658310   0.675179  0.700768  0.687736  0.714894
LightGBM                0.663563   0.675148  0.719718  0.696721  0.721347


In [111]:
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,ROC-AUC
Logistic Regression,0.629181,0.62771,0.760342,0.687689,0.668497
Random Forest,0.649598,0.664016,0.703245,0.683068,0.70428
Gradient Boosting,0.658243,0.664758,0.733342,0.697368,0.708158
XGBoost,0.65831,0.675179,0.700768,0.687736,0.714894
LightGBM,0.663563,0.675148,0.719718,0.696721,0.721347
Support Vector Machine,0.638093,0.638702,0.750557,0.690126,0.686481
K-Nearest Neighbors,0.581898,0.606483,0.630295,0.61816,0.605259
Decision Tree,0.566203,0.597363,0.589299,0.593304,0.564361
Naive Bayes,0.47044,0.669725,0.027124,0.052137,0.643291
Neural Network (MLP),0.632972,0.664605,0.638841,0.651468,0.688553


In [113]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42), param_grid, cv=3, scoring="f1"
)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best F1-Score: 0.7005388343070553


In [115]:
# Predict and evaluate on test set
y_test_pred = grid_search.best_estimator_.predict(X_test)
y_test_prob = grid_search.best_estimator_.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_prob)

print("Test Accuracy:", test_accuracy)
print("Test ROC-AUC:", test_roc_auc)

Test Accuracy: 0.6470040566602381
Test ROC-AUC: 0.7047014914807038


In [None]:
from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Subset the data for clustering (use numerical features only)
clustering_data = X_train.select_dtypes(include=['float64', 'int64'])

# Apply K-Means
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(clustering_data)

# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=10)
dbscan_labels = dbscan.fit_predict(clustering_data)

# Apply Hierarchical Clustering
linkage_matrix = linkage(clustering_data, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(linkage_matrix)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Elbow Method for K-Means
wcss = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(clustering_data)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(range(2, 11), wcss, marker='o')
plt.title("Elbow Method for Optimal k")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()

# Silhouette Score
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(clustering_data)
    silhouette_avg = silhouette_score(clustering_data, labels)
    print(f"For k = {k}, the Silhouette Score is {silhouette_avg}")