In [24]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Let text cells render up to 400 characters before pandas truncates them
pd.set_option("display.max_colwidth", 400)

In [25]:
# Load the zipped CSV of diabetes health indicators into a DataFrame.
# NOTE(review): this is an absolute, machine-specific location, so the
# notebook only runs where this exact file exists.
DIABETES_CSV_PATH = Path(r"C:\Users\desti\Downloads\diabetes_012_health_indicators_BRFSS2015.csv.zip")
Diabetesindicator_df = pd.read_csv(DIABETES_CSV_PATH)

# Preview the first five rows
Diabetesindicator_df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [26]:
# Print a concise summary of the diabetes DataFrame: index range, column
# names, non-null counts, and dtypes.
Diabetesindicator_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [27]:
# Show the (rows, columns) dimensions of the diabetes DataFrame
Diabetesindicator_df.shape

(253680, 22)

In [28]:
# Standardize the selected disease and demographic columns so they can be
# compared on a common scale.
risk_factor_frame = Diabetesindicator_df[
    [
        "Diabetes_012",
        "HighBP",
        "CholCheck",
        "BMI",
        "Smoker",
        "Stroke",
        "HeartDiseaseorAttack",
        "Sex",
        "Age",
        "Education",
        "Income",
    ]
]
scaler = StandardScaler()
diabetes_risk_factors = scaler.fit_transform(risk_factor_frame)

In [29]:
# Wrap the scaled array in a DataFrame, restoring the column labels in the
# same order the columns were selected for scaling.
scaled_column_names = [
    "Diabetes_012",
    "HighBP",
    "CholCheck",
    "BMI",
    "Smoker",
    "Stroke",
    "HeartDiseaseorAttack",
    "Sex",
    "Age",
    "Education",
    "Income",
]
Scaled_Diabetesindicator_df = pd.DataFrame(data=diabetes_risk_factors, columns=scaled_column_names)


In [30]:
# One-hot encode the "HeartDiseaseorAttack" column. get_dummies emits one
# boolean indicator column per distinct value (labelled 0.0 and 1.0 in the
# preview), i.e. absence vs. presence of the condition.
heart_flag = Diabetesindicator_df["HeartDiseaseorAttack"]
Heartdisease_dummies = pd.get_dummies(heart_flag)

# Preview the indicator columns
Heartdisease_dummies.head()

Unnamed: 0,0.0,1.0
0,True,False
1,True,False
2,True,False
3,True,False
4,True,False


In [31]:
# Append the "Heartdisease_dummies" indicator columns to the right of the
# scaled features.
# NOTE: the result is assigned back to the same name, so running this cell a
# second time appends the indicator columns again.
frames_to_join = [Scaled_Diabetesindicator_df, Heartdisease_dummies]
Scaled_Diabetesindicator_df = pd.concat(frames_to_join, axis="columns")

# Preview the combined frame
Scaled_Diabetesindicator_df.head()

Unnamed: 0,Diabetes_012,HighBP,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,Sex,Age,Education,Income,0.0,1.0
0,-0.425292,1.153688,0.196922,1.757936,1.120927,-0.205637,-0.322458,-0.887021,0.3169,-1.065595,-1.474487,True,False
1,-0.425292,-0.866785,-5.078164,-0.511806,1.120927,-0.205637,-0.322458,-0.887021,-0.337933,0.963272,-2.440138,True,False
2,-0.425292,1.153688,0.196922,-0.057858,-0.892119,-0.205637,-0.322458,-0.887021,0.3169,-1.065595,0.939638,True,False
3,-0.425292,1.153688,0.196922,-0.209174,-0.892119,-0.205637,-0.322458,-0.887021,0.971733,-2.080028,-0.026012,True,False
4,-0.425292,1.153688,0.196922,-0.663122,-0.892119,-0.205637,-0.322458,-0.887021,0.971733,-0.051162,-0.991662,True,False


In [32]:
# Initialize the K-Means model with n_clusters=3.
# random_state pins the centroid initialization so the cluster assignments
# are reproducible from run to run; 0 matches the seed used in the
# elbow-curve loop later in this notebook.
model = KMeans(n_clusters=3, random_state=0)

In [33]:
# Cast every column label to str before handing the frame to scikit-learn
# (the indicator columns are labelled with the floats 0.0 and 1.0).
string_labels = Scaled_Diabetesindicator_df.columns.astype(str)
Scaled_Diabetesindicator_df.columns = string_labels

# Fit the K-Means model on the scaled features plus indicator columns
model.fit(Scaled_Diabetesindicator_df)

In [34]:
# Predict the model segments (clusters): label each row of the frame with the
# fitted K-Means cluster it is assigned to.
Diabetes_clusters = model.predict(Scaled_Diabetesindicator_df)

# Print the array of cluster labels (the middle of a long array is elided
# in the printed output)
print(Diabetes_clusters)

[0 0 0 ... 1 0 2]


In [35]:
# Attach each row's predicted cluster label as a "DiabetesCluster" column
Scaled_Diabetesindicator_df = Scaled_Diabetesindicator_df.assign(
    DiabetesCluster=Diabetes_clusters
)

# Preview the labelled frame
Scaled_Diabetesindicator_df.head()

Unnamed: 0,Diabetes_012,HighBP,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,Sex,Age,Education,Income,0.0,1.0,DiabetesCluster
0,-0.425292,1.153688,0.196922,1.757936,1.120927,-0.205637,-0.322458,-0.887021,0.3169,-1.065595,-1.474487,True,False,0
1,-0.425292,-0.866785,-5.078164,-0.511806,1.120927,-0.205637,-0.322458,-0.887021,-0.337933,0.963272,-2.440138,True,False,0
2,-0.425292,1.153688,0.196922,-0.057858,-0.892119,-0.205637,-0.322458,-0.887021,0.3169,-1.065595,0.939638,True,False,0
3,-0.425292,1.153688,0.196922,-0.209174,-0.892119,-0.205637,-0.322458,-0.887021,0.971733,-2.080028,-0.026012,True,False,0
4,-0.425292,1.153688,0.196922,-0.663122,-0.892119,-0.205637,-0.322458,-0.887021,0.971733,-0.051162,-0.991662,True,False,0


In [36]:
import hvplot.pandas

In [37]:
# Create a scatter plot with x="CholCheck", y="BMI", colored by K-Means cluster.
# Hovering a point also shows its scaled "Diabetes_012" value.
Scaled_Diabetesindicator_df.hvplot.scatter(
    x="CholCheck",
    y="BMI",
    by="DiabetesCluster",
    hover_cols=["Diabetes_012"],
    # The title previously read "Stock Segment", which does not describe this data
    title="Scatter Plot by Diabetes Segment - k=3",
)

In [38]:
# Create the PCA model instance where n_components=2.
# random_state makes the projection reproducible whenever scikit-learn picks
# a randomized SVD solver; it has no effect with the exact solvers.
pca = PCA(n_components=2, random_state=0)

In [None]:
# Fit PCA on the feature columns only. The "DiabetesCluster" column holds the
# K-Means labels that were derived from these same features, so it is dropped
# (if present) instead of being fed back in as an input feature.
pca_features = Scaled_Diabetesindicator_df.drop(columns=["DiabetesCluster"], errors="ignore")
diabetes_pca_data = pca.fit_transform(pca_features)

# Review the first five rows of the PCA data
diabetes_pca_data[:5]

In [None]:
# Show the explained variance ratio: the fraction of the total variance
# captured by each of the fitted principal components.
pca.explained_variance_ratio_

In [None]:
# Creating a DataFrame with the PCA data
pca_data = pd.DataFrame(diabetes_pca_data, columns=["PC1", "PC2"])

# Copy the column names from the original data
pca_data["Diabetes_012"] = Diabetesindicator_df.index

# Set the Diabetes_012 column as index
pca_data = Diabetesindicator_df.set_index("Diabetes_012")

# Review the DataFrame
pca_data.head()

In [None]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

# Fit the model for the pca_data DataFrame
model.fit(pca_data)

# Predict the model segments (clusters)
diabetes_clusters = model.predict(pca_data)

# Print the diabetes segments
print(diabetes_clusters)

In [None]:
# Create a copy of the diabetes_pca_predictions DataFrame and name it as diabetes_pca_predictions
diabetes_pca_predictions = pca_data.copy()

# Create a new column in the DataFrame with the predicted clusters
diabetes_pca_predictions["diabetesCluster"] = diabetes_clusters

# Review the DataFrame
diabetes_pca_predictions.head()

In [None]:
import hvplot.pandas

In [None]:
# Create the scatter plot with x="BMI" and y="GenHlth"
diabetes_pca_predictions.hvplot.scatter(
    x="BMI",
    y="GenHlth",
    by="diabetesCluster",
    title = "Scatter Plot by Stock Segment - PCA=2"
)

In [None]:
from sklearn.cluster import KMeans
import pandas as pd

In [None]:
# Create a list with the k-values to try: 1 through 10
# (range's stop value, 11, is excluded).
k = list(range(1, 11))

# Perform PCA on the scaled features. The "DiabetesCluster" label column is
# dropped (if present) so previously assigned cluster labels are not used as
# an input feature.
pca = PCA(n_components=2, random_state=0)
pca_features = Scaled_Diabetesindicator_df.drop(columns=["DiabetesCluster"], errors="ignore")
diabetes_pca_data = pca.fit_transform(pca_features)

# Create a DataFrame with the PCA data. The columns are principal components,
# not original features, so they are labelled PC1/PC2.
pca_data = pd.DataFrame(diabetes_pca_data, columns=["PC1", "PC2"])

# Preview the first rows of the PCA data
pca_data.head()

In [None]:
# Start with an empty list; the elbow loop appends one inertia value per k
inertia = list()

In [None]:
# For every candidate cluster count in `k`:
#   - build a KMeans model with that many clusters (seeded with random_state=0)
#   - fit it to `pca_data`
#   - record the fitted model's inertia_ in the `inertia` list
for n_clusters in k:
    model = KMeans(n_clusters=n_clusters, random_state=0).fit(pca_data)
    inertia.append(model.inertia_)

In [None]:
# Pair each tested k with its inertia, as the data behind the Elbow curve
elbow_pca_data = dict(k=k, inertia=inertia)

# Turn that mapping into a DataFrame with "k" and "inertia" columns
df_elbow_pca = pd.DataFrame.from_dict(elbow_pca_data)

In [None]:
# Draw inertia against k as a line chart, with one tick per tested k, so the
# "elbow" (the best value for k) can be identified visually.
elbow_pca_plot = df_elbow_pca.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve Using PCA Data",
    xticks=k,
)
elbow_pca_plot