In [None]:
import pandas as pd

In [None]:
# Read the myopia dataset from the Resources folder and preview its first rows
df = pd.read_csv(filepath_or_buffer="Resources/myopia.csv")
df.head(5)

In [None]:
# Feature frame: every column of the raw data except "MYOPIC"
new_df = df.drop(columns=["MYOPIC"])
new_df.head(5)

In [None]:
# Report how many missing values each feature column contains
for column, null_count in new_df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

In [None]:
# Scale data
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the feature frame, then apply it to that same frame
scaler = StandardScaler().fit(new_df)
scaled_data = scaler.transform(new_df)
scaled_data

In [None]:
# Show the column labels of the feature frame (new_df)
new_df.columns

In [None]:
# Create a DataFrame with the scaled data, labelled with the feature names.
# Every column label of new_df is used (no hard-coded slice), so the labels
# always match the width of scaled_data, which was computed from new_df.
Transformed_df = pd.DataFrame(scaled_data, columns=new_df.columns)

Transformed_df.head()

PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Initialize PCA model
# A float n_components between 0 and 1 asks PCA to keep as many components
# as are needed to explain that fraction of the variance (here 90%), so the
# number of output columns is decided by the data, not fixed in advance.
pca = PCA(n_components=0.90)

# Fit PCA on the scaled features and project them onto the retained components.
myopia_pca = pca.fit_transform(scaled_data)
myopia_pca

In [None]:
# Wrap the PCA output array in a DataFrame (default integer column labels)
df_pca = pd.DataFrame(myopia_pca)
df_pca

In [None]:
# Fraction of the total variance explained by each retained principal component
pca.explained_variance_ratio_

t-SNE

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Initialize t-SNE model
# t-SNE is stochastic: fix the seed so the embedding, and every plot built
# from it, is reproducible after a kernel restart.
tsne = TSNE(learning_rate=35, random_state=0)

In [None]:
# Reduce dimensions
# Embed the PCA components only. Later cells add "x", "y" and "class" columns
# to df_pca; dropping them here (when present) keeps this cell safe to re-run
# without feeding its own output back into the embedding.
tsne_input = df_pca.drop(columns=["x", "y", "class"], errors="ignore")
tsne_features = tsne.fit_transform(tsne_input)

In [None]:
# Shape of the t-SNE output as (rows, embedding dimensions); the cells below
# read its first two columns as the plotting coordinates
tsne_features.shape

In [None]:
# Store the first and second t-SNE coordinates on df_pca as "x" and "y"
df_pca['x'], df_pca['y'] = tsne_features[:, 0], tsne_features[:, 1]

In [None]:
# Scatter plot of the two t-SNE coordinates
plt.scatter(x=df_pca['x'], y=df_pca['y'])
plt.show()

In [None]:
# Color labels for the t-SNE plot: the known "MYOPIC" flag from the raw data.
# df_pca has no "class" column at this point in the notebook (get_clusters
# only adds it further down), so reading df_pca['class'] here raises a
# KeyError on a fresh Restart & Run All. The rows of df and df_pca are in the
# same order, so the labels line up with the plotted points.
labels = df["MYOPIC"]
labels.value_counts()

In [None]:
# Same t-SNE scatter plot, with each point colored by its label
plt.scatter(x=df_pca['x'], y=df_pca['y'], c=labels)
plt.show()

K-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster on the PCA components only. By this point df_pca also carries the
# t-SNE plotting coordinates ("x", "y") and, once get_clusters has run, a
# "class" column; none of these are features, and mixing their string labels
# with the integer PCA column labels is rejected by recent scikit-learn.
elbow_features = df_pca.drop(columns=["x", "y", "class"], errors="ignore")

inertia = []
k = list(range(1, 10))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

# Collect the results in a DataFrame for the elbow plot in the next cell
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()

In [None]:
# Draw inertia against k; the bend in the curve marks the candidate value(s) for k
k_values, inertia_values = df_elbow["k"], df_elbow["inertia"]
plt.plot(k_values, inertia_values)
plt.title("Elbow curve for Myopia Cluster Data")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.xticks(range(1, 10))
plt.show()

In [None]:
def get_clusters(k, df_pca):
    """Cluster the PCA features with K-Means and label each row.

    Parameters
    ----------
    k : int
        Number of clusters to fit.
    df_pca : pandas.DataFrame
        PCA components, optionally alongside the helper columns "x" and "y"
        (t-SNE plotting coordinates) and "class" (labels from an earlier call).

    Returns
    -------
    pandas.DataFrame
        The same ``df_pca`` object, modified in place: its "class" column
        holds the cluster label assigned to each row.
    """
    # Cluster on the PCA components only. The plotting coordinates and any
    # previous labels are not features; leaving them in would distort the
    # distances and make repeated calls feed their own output back in.
    features = df_pca.drop(columns=["x", "y", "class"], errors="ignore")

    # Initialize and train the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)

    # fit() already stores the training-set assignments in labels_, so no
    # separate predict() pass is needed.
    df_pca["class"] = model.labels_

    return df_pca

In [None]:
# Label every row with one of five K-Means clusters and show the labelled frame
# NOTE(review): the recommendation at the end of the notebook mentions 6 clusters
# while k=5 is used here - confirm which value the elbow curve supports.
clusters = get_clusters(k=5, df_pca=df_pca)
clusters

In [None]:
def show_clusters(df):
    """Scatter-plot the t-SNE coordinates, colored by cluster label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "x" and "y" (plotting coordinates) and
        "class" (label used for the point color).
    """
    # Plot the frame that was passed in rather than the notebook-level df_pca,
    # so the function shows whatever the caller hands it.
    plt.scatter(x=df["x"], y=df["y"], c=df["class"])
    # The axes are the two t-SNE coordinates, not the elbow-curve quantities.
    plt.xlabel("t-SNE component 1")
    plt.ylabel("t-SNE component 2")

    plt.show()

In [None]:
# Plot the labelled frame returned by get_clusters
show_clusters(df=clusters)

Recommendation: 

The patients should be grouped into 6 clusters, as both models above (t-SNE and K-Means) show. To check this result, split the data into training and test sets and confirm that the clusters hold up on the held-out data. A larger dataset would also improve the accuracy of the predicted clusters.