In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Load the raw myopia dataset from the project's Resources folder and preview the first rows.
file_path = "Resources/myopia.csv"
origin_myopia_df = pd.read_csv(filepath_or_buffer=file_path)
origin_myopia_df.head()

In [None]:
# Build the working frame without the "MYOPIC" column.
# drop() returns a new frame here, so origin_myopia_df itself is not modified.
myopia_df = origin_myopia_df.drop(columns=["MYOPIC"])
myopia_df.head()

In [None]:
# Columns that are standardised before dimensionality reduction.
feature_columns = [
    "AGE", "SPHEQ", "AL", "ACD", "LT", "VCD",
    "SPORTHR", "READHR", "COMPHR", "STUDYHR", "TVHR", "DIOPTERHR",
    "MOMMY", "DADMY",
]

# Fit the scaler on those columns and transform them in one step.
scalar = StandardScaler()
scaled_data = scalar.fit_transform(myopia_df[feature_columns])

In [None]:
# Wrap the scaled array in a DataFrame, reusing the column labels of myopia_df.
# NOTE(review): this assumes the scaled columns are exactly myopia_df's columns,
# in the same order - confirm against the column list used for scaling.
X = pd.DataFrame(data=scaled_data, columns=myopia_df.columns)
X.head()

In [None]:
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 90%) of the variance.
pca = PCA(n_components=0.90)
myopia_pca = pca.fit_transform(X)

# Label the columns by component number, sized from the actual PCA output.
# Principal components are combinations of the input features, so reusing the
# first original feature names (e.g. "AGE") would be misleading, and a
# hard-coded count would raise a ValueError whenever PCA keeps a different
# number of components.
pc_labels = [f"PC{i}" for i in range(1, myopia_pca.shape[1] + 1)]
myopia_pca_df = pd.DataFrame(data=myopia_pca, columns=pc_labels)

In [None]:
# Embed only the PCA features. The "x"/"y" columns written at the end of this
# cell are excluded so that re-running the cell does not feed a previous
# embedding back into t-SNE.
pca_feature_cols = [col for col in myopia_pca_df.columns if col not in ("x", "y")]

# t-SNE is stochastic; a fixed seed makes the embedding (and the plot built
# from it) reproducible across runs.
tsne = TSNE(random_state=0)
tsne_features = tsne.fit_transform(myopia_pca_df[pca_feature_cols])

# Store the two embedding coordinates on the frame for plotting.
myopia_pca_df["x"] = tsne_features[:, 0]
myopia_pca_df["y"] = tsne_features[:, 1]

In [None]:
# Scatter plot of the "x" and "y" columns stored on myopia_pca_df.
tsne_x = myopia_pca_df["x"]
tsne_y = myopia_pca_df["y"]
plt.scatter(x=tsne_x, y=tsne_y)
plt.show()

In [None]:
# Candidate cluster counts for the elbow analysis: 1 through 10.
k = list(range(1, 11))

# Fit one KMeans model per candidate k and record its inertia.
# NOTE(review): myopia_pca_df is used as-is, so any "x"/"y" t-SNE columns that
# were added to it are part of the clustering input - confirm this is intended.
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(myopia_pca_df)
    inertia.append(km.inertia_)
elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)

# Elbow curve: inertia against the number of clusters.
plt.plot(elbow_df["k"], elbow_df["inertia"])
# One tick per evaluated k; range(1, 10) stopped at 9 and left k=10 unlabelled.
plt.xticks(k)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

## Conclusion
After reducing the dimensionality of the myopia data, one can see that there are 5 distinct clusters of patients. Applying this 5-cluster model to the
bigger dataset is worth exploring, since initial efforts to discover distinct patient groups using the whole dataset were unsuccessful.