In [None]:
import pickle
from pathlib import Path

import pandas as pd
import plotly.express as px
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Load the bundled iris dataset; later cells read its "data", "feature_names",
# "target" and "target_names" entries.
iris = datasets.load_iris()

In [12]:
# Feature table: the raw measurement matrix with one labelled column per
# entry in iris["feature_names"].
df_features = pd.DataFrame(iris["data"], columns=list(iris["feature_names"]))
df_features

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [13]:
# Label table: the integer class id of every sample plus the species name that
# id maps to in iris["target_names"].
species_by_id = dict(enumerate(iris["target_names"]))
df_classes = pd.DataFrame({"class_id": iris["target"]})
df_classes = df_classes.assign(name=df_classes["class_id"].map(species_by_id))
df_classes

Unnamed: 0,class_id,name
0,0,setosa
1,0,setosa
2,0,setosa
3,0,setosa
4,0,setosa
...,...,...
145,2,virginica
146,2,virginica
147,2,virginica
148,2,virginica


In [14]:
# Hold out 10 samples as the test set and train on the remaining ones.
# test_size is passed as an absolute count (int) instead of the fraction 10/150,
# so the number of held-out rows does not rely on float rounding or on the
# dataset having exactly 150 rows.
# The bare annotations only give editors/type checkers the DataFrame type;
# train_test_split itself returns an untyped list.
X_train: pd.DataFrame
X_test: pd.DataFrame
y_train: pd.DataFrame
y_test: pd.DataFrame
X_train, X_test, y_train, y_test = train_test_split(
    df_features, df_classes, test_size=10, random_state=42
)

In [15]:
# Fit a 3-cluster KMeans model on the training features only; the class labels
# are never shown to the model. random_state pins the centroid initialisation.
kmeans = KMeans(
    n_clusters=3,
    random_state=0,
    n_init="auto",
)
kmeans = kmeans.fit(X_train)  # fit() returns the estimator itself

In [16]:
# Attach the cluster id KMeans assigned to each training sample.
# `assign` builds a new frame instead of writing a column into the split output
# in place, so the cell is safe to re-run and does not depend on whether the
# split handed back a copy or a view.
y_train = y_train.assign(predicted_cluster=kmeans.labels_)

# Count samples for every (species, cluster) pair to see how well the
# unsupervised clusters line up with the true species.
(
    y_train
    .groupby(["name", "predicted_cluster"], as_index=False)
    .agg(total=("class_id", "count"))
)

Unnamed: 0,name,predicted_cluster,total
0,setosa,1,48
1,versicolor,0,3
2,versicolor,2,41
3,virginica,0,34
4,virginica,2,14


In [17]:
# Put training features and labels (including predicted_cluster) side by side;
# both frames carry the same row index from the split.
df_train = pd.concat([X_train, y_train], axis=1)

# 3D view of the training set: colour encodes the true species, marker shape
# encodes the cluster KMeans assigned.
cluster_symbols = {0: "cross", 1: "square", 2: "circle"}
px.scatter_3d(
    data_frame=df_train,
    x="sepal length (cm)",
    y="petal length (cm)",
    z="sepal width (cm)",
    color="name",
    symbol="predicted_cluster",
    symbol_map=cluster_symbols,
    height=500,
)

In [18]:
import numpy as np

# Regular 4-D lattice: 11 evenly spaced values (0, 1, ..., 10) along each of the
# four feature axes, flattened to one row per lattice point.
axis_values = np.linspace(0, 10, 11)
grids = np.meshgrid(*[axis_values] * 4)
lattice_points = np.stack(grids, axis=-1).reshape(-1, 4)

# Ask the fitted model which cluster each lattice point falls into. The ids are
# then turned into strings, which the next cell's color_discrete_map keys match.
df_lattice = pd.DataFrame(lattice_points, columns=iris["feature_names"])
df_lattice["predicted_cluster"] = kmeans.predict(df_lattice)
df_lattice = df_lattice.astype({"predicted_cluster": str})
df_lattice

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),predicted_cluster
0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,1.0,1
2,0.0,0.0,0.0,2.0,1
3,0.0,0.0,0.0,3.0,1
4,0.0,0.0,0.0,4.0,1
...,...,...,...,...,...
14636,10.0,10.0,10.0,6.0,0
14637,10.0,10.0,10.0,7.0,0
14638,10.0,10.0,10.0,8.0,0
14639,10.0,10.0,10.0,9.0,0


In [19]:
# Animated 3D view of the lattice: three features on the axes, one animation
# frame per petal-width value, points coloured by the predicted cluster.
cluster_colors = {"0": "blue", "1": "red", "2": "green"}

fig = px.scatter_3d(
    data_frame=df_lattice,
    x="sepal length (cm)",
    y="petal length (cm)",
    z="sepal width (cm)",
    animation_frame="petal width (cm)",
    color="predicted_cluster",
    color_discrete_map=cluster_colors,
    # NOTE(review): no `symbol=` column is passed and these keys are ints while
    # predicted_cluster holds strings -- looks like a leftover from the
    # training-set plot; confirm whether it can be dropped.
    symbol_map={0: "cross", 1: "square", 2: "circle"},
    height=500,
)

# update_traces returns the figure, so the cell still renders it.
fig.update_traces(marker={"size": 10})

In [20]:
# Persist the held-out samples and the fitted model under data/.
# Neither DataFrame.to_csv nor open() creates missing directories, so make sure
# the target folder exists before writing (a fresh checkout may not have it).
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)

# index=False: the original row index from the split is not written out.
X_test.to_csv(data_dir / "X_test.csv", index=False)
y_test.to_csv(data_dir / "y_test.csv", index=False)

with open(data_dir / "kmeans.pickle", "wb") as wfile:
    pickle.dump(kmeans, wfile)


## Bonus tasks:
1. Try running `deploy.py` on your own

---
2. Go back to cell 5 (the one that fits `KMeans`) and try the other clustering algorithms provided by scikit-learn. See what difference they make. You can find all of them here:
    
    https://scikit-learn.org/stable/modules/clustering.html

---
3. Apart from the model, two .csv files are saved as well. Try modifying `deploy.py` so that you can pick a specific specimen from a .csv file using a dropdown, and include in the returned text which species it actually was.
    
    https://www.gradio.app/docs

---
4. You can share your gradio frontend by changing

    ```
    demo.launch()
    ```
    to 
    ```
    demo.launch(share=True)
    ```
    in `deploy.py`. You can then send the public link to anyone so that they can view your app.
    The link only works as long as you keep your app running on your PC.