In [11]:
# Install a blanket "ignore" filter so warnings are suppressed for the rest of
# the session. NOTE(review): this also hides deprecation notices raised by the
# libraries imported below, so consider narrowing it by category.
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import scipy.stats as stats
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns

import wrangle_zillow

In [12]:
# Join every measurement row to its matching species row on `species_id`.
sql_query = """
    SELECT *
FROM measurements
JOIN species USING(species_id);
"""

# The project helper supplies the connection for the `iris_db` database.
iris_connection = wrangle_zillow.get_connection('iris_db')
df = pd.read_sql(sql_query, con=iris_connection)

In [13]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [14]:
# Run the project's summary helper on the frame. Judging by the output below it
# prints the shape, an `info()`-style listing of dtypes and non-null counts, and
# per-column descriptions -- TODO confirm against wrangle_zillow.overview.
wrangle_zillow.overview(df)

--- Shape: (150, 7)
--- Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   species_id      150 non-null    int64  
 1   measurement_id  150 non-null    int64  
 2   sepal_length    150 non-null    float64
 3   sepal_width     150 non-null    float64
 4   petal_length    150 non-null    float64
 5   petal_width     150 non-null    float64
 6   species_name    150 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 8.3+ KB
--- Column Descriptions
        species_id  measurement_id  sepal_length  sepal_width  petal_length  \
count   150.000000      150.000000    150.000000   150.000000    150.000000   
unique         NaN             NaN           NaN          NaN           NaN   
top            NaN             NaN           NaN          NaN           NaN   
freq           NaN             NaN           NaN          Na

In [15]:
# Keep only the two petal measurements as the feature matrix.
feature_columns = ['petal_length', 'petal_width']
X_train = df[feature_columns]
X_train.head(n=5)

Unnamed: 0,petal_length,petal_width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2


In [17]:
# K-means starts from randomly initialised centroids, so without a seed the
# arbitrary cluster ids (0/1/2) can be permuted from one kernel run to the
# next. Pinning `random_state` keeps the labels stable under Restart & Run All.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train)

# Assign every training row to its nearest fitted centroid.
clusters = kmeans.predict(X_train)
clusters

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [18]:
# Attach each row's cluster label as a `cluster` column (re-running this cell
# simply overwrites the column with the current labels).
df = df.assign(cluster=clusters)
df.head(n=5)

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name,cluster
0,1,1,5.1,3.5,1.4,0.2,setosa,0
1,1,2,4.9,3.0,1.4,0.2,setosa,0
2,1,3,4.7,3.2,1.3,0.2,setosa,0
3,1,4,4.6,3.1,1.5,0.2,setosa,0
4,1,5,5.0,3.6,1.4,0.2,setosa,0
