# Plants Clustering Analysis

A look at which plants occur in which states and provinces of North America, as groundwork for clustering them by geographic distribution.

## Data Source

- [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Plants)
- USDA, NRCS. 2008. The PLANTS Database ([Web Link](http://plants.usda.gov/), 31 December 2008). National Plant Data Center, Baton Rouge, LA 70874-4490 USA.

In [15]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

In [45]:
# Import data
# Only the first field of each record is loaded; it is labelled `latin_name`.
plants = (
    pd.read_csv("./Data/plants.data", header=None, usecols=[0])
    .rename(columns={0: "latin_name"})
)

plants.head()

Unnamed: 0,latin_name
0,abelia
1,abelia x grandiflora
2,abelmoschus
3,abelmoschus esculentus
4,abelmoschus moschatus


In [46]:
# Dimensions of the loaded frame as (rows, columns)
plants.shape

(34781, 1)

In [47]:
# Extract abbreviations to use as column names
# The first field of every non-empty record in abbrv.txt becomes a column name.
# `newline=''` is what the csv module expects of files handed to csv.reader,
# and the `if row` guard skips blank lines, which csv.reader yields as empty
# lists (indexing them with [0] would raise IndexError).
with open('./Data/abbrv.txt', 'r', newline='') as f:
    cols = [row[0] for row in csv.reader(f) if row]

print(cols)

['ab', 'ak', 'ar', 'az', 'ca', 'co', 'ct', 'de', 'dc', 'fl', 'ga', 'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj', 'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'pr', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'vi', 'wa', 'wv', 'wi', 'wy', 'al', 'bc', 'mb', 'nb', 'lb', 'nf', 'nt', 'ns', 'nu', 'on', 'pe', 'qc', 'sk', 'yt', 'dengl', 'fraspm']


In [48]:
# Add empty columns for each State/Province to plants df
# One NaN-filled column per abbreviation in `cols`, then the rows are indexed
# by plant name so later cells can address cells as (plant, region).
plants = (
    plants
    .assign(**{abbreviation: np.nan for abbreviation in cols})
    .set_index('latin_name')
)
plants.head()

Unnamed: 0_level_0,ab,ak,ar,az,ca,co,ct,de,dc,fl,...,nt,ns,nu,on,pe,qc,sk,yt,dengl,fraspm
latin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abelia,,,,,,,,,,,...,,,,,,,,,,
abelia x grandiflora,,,,,,,,,,,...,,,,,,,,,,
abelmoschus,,,,,,,,,,,...,,,,,,,,,,
abelmoschus esculentus,,,,,,,,,,,...,,,,,,,,,,
abelmoschus moschatus,,,,,,,,,,,...,,,,,,,,,,


In [49]:
# Extract State/Province for each plant
# Read every (plant, region) pair up front, then write to the frame once per
# region instead of once per pair: a scalar `.loc` assignment repeated for
# every plant/region combination in the file is very slow.
with open('./Data/plants.data', 'r', newline='') as f:
    occurrences = pd.DataFrame(
        [(row[0], state) for row in csv.reader(f) for state in row[1:]],
        columns=['latin_name', 'state'],
    )

# Scalar label-based assignment appends a row for any plant that is missing
# from the index; keep that behaviour explicitly so no occurrence is lost.
is_known = occurrences['latin_name'].isin(plants.index)
new_names = occurrences.loc[~is_known, 'latin_name'].drop_duplicates()
if not new_names.empty:
    new_rows = pd.DataFrame(
        np.nan,
        index=pd.Index(new_names, name=plants.index.name),
        columns=plants.columns,
    )
    plants = pd.concat([plants, new_rows])

# A region that is not a column yet (i.e. one absent from the abbreviation
# list) is added by the assignment itself, in order of first appearance in
# the file. Cells that are never assigned stay NaN.
for state, names in occurrences.groupby('state', sort=False)['latin_name']:
    plants.loc[names.values, state] = 1

In [50]:
# Preview the first rows now that occurrences have been marked with 1
plants.head()

Unnamed: 0_level_0,ab,ak,ar,az,ca,co,ct,de,dc,fl,...,ns,nu,on,pe,qc,sk,yt,dengl,fraspm,gl
latin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abelia,,,,,,,,,,1.0,...,,,,,,,,,,
abelia x grandiflora,,,,,,,,,,1.0,...,,,,,,,,,,
abelmoschus,,,,,,,1.0,,1.0,1.0,...,,,,,,,,,,
abelmoschus esculentus,,,,,,,1.0,,1.0,1.0,...,,,,,,,,,,
abelmoschus moschatus,,,,,,,,,,,...,,,,,,,,,,


In [54]:
# Drop non-North American columns (dengl, fraspm, gl)
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell re-runnable: on a second execution the columns are already
# gone and a strict in-place drop would raise KeyError.
plants = plants.drop(['dengl', 'fraspm', 'gl'], axis=1, errors='ignore')

# Drop rows with all missing values (plants not in NA)
plants = plants.dropna(how='all')

plants.shape

(34724, 67)

In [55]:
# Fill NaNs with zeroes
# Cells that were never marked with 1 are still NaN; encode them as 0 so that
# every cell holds a number.
plants = plants.fillna(value=0)
plants.head()

Unnamed: 0_level_0,ab,ak,ar,az,ca,co,ct,de,dc,fl,...,lb,nf,nt,ns,nu,on,pe,qc,sk,yt
latin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abelia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abelia x grandiflora,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abelmoschus,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abelmoschus esculentus,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abelmoschus moschatus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
