In [8]:
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py
# https://stats.stackexchange.com/questions/183236/what-is-the-relation-between-k-means-clustering-and-pca
# https://realpython.com/k-means-clustering-python/

#-Huokostila%=(Ds-Db/Ds)*100
#-WC10 = kenttäkapasiteetin (–10 kPa matriisipotentiaalissa) vesipit. til.%
#-AFP10 =kenttäkapsiteetin ilmatila = kok.tilavuus-WC10 til.%
#-WC1500 = lakastumisrajan vesipit.til.%

# also add:
# org+clay -ratio 

# think also should 100 kPa > 10 kPa and 0.3 < 1 kPa as the lowest and higest values are measured in different methods 

In [33]:
from MaaTi_io import categorical_vars

# read data from csv-file
raw_data = pd.read_csv('wrc-calculated_BioSoil_Maati.csv')
raw_data.set_index(raw_data['site_id'])
data = raw_data.dropna()

# select columns used in clustering

cols = [
    #'depth',
    'lat_N',  # continuous
    'lon_E',  # continuos
    'altitude',  # continuous
    'soil_type',  # catergorical: 3: till; 4: sorted
    'grain_size',  # categorical: 1=clay/silt (savi, hiesu, hieno hieta); 2=silt/sand (karkea hieta, hieno hiekka); 3=sand/gravel (karkea hiekka, sora)  
    'orglayer_thickness',  # continuous
    'topography',  # categorical: 0=flat; 1=hilltop/uphill; 2=hillside; 3=downhill; 4=hollow/depression; 5=other
    'slope',  # continuous
    'hydraulic_regime',  # categorical: 1=barren heath; 2=sub-xeric; 3=xeric; 4=mesic; 5=?; 6=hydric
    'peat_forming_mosses',  # continuos
    'sitetype',  # 1=herb-rich; 2=?; 3=mesic; 4=xeric; 5=subxeric; 6=barren heath; 7=rocky or sandy areas; 8=hilltops
    'clay',
    'silt',
    'sand',
    'gravel',
    'dry_density',  # continuous
    'bulk_density',  # continuous  
    'organic_content'  # continuous
]

continuous_vars = [
    #'depth',
    'lat_N',  
    'lon_E',
    'altitude',
    'orglayer_thickness',
    'slope',
    'peat_forming_mosses',
    'clay',
    'silt',
    'sand',
    'gravel',
    'dry_density', 
    'bulk_density',
    'organic_content'
]

# preprocess categorical variables
data.replace(categorical_vars, inplace=True)
for var in categorical_vars:
    if var in data:
        data[var] = data[var].astype(dtype='category')
        
vg2par = [
    'alpha_vg2',
    'n_vg2',
    'theta-s',
    'theta-ratio'
]

vg3par = [
    'alpha_vg3',
    'n_vg3',
    'theta-r_vg3',
    'theta-s',
    'theta-ratio'
]

cols_vg2par = cols + vg2par
cols_vg3par = cols + vg3par

data_vg2 = data[cols_vg2par]
data_vg3 = data[cols_vg3par]

# row labels
labels = data['site_id'].values

data_vg2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125 entries, 24 to 238
Data columns (total 22 columns):
lat_N                  125 non-null int64
lon_E                  125 non-null int64
altitude               125 non-null int64
soil_type              125 non-null category
grain_size             125 non-null category
orglayer_thickness     125 non-null float64
topography             125 non-null category
slope                  125 non-null int64
hydraulic_regime       125 non-null category
peat_forming_mosses    125 non-null float64
sitetype               125 non-null category
clay                   125 non-null float64
silt                   125 non-null float64
sand                   125 non-null float64
gravel                 125 non-null float64
dry_density            125 non-null float64
bulk_density           125 non-null float64
organic_content        125 non-null float64
alpha_vg2              125 non-null float64
n_vg2                  125 non-null float64
theta-s          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regex=regex,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
# Preprocessing pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def kmeans_biosoil(n_clusters, name, data):

    column_transformer = ColumnTransformer(
        [('category', OneHotEncoder, list(categorical_vars.keys())),
        ('float', MinMaxScaler, continuous_vars)]
    )
    
    column_transformer.fit_transform(data_vg2)
    
    pca = PCA(n_components=n_clusters).fit(data_vg2)
    kmeans = KMeans(init=pca.components_, n_clusters=n_clusters, n_init=1)
    
    estimator = make_pipeline(PCA(), kmeans).fit()
    
    results = [name, fit_time, estimator[-1].inertia_]
    
    clustering_metrics = [
        metrics.calinski_harabasz_score,
        metrics.davies_bouldin_score
        #metrics.silhouette_score,
    ]
    
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]
    results += [
        metrics.silhouette_score(data, estimator[-1].labels_,
                                metric="euclidean", sample_size=300,)
    ]
    
    # Show the results
    formatter_results = ("{:9s}\t{:.3f}s\t{:.3f}\t{:.3f}")
    print(formatter_results.format(*results))
    

print('init\t\tinertia\tch-score\tdb-score\tsilhouette')
for n_clusters in range(2, 11):
    kmeans_biosoil(n_clusters, name="{}".format(n_clusters), data=data_vg2)
    


init		inertia	ch-score	db-score	silhouette


TypeError: Cannot clone object '<class 'sklearn.preprocessing._encoders.OneHotEncoder'>' (type <class 'type'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' methods.

In [39]:
# Define our evaluation benchmark
# comparison of different initialization methods for KMeans

# Benchmark will:
# - create a pipeline which scale the data using a StandardScaler
# - train and time the pipeline fitting
# - measure the performance of the clustering obtained via different metrics

from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def bench_kmeans(kmeans, name, data, labels):
    """ Benchmark to evaluete the KMeans initialization methods.
    
    Args:
        kmeans: KMeans instance
        name: name given to the strategy
        data: ndarray of shape (nsamples,) the labels used to compute the clustering metrics which requires some supervision 
    
    """
    
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]
    
    # Define the metrics which require only the true labels and estimator
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]
    
    # The silhouette score requires the full dataset
    results += [
        metrics.silhouette_score(data, estimator[-1].labels_,
                                metric="euclidean", sample_size=300,)
    ]
    
    # Show the results
    formatter_results = ("{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}"
                        "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}")
    print(formatter_results.format(*results))

In [40]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

for n_clusters in range(2, 11):

    pca = PCA(n_components=n_clusters).fit(data_vg2)
    kmeans = KMeans(init=pca.components_, n_clusters=n_clusters, n_init=1)
    bench_kmeans(kmeans=kmeans, name="{}".format(n_clusters), data=data_vg2, labels=labels)


init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
2        	0.008s	2024	0.156	0.886	0.266	0.021	0.091	0.282
3        	0.007s	1725	0.246	0.944	0.391	0.037	0.137	0.077
4        	0.007s	1573	0.282	0.846	0.422	0.041	0.128	0.041
5        	0.004s	1487	0.336	0.879	0.486	0.057	0.156	0.053
6        	0.005s	1318	0.351	0.818	0.491	0.057	0.137	-0.059
7        	0.005s	1225	0.401	0.849	0.545	0.076	0.165	-0.075
8        	0.004s	1113	0.435	0.849	0.576	0.090	0.174	-0.093
9        	0.005s	1042	0.470	0.866	0.609	0.106	0.194	-0.149
10       	0.005s	1024	0.497	0.878	0.635	0.121	0.211	-0.111


