In this notebook, we are going to see an example of Clustering (K-means) and dimensionality reduction (PCA) on a water quality dataset from MM's Case dell'acqua.

In [299]:
import os
import pandas as pd
import numpy as np
import random

from IPython.display import display, HTML

In [300]:
# for reproducibility: seed both Python's `random` module and NumPy's global RNG
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [301]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides diagnostics from pandas / SciPy / scikit-learn
# raised by later cells — consider narrowing the filter.
from warnings import filterwarnings
filterwarnings('ignore')

# Load Data

In [302]:
# directory (relative to the notebook) that holds the input files
data_path = 'data'

In [303]:
# read the grab-sample workbook into a DataFrame
grab_file = os.path.join(data_path, 'grab.xlsx')
grab_df = pd.read_excel(grab_file)

In [None]:
# The dataset has two columns for each parameter: one holding the measurement
# and one (suffixed "_label") describing that measurement, which can be:
# - "Normal": the measurement has a valid value
# - "Less than": the measurement is less than the LOQ (limit of quantification)
# - "NaN": the measurement is missing

grab_df

In [305]:
# The label columns carry the Italian parameter names, so we map every Italian
# name to its English equivalent.
# `feature_mapping` lists the input parameters, `targets_mapping` the target ones.
feature_mapping = {
    "Cloro residuo libero (al prelievo) (mg/L di Cl2)": "Free Chlorine (mg/L)",
    "Colore (Cu)": "Color (CU)",
    "Concentrazione ioni idrogeno (unità pH)": "pH",
    "Conduttività a 20°C (µS/cm)": "Conductivity (uS/cm)",
    "TOC - carbonio organico totale (mg/L di C)": "TOC (mg/L)",
    "Temperatura (al prelievo) (°C)": "Temperature (°C)",
    "Nitrati (mg/L)": "Nitrate (mg/L)",
}

targets_mapping = {
    "Batteri coliformi a 37°C (MPN/100 mL)": "Coliforms (MPN/100mL)",
    "Bromodiclorometano (µg/L)": "Bromodichloromethane (µg/L)",
    "Bromoformio (µg/L)": "Bromoform (µg/L)",
    "Cloroformio (µg/L)": "Chloroform (µg/L)",
    "Conta delle colonie a 22°C (UFC/mL)": "Colony count at 22°C (UFC/mL)",
    "Conteggio colonie a 30°C (UFC/mL)": "Colony count at 30°C (UFC/mL)",
    "Conta delle colonie a 37°C (UFC/mL)": "Colony count at 37°C (UFC/mL)",
    "Dibromoclorometano (µg/L)": "Dibromochloromethane (µg/L)",
    "Enterococchi (MPN/100 mL)": "Enterococci (MPN/100mL)",
    "Escherichia coli (MPN/100 mL)": "Escherichia coli (MPN/100mL)",
    "Pseudomonas aeruginosa (UFC/250 mL)": "Pseudomonas aeruginosa (UFC/250mL)",
    "Acido Perfluoroottanoico PFOA (µg/L)": "Perfluorooctanoic acid PFOA (µg/L)",
    "Acido Perfluoroottansolfonico PFOS (µg/L)": "Perfluorooctanesulfonic acid PFOS (µg/L)",
    "Somma di PFAS (µg/L)": "Sum of PFAS (µg/L)",
}

In [306]:
from operator import contains

# Translate the "<Italian name>_<suffix>" label columns to "<English name>_<suffix>".
# Feature names take precedence over target names, as in a sequential lookup.
translations = {**targets_mapping, **feature_mapping}

renamed_labels = {}
for column in grab_df.columns:
    if "label" not in column:
        continue
    parts = column.split("_")
    if parts[0] in translations:
        renamed_labels[column] = translations[parts[0]] + "_" + parts[1]

grab_df.rename(columns=renamed_labels, inplace=True)

In [307]:
# the target variables that we are going to use are the four trihalomethanes (THMs);
# the names below are the English values defined in `targets_mapping`
thms_columns = [
    'Bromodichloromethane (µg/L)',
    'Bromoform (µg/L)',
    'Chloroform (µg/L)',
    'Dibromochloromethane (µg/L)'
]

# Data Summary

In [308]:
# Empty summary table: one row per supply-point code, and for every parameter
# (features first, then targets) one column per statistic.
summary_parameters = list(feature_mapping.values()) + list(targets_mapping.values())
summary_statistics = [
    "N° Entries",
    "N° Valid Samples",
    "% Missing",
    "N° < LOQ",
    "Mean Valid",
    "Std Valid",
    "LOQ values",
    "Start Date",
    "End Date",
]

summary_columns = pd.MultiIndex.from_product([summary_parameters, summary_statistics])
info_df = pd.DataFrame(columns=summary_columns, index=grab_df["Code"].unique())

In [309]:
# Compute, for every supply point (code) and every parameter, the summary
# statistics stored in `info_df`.
all_parameters = list(feature_mapping.values()) + list(targets_mapping.values())

for code in grab_df["Code"].unique():
    code_df = grab_df[grab_df["Code"] == code]

    for feature in all_parameters:
        label_column = feature + "_label"

        df = code_df[["DateTime", feature, label_column]].copy()
        df["DateTime"] = pd.to_datetime(df["DateTime"])

        # rows where timestamp, measurement and label are all present
        observed = df.dropna()
        if observed.empty:
            continue

        first_timestamp = observed["DateTime"].min()
        last_timestamp = observed["DateTime"].max()

        # Keep only the rows between the first and the last observed day.
        # The bounds are real timestamps and the upper bound is the *end* of the
        # last day: comparing against a "%Y-%m-%d" string would cut the window at
        # midnight and drop samples taken later on that day.
        window_start = first_timestamp.normalize()
        window_end = last_timestamp.normalize() + pd.Timedelta(days=1)
        df = df[(df["DateTime"] >= window_start) & (df["DateTime"] < window_end)]

        missing_values = int(df[label_column].isna().sum()) / df.shape[0] * 100

        valid_df = df[df[label_column] == "Normal"]
        loq_df = df[df[label_column] == "Less than"]

        info_df.loc[code, (feature, "N° Entries")] = df.shape[0]
        info_df.loc[code, (feature, "N° Valid Samples")] = valid_df.shape[0]
        info_df.loc[code, (feature, "% Missing")] = round(missing_values, 2)
        info_df.loc[code, (feature, "N° < LOQ")] = loq_df.shape[0]

        info_df.loc[code, (feature, "Mean Valid")] = round(valid_df[feature].mean(), 2)
        info_df.loc[code, (feature, "Std Valid")] = round(valid_df[feature].std(), 2)

        # NOTE(review): doubling assumes rows labelled "Less than" store LOQ/2,
        # whereas `replace_loq` halves the same raw values (i.e. assumes they
        # store the LOQ itself) — confirm which convention the data follows.
        loq_values = loq_df[feature].unique() * 2
        info_df.loc[code, (feature, "LOQ values")] = ", ".join(
            str(value) for value in loq_values
        )

        info_df.loc[code, (feature, "Start Date")] = first_timestamp.strftime("%Y-%m-%d")
        info_df.loc[code, (feature, "End Date")] = last_timestamp.strftime("%Y-%m-%d")

In [310]:
# order the rows by supply-point code
info_df = info_df.sort_index()

In [311]:
# sort the parameters (first column level) case-insensitively while keeping the
# statistics (second level) in their original order
def _lowercase_labels(labels):
    """Return the labels lower-cased so that the sort ignores case."""
    return labels.str.lower()


info_df = info_df.sort_index(
    axis=1,
    level=0,
    sort_remaining=False,
    key=_lowercase_labels,
)

In [None]:
# render the whole summary table as HTML
summary_html = info_df.to_html()
display(HTML(summary_html))

# Data Imputation

We can see that there are parameters that have a lot of missing values, and some of them have a lot of <LOQ values.

We are going to perform two types of imputation:
- for the measurements reported as below the LOQ (limit of quantification), we will replace the value with LOQ/2
- for the parameters that have NaN values, we will impute them through [MICE](https://miceforest.readthedocs.io/en/latest/) (Multiple Imputation by Chained Equations). 

MICE ‘fills in’ (imputes) missing data in a dataset through an iterative series of predictive models. In each iteration, each specified variable in the dataset is imputed using the other variables in the dataset. These iterations should be run until it appears that convergence has been met.

Keep in mind that we can only impute the input variables, not the target ones because we would introduce bias and misinformation in the dataset.


In [313]:
# define a function to replace the values flagged as below the LOQ with half their value
def replace_loq(row, column):
    """Return ``row[column]``, halved when ``row[column + "_label"]`` is "Less than".

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a DataFrame row).
    column : str
        Name of the measurement column; its label is read from ``column + "_label"``.

    Returns
    -------
    The stored value, or half of it when the label equals "Less than".
    """
    below_loq = row[column + "_label"] == "Less than"
    value = row[column]
    if below_loq:
        return value / 2
    return value

In [314]:
# collect every column whose name contains 'label'
label_columns = list(filter(lambda name: 'label' in name, grab_df.columns))

In [315]:
# halve the below-LOQ values of every measurement column
# (everything except the identifiers and the label columns)
measurement_columns = grab_df.columns.difference(["Code", "DateTime"] + label_columns)

for column in measurement_columns:
    # `args` passes the column name to `replace_loq` after the row itself
    grab_df[column] = grab_df.apply(replace_loq, axis=1, args=(column,))

In [316]:
# the labels are no longer needed once the below-LOQ values have been replaced
grab_df = grab_df.drop(label_columns, axis=1)

In [317]:
# keep only the identifiers, the input features and the THM targets
selected_columns = ["DateTime", "Code"]
selected_columns += list(feature_mapping.values())
selected_columns += thms_columns

grab_df = grab_df[selected_columns]

In [None]:
# inspect the frame after the LOQ replacement and the column selection
grab_df

Now that the below-LOQ measurements have been replaced with usable values, we can impute the remaining missing values.

In [319]:
# the imputation works on the input features plus the two identifier columns
imputation_columns = list(feature_mapping.values()) + ['DateTime', 'Code']
input_df = grab_df[imputation_columns].copy()

In [320]:
# Check whether any supply point has rows whose feature measurements are all NaN.
# Only the feature columns are inspected: 'Code' is never null in rows selected
# by their code, so including it would make the all-NaN test impossible to satisfy.
feature_columns = list(feature_mapping.values())

for code in grab_df["Code"].unique():
    code_features = input_df.loc[input_df['Code'] == code, feature_columns]
    empty_rows = code_features.isnull().all(axis=1).sum()
    if empty_rows > 0:  # Rows with all NaN
        print(f'{code} has {empty_rows} rows with all NaN')

No rows with all NaN

In [321]:
# report every supply point that has at least one column without any value
for code in grab_df["Code"].unique():
    code_rows = input_df[input_df['Code'] == code]
    empty_columns = code_rows.isnull().all(axis=0).sum()
    if empty_columns > 0:  # Columns with all NaN
        print(f'{code} has {empty_columns} columns with all NaN')

No columns with all NaN

In [322]:
# The imputation needs a numeric DateTime and a categorical Code, on a fresh
# 0..n-1 index.
input_df = input_df.assign(
    DateTime=pd.to_numeric(input_df['DateTime']),
    Code=input_df['Code'].astype('category'),
).reset_index(drop=True)

In [323]:
import miceforest as mf

In [None]:
# Impute every column except the two identifiers; DateTime and Code stay in the
# data passed to the kernel.
imputed_variables = input_df.columns.difference(['DateTime', 'Code']).to_list()

# create a kernel dataset, able to perform MICE on itself
kernel = mf.ImputationKernel(
    data=input_df,
    variable_schema=imputed_variables,
    random_state=42,
    mean_match_strategy='shap',
)

# run 5 MICE iterations
kernel.mice(5, verbose=True)

In [325]:
# get the completed (imputed) dataset number 0 from the kernel
completed_dataset = kernel.complete_data(dataset=0)

In [None]:
# inspect the dataset returned by the imputation kernel
completed_dataset

In [None]:
# check if the distribution of the imputed values is similar to the distribution of the true values
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, entropy

for feature in feature_mapping.values():
    imputed_values = completed_dataset[feature]
    # the original column still contains NaN where nothing was measured,
    # so those entries are removed before fitting a density
    observed_values = grab_df[feature].dropna()

    sns.histplot(imputed_values, label='Imputed', kde=True, stat='density')
    sns.histplot(grab_df[feature], label='True', kde=True, stat='density')

    # fit a kernel density estimate to each sample
    kde_completed = gaussian_kde(imputed_values)
    kde_grab = gaussian_kde(observed_values)

    # evaluate both densities on a common grid spanning the two samples
    grid_start = min(imputed_values.min(), observed_values.min())
    grid_stop = max(imputed_values.max(), observed_values.max())
    x_vals = np.linspace(grid_start, grid_stop, 1000)
    kde_completed_vals = kde_completed.pdf(x_vals)
    kde_grab_vals = kde_grab.pdf(x_vals)

    # turn the sampled densities into discrete probability distributions
    kde_completed_vals = kde_completed_vals / kde_completed_vals.sum()
    kde_grab_vals = kde_grab_vals / kde_grab_vals.sum()

    # KL divergence of the imputed distribution from the observed one
    kl_div_value = entropy(kde_completed_vals, kde_grab_vals)

    plt.title(f'{feature} - KL Divergence: {kl_div_value:.2f}')
    plt.legend()
    plt.show()



All the distributions look similar. As a rule of thumb, if the KL divergence is less than 0.5, the distributions are considered similar.

In [328]:
# add the (non-imputed) THM target columns back to the imputed features;
# the assignment aligns the two frames on their index labels
completed_dataset[thms_columns] = grab_df[thms_columns]

In [329]:
# from here on `grab_df` refers to the imputed dataset (same object as `completed_dataset`)
grab_df = completed_dataset

In [None]:
# inspect the imputed features together with the THM columns
grab_df

In [None]:
# Unfortunately, there are missing values in the THM columns; the targets are not
# imputed, so the rows where any of them is missing are dropped (in place)
grab_df.dropna(subset=thms_columns, inplace=True)
grab_df.shape

In [332]:
# renumber the remaining rows 0..n-1 so that index labels and row positions coincide
grab_df.reset_index(drop=True, inplace=True)

In [None]:
# preview the first rows of the final dataset
grab_df.head()

We now have a small amount of data, so it would be useful to cluster data points coming from different drinking water supply points.

# Clustering

We are going to use two clustering algorithms to cluster the data points coming from different supply points.

The first version is the hierarchical clustering, where we use the gower distance to cluster the data points.

The second version is constrained K-means, where we add the constraint that the data points coming from the same supply point must be in the same cluster.

Since K-means requires the number of clusters to be specified, we will use the following metrics to find the optimal number of clusters:
- intra-cluster variance weighted by the number of points in the cluster (the lower the better);
- silhouette score: how similar an object is to its own cluster (cohesion) compared to other clusters (separation) (the higher the better);
- davies-bouldin index: measures the average similarity between each cluster and the most similar cluster (the lower the better).

## Hierarchical Clustering

In [334]:
from scipy.cluster.hierarchy import linkage, dendrogram
from gower import gower_matrix

In [335]:
# remove the datetime column as it is not a feature and could wrongly bias the clustering.
# `.copy()` gives an independent frame, so the assignment below does not write
# into a slice of `grab_df` (chained assignment).
df = grab_df[grab_df.columns.difference(['DateTime'])].copy()
# store the code as plain strings
df["Code"] = df["Code"].astype('str')

In [None]:
from scipy.spatial.distance import squareform

# pairwise Gower distances between all rows (square matrix)
distance_matrix = gower_matrix(df)

# `linkage` interprets a 2-D array as raw observation vectors, so a precomputed
# distance matrix must be passed in condensed (1-D, upper-triangular) form.
# `checks=False` skips the exact symmetry / zero-diagonal validation.
condensed_distances = squareform(distance_matrix, checks=False)

# Perform hierarchical clustering
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; with Gower distances 'average' or 'complete' may be more appropriate.
linkage_matrix = linkage(condensed_distances, method='ward')

# Plot dendrogram
plt.figure(figsize=(8, 5))
dendrogram(linkage_matrix, labels=df.index.to_list(), leaf_rotation=90)

plt.title("Hierarchical Clustering with Gower's Distance")
plt.show()

Let's find the optimal number of clusters by looking at the silhouette score.

In [337]:
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score
# silhouette score of the dendrogram cut into k = 2..9 flat clusters
scores = []

for k in range(2, 10):
    clusters = fcluster(linkage_matrix, t=k, criterion='maxclust')
    scores.append(silhouette_score(distance_matrix, clusters, metric='precomputed'))

In [None]:
# silhouette score against the number of flat clusters
cluster_counts = range(2, 10)
plt.plot(cluster_counts, scores, label='Silhouette Score')

plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Scores')
plt.legend()

plt.show()

Let's see if the clusters are coherent with the DW supply points.

In [339]:
# NOTE(review): these row indexes are hard-coded — presumably read off the
# dendrogram; they must be updated whenever the data or the linkage changes.
cluster0_indexes = [4, 6, 18, 42]
# every other row goes to the second cluster
cluster1_indexes = grab_df.index.difference(cluster0_indexes)

In [340]:
# positional selection: valid because `grab_df` was re-indexed 0..n-1 earlier,
# so index labels and row positions coincide
cluster0_df = grab_df.iloc[cluster0_indexes]
cluster1_df = grab_df.iloc[cluster1_indexes]

In [None]:
# inspect the rows assigned to the first cluster
cluster0_df

We can see that there are some points coming from the same DW supply point in both clusters, which is not what we want.

So we need to find a way to cluster the data points coming from the same DW supply point in the same cluster.

One way to do this is to use constrained K-means (COP K-means), where we add the constraint that the data points coming from the same DW supply point must be in the same cluster.

## K-means

In [342]:
from sklearn.cluster import KMeans

In [343]:
# we also need to drop the code column, as k-means is not able to handle categorical variables;
# the remaining columns are the input features and the THM columns
df = grab_df[grab_df.columns.difference(['DateTime', 'Code'])]

In [344]:
from sklearn.preprocessing import MinMaxScaler

# rescale every column to [0, 1] so that no variable dominates the distances
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)

In [345]:
from sklearn.metrics import davies_bouldin_score

# Evaluate K-means for 2..9 clusters with three metrics: size-weighted
# intra-cluster variance, silhouette score and Davies-Bouldin index.
variances = []
sil_scores = []
db_scores = []

# define the number of points
total_points = df.shape[0]

# baseline: variance of the whole dataset, i.e. the case of 1 cluster
variances.append(np.var(df, axis=0).mean())

for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster, random_state=12)
    clusters = kmeans.fit_predict(df)

    sil_scores.append(silhouette_score(df, clusters))
    db_scores.append(davies_bouldin_score(df, clusters))

    # Intra-cluster variance weighted by cluster size. The weights
    # (cluster size / total points) already sum to 1, so the sum is the weighted
    # average; dividing again by the number of clusters would deflate the value
    # for larger k.
    variance = 0
    for cluster in np.unique(clusters):
        # boolean mask on the labels: `df` itself is left untouched
        cluster_df = df[clusters == cluster]
        variance += np.var(cluster_df, axis=0).mean() * cluster_df.shape[0] / total_points

    variances.append(variance)

In [None]:
# rescale every metric to [0, 1] so that the three curves share one axis
variances = scaler.fit_transform(np.array(variances).reshape(-1, 1)).flatten()
sil_scores = scaler.fit_transform(np.array(sil_scores).reshape(-1, 1)).flatten()
db_scores = scaler.fit_transform(np.array(db_scores).reshape(-1, 1)).flatten()

# the variance curve also includes the single-cluster baseline, hence it starts at 1
curves = [
    (range(1, 10), variances, 'Weighted Average Variance'),
    (range(2, 10), sil_scores, 'Silhouette Score'),
    (range(2, 10), db_scores, 'Davies Bouldin Score'),
]
for cluster_counts, values, curve_label in curves:
    plt.plot(cluster_counts, values, label=curve_label)

plt.xlabel('Number of clusters')
plt.ylabel('(Normalized) Scores')
plt.legend()

plt.show()

The optimal number of clusters is 2. Evaluate the clustering with similarity matrix inspection.

In [347]:
from seaborn import heatmap
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Work on the scaled variables only: a 'Cluster' column left behind by a previous
# run of this cell must neither be clustered on nor enter the distances.
features = df.drop(columns=['Cluster'], errors='ignore')

# same seed as in the scan over the number of clusters, so that this is the
# partition that was evaluated there
kmeans = KMeans(n_clusters=2, random_state=12)
kmeans.fit(features)

clusters = kmeans.predict(features)
df['Cluster'] = clusters

# Similarity from the distances between the data points. The cluster labels are
# excluded: including them would push points of different clusters apart and
# create the block structure that the plot is meant to verify.
distance_matrix = euclidean_distances(features)
similarity_matrix = 1 / (1 + distance_matrix)

# order rows and columns by cluster so that each cluster forms a diagonal block
sorted_indices = np.argsort(clusters)
sorted_matrix = similarity_matrix[sorted_indices, :][:, sorted_indices]

heatmap(sorted_matrix, cmap='viridis')
plt.show()


In [None]:
# Reference plot: the same similarity matrix with the rows in random order.
# The 'Cluster' label column (if present) is excluded, so the similarities
# depend on the data only.
features = df.drop(columns=['Cluster'], errors='ignore')

distance_matrix = euclidean_distances(features)
similarity_matrix = 1 / (1 + distance_matrix)

# random permutation of the rows (despite the name, this is not a sort)
sorted_indices = np.random.permutation(df.index)
sorted_matrix = similarity_matrix[sorted_indices, :][:, sorted_indices]

heatmap(sorted_matrix, cmap='viridis')
plt.show()


## COP K-means

In [350]:
from copkmeans.cop_kmeans import cop_kmeans 

In [351]:
# work on a copy so that the scaling and the cluster labels added below do not alter `grab_df`
df = grab_df.copy()

In [352]:
# columns used for the clustering: the input features followed by the four THM columns
joint_columns = list(feature_mapping.values()) + thms_columns

In [353]:
# scale the data to [0, 1]
scaler = MinMaxScaler()
# Assign the scaled array directly: wrapping it in a new DataFrame would give it
# a default 0..n-1 index and the assignment would then align on index labels,
# silently producing NaN if `df` were not indexed 0..n-1.
df[joint_columns] = scaler.fit_transform(df[joint_columns])

In [354]:
from itertools import combinations

# Must-link constraints: one (index1, index2) pair for every two rows that share
# a supply-point code, meaning that the two points must end up in the same cluster.
must_link = [
    pair
    for code in df['Code'].unique()
    for pair in combinations(df[df['Code'] == code].index, 2)
]

# cop_kmeans works on a plain numpy array
np_df = df[joint_columns].to_numpy()

In [355]:
from sklearn.metrics import davies_bouldin_score

# Evaluate COP K-means for 2..9 clusters with three metrics: size-weighted
# intra-cluster variance, silhouette score and Davies-Bouldin index.
variances = []
sil_scores = []
db_scores = []

# define the number of points
total_points = df.shape[0]

# baseline: variance of the whole dataset, i.e. the case of 1 cluster
variances.append(np.var(np_df, axis=0).mean())

for n_cluster in range(2, 10):
    clusters, centers = cop_kmeans(np_df, n_cluster, ml=must_link)
    labels = np.asarray(clusters)

    sil_scores.append(silhouette_score(np_df, labels))
    db_scores.append(davies_bouldin_score(np_df, labels))

    # Intra-cluster variance weighted by cluster size. The weights
    # (cluster size / total points) already sum to 1, so the sum is the weighted
    # average; dividing again by the number of clusters would deflate the value
    # for larger k.
    variance = 0
    for cluster in np.unique(labels):
        # boolean mask on the labels: `df` itself is left untouched
        cluster_df = df.loc[labels == cluster, joint_columns]
        variance += np.var(cluster_df, axis=0).mean() * cluster_df.shape[0] / total_points

    variances.append(variance)
    

In [None]:
# rescale every metric to [0, 1] so that the three curves share one axis
variances = scaler.fit_transform(np.array(variances).reshape(-1, 1)).flatten()
sil_scores = scaler.fit_transform(np.array(sil_scores).reshape(-1, 1)).flatten()
db_scores = scaler.fit_transform(np.array(db_scores).reshape(-1, 1)).flatten()

# the variance curve also includes the single-cluster baseline, hence it starts at 1
curves = [
    (range(1, 10), variances, 'Weighted Average Variance'),
    (range(2, 10), sil_scores, 'Silhouette Score'),
    (range(2, 10), db_scores, 'Davies Bouldin Score'),
]
for cluster_counts, values, curve_label in curves:
    plt.plot(cluster_counts, values, label=curve_label)

plt.xlabel('Number of clusters')
plt.ylabel('(Normalized) Scores')
plt.legend()

plt.show()

The silhouette score is the highest for 3 clusters, the davies-bouldin score is the lowest for 3 clusters. The weighted average variance would suggest 4 clusters, but all together the best number of clusters is 3.

In [357]:
# NOTE(review): the discussion preceding this cell concludes that 3 clusters are
# best, while 2 is used here — confirm which value is intended.
n_clusters = 2

# must-link pairs: every two rows with the same supply-point code
must_link = [
    pair
    for code in df['Code'].unique()
    for pair in combinations(df[df['Code'] == code].index, 2)
]

np_df = df[joint_columns].to_numpy()

# final constrained clustering
clusters, centers = cop_kmeans(np_df, n_clusters, ml=must_link)

In [358]:
# store the cluster label of every row next to its data
df['Cluster'] = clusters

In [None]:
# list the supply-point codes that fall in each cluster
for cluster in df['Cluster'].unique():
    in_cluster = df['Cluster'] == cluster
    print(f'Cluster {cluster}')
    print(df.loc[in_cluster, 'Code'].unique().tolist())

In [None]:
# similarity between the points, computed on the clustered variables only
distance_matrix = euclidean_distances(df[joint_columns])
similarity_matrix = 1 / (1 + distance_matrix)

# reorder rows and columns by cluster so that each cluster forms a diagonal block
sorted_indices = np.argsort(df['Cluster'])
sorted_matrix = similarity_matrix[np.ix_(sorted_indices, sorted_indices)]

heatmap(sorted_matrix, cmap='viridis')
plt.show()

In order to have a better visualization, we can try to perform clustering on a subset of the variables in the dataset.

Alternatively, to lose less information, we can perform PCA on the dataset and then perform clustering on the principal components.

## COP K-means on Temperature, Free Chlorine and TTHMs

Let's try to perform clustering on the temperature, free chlorine and TTHMs.

In [361]:
# total trihalomethanes: row-wise sum of the four THM columns
grab_df['TTHMs'] = grab_df[thms_columns].sum(axis=1)

In [362]:
# the three variables used for this clustering
subset_columns = ['Temperature (°C)', 'Free Chlorine (mg/L)', 'TTHMs']

In [363]:
# independent copy with the three variables plus the code (needed for the must-link constraints)
df = grab_df[subset_columns + ['Code']].copy()


In [364]:
# scale the data to [0, 1]
scaler = MinMaxScaler()
# Assign the scaled array directly: wrapping it in a new DataFrame would give it
# a default 0..n-1 index and the assignment would then align on index labels,
# silently producing NaN if `df` were not indexed 0..n-1.
df[subset_columns] = scaler.fit_transform(df[subset_columns])

In [365]:
# must-link constraints: every pair of rows that shares a supply-point code
must_link = [
    pair
    for code in df['Code'].unique()
    for pair in combinations(df[df['Code'] == code].index, 2)
]

# cop_kmeans works on a plain numpy array
np_df = df[subset_columns].to_numpy()


In [366]:
# Evaluate COP K-means on the three selected variables for 2..9 clusters with
# three metrics: size-weighted intra-cluster variance, silhouette score and
# Davies-Bouldin index.
variances = []
sil_scores = []
db_scores = []

# define the number of points
total_points = df.shape[0]

# baseline: variance of the whole dataset, i.e. the case of 1 cluster
variances.append(np.var(np_df, axis=0).mean())

for n_cluster in range(2, 10):
    clusters, centers = cop_kmeans(np_df, n_cluster, ml=must_link)
    labels = np.asarray(clusters)

    sil_scores.append(silhouette_score(np_df, labels))
    db_scores.append(davies_bouldin_score(np_df, labels))

    # Intra-cluster variance weighted by cluster size. The weights
    # (cluster size / total points) already sum to 1, so the sum is the weighted
    # average; dividing again by the number of clusters would deflate the value
    # for larger k.
    variance = 0
    for cluster in np.unique(labels):
        # boolean mask on the labels: `df` itself is left untouched
        cluster_df = df.loc[labels == cluster, subset_columns]
        variance += np.var(cluster_df, axis=0).mean() * cluster_df.shape[0] / total_points

    variances.append(variance)

In [None]:
# rescale every metric to [0, 1] so that the three curves share one axis
variances = scaler.fit_transform(np.array(variances).reshape(-1, 1)).flatten()
sil_scores = scaler.fit_transform(np.array(sil_scores).reshape(-1, 1)).flatten()
db_scores = scaler.fit_transform(np.array(db_scores).reshape(-1, 1)).flatten()

# the variance curve also includes the single-cluster baseline, hence it starts at 1
curves = [
    (range(1, 10), variances, 'Weighted Average Variance'),
    (range(2, 10), sil_scores, 'Silhouette Score'),
    (range(2, 10), db_scores, 'Davies Bouldin Score'),
]
for cluster_counts, values, curve_label in curves:
    plt.plot(cluster_counts, values, label=curve_label)

plt.xlabel('Number of clusters')
plt.ylabel('(Normalized) Scores')
plt.legend()
plt.show()

We select 2 as the number of clusters.

In [368]:
n_clusters = 2

# must-link pairs: every two rows with the same supply-point code
must_link = [
    pair
    for code in df['Code'].unique()
    for pair in combinations(df[df['Code'] == code].index, 2)
]

np_df = df[subset_columns].to_numpy()

# final constrained clustering on the three selected variables
clusters, centers = cop_kmeans(np_df, n_clusters, ml=must_link)

In [369]:
# store the cluster label of every row next to its data
df['Cluster'] = clusters

In [None]:
# list the supply-point codes that fall in each cluster
for cluster in df['Cluster'].unique():
    in_cluster = df['Cluster'] == cluster
    print(f'Cluster {cluster}')
    print(df.loc[in_cluster, 'Code'].unique().tolist())

In [None]:
# plot the clusters in a 3D scatter plot with plotly
import plotly.graph_objects as go

# one trace per cluster, so that each cluster gets its own colour and legend entry
traces = []
for cluster in df['Cluster'].unique():
    members = df[df['Cluster'] == cluster]
    traces.append(
        go.Scatter3d(
            x=members['Temperature (°C)'],
            y=members['Free Chlorine (mg/L)'],
            z=members['TTHMs'],
            mode='markers',
            name=f'Cluster {cluster}',
            marker=dict(size=5),
        )
    )

fig = go.Figure(data=traces)

axis_titles = dict(
    xaxis_title='Temperature (°C)',
    yaxis_title='Free Chlorine (mg/L)',
    zaxis_title='TTHMs',
)
fig.update_layout(scene=axis_titles, showlegend=True)

fig.show(width=800, height=600)


In [None]:
# show the boxplot of the TTHMs for each cluster; `points='all'` also draws the
# individual observations. Here `df` holds the min-max scaled values.
import plotly.express as px
fig = px.box(df, x='Cluster', y='TTHMs', points='all')
fig.show()


# PCA

Principal Component Analysis is a technique that allows us to reduce the dimensionality of the dataset by projecting the data onto a lower-dimensional space. It can also be used as a model selection technique to select the most important features.

We are going to use PCA in a supervised setting, by trying to predict the TTHMs from the other variables.

We are going to compare three different approaches:
- Linear Regression with all the variables
- Linear Regression with 2 input variables based on expert knowledge (temperature and free chlorine)
- Linear Regression with the principal components

We are going to use the principal components as features for the Linear Regression.

Comparison is done with cross-validation.

In [373]:
# import the necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [None]:
# All the variables
feature_columns = list(feature_mapping.values())

# features plus the TTHMs target (sum of the four THM columns)
df = grab_df[feature_columns + thms_columns].copy()
df['TTHMs'] = df[thms_columns].sum(axis=1)
# `.copy()` so that the scaling below writes to an independent frame, not to a slice
df = df[feature_columns + ['TTHMs']].copy()

# scale the data; the array is assigned directly so that no index alignment
# takes place (a default-indexed wrapper frame could silently introduce NaN)
scaler = MinMaxScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# perform Linear Regression with all the variables with cross-validation
lr = LinearRegression()
scores = cross_val_score(lr, df[feature_columns], df['TTHMs'], cv=5, scoring='neg_mean_squared_error')
# the scorer returns the negated MSE, hence the sign flip
print("Validation MSE: ", -scores.mean())



In [None]:
# perform Linear Regression with 2 input variables based on expert knowledge (temperature and free chlorine)
expert_columns = ['Temperature (°C)', 'Free Chlorine (mg/L)']

lr = LinearRegression()
scores = cross_val_score(
    lr,
    df[expert_columns],
    df['TTHMs'],
    cv=5,
    scoring='neg_mean_squared_error',
)
# the scorer returns the negated MSE, hence the sign flip
print("Validation MSE: ", -scores.mean())

In [None]:
# first plot the cumulative explained variance ratio
from sklearn.decomposition import PCA

feature_columns = list(feature_mapping.values())

# features plus the TTHMs target (sum of the four THM columns)
df = grab_df[feature_columns + thms_columns].copy()
df['TTHMs'] = df[thms_columns].sum(axis=1)
# `.copy()` so that the scaling below writes to an independent frame, not to a slice
df = df[feature_columns + ['TTHMs']].copy()

# scale the data; the array is assigned directly so that no index alignment
# takes place (a default-indexed wrapper frame could silently introduce NaN)
scaler = MinMaxScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# keep as many components as there are features to see the full variance profile
pca = PCA(n_components=len(feature_columns))
pca.fit(df[feature_columns])

plt.plot(range(1, len(feature_columns) + 1), np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

If we take 5 components, we explain more than 90% of the variance.

In [None]:
n_components = 5

feature_columns = list(feature_mapping.values())
pc_columns = [f'PC{i}' for i in range(1, n_components + 1)]

# features plus the TTHMs target (sum of the four THM columns)
df = grab_df[feature_columns + thms_columns].copy()
df['TTHMs'] = df[thms_columns].sum(axis=1)
# `.copy()` so that the scaling below writes to an independent frame, not to a slice
df = df[feature_columns + ['TTHMs']].copy()

# scale the data; the array is assigned directly so that no index alignment
# takes place (a default-indexed wrapper frame could silently introduce NaN)
scaler = MinMaxScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# project the scaled features onto the first principal components;
# `fit_transform` fits the PCA itself, so no separate `fit` call is needed
pca = PCA(n_components=n_components)
df_pca = pd.DataFrame(
    pca.fit_transform(df[feature_columns]),
    columns=pc_columns,
    index=df.index,  # keep the rows aligned with the target in the concat below
)
df = pd.concat([df_pca, df['TTHMs']], axis=1)

# perform Linear Regression with the principal components with cross-validation
# NOTE(review): the scaler and the PCA are fitted on all rows before the
# cross-validation; fitting them inside each fold (e.g. with a Pipeline) would
# keep the validation folds unseen.
lr = LinearRegression()
scores = cross_val_score(lr, df[pc_columns], df['TTHMs'], cv=5, scoring='neg_mean_squared_error')
# the scorer returns the negated MSE, hence the sign flip
print("Validation MSE: ", -scores.mean())