# Unsupervised Learning
## K-Means Clustering

#### 1. Importing the Libraries

In [None]:
# Importing the libraries

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

#### 2. Loading the Data

This dataset contains the physical characteristics of rocks, used to facilitate the prediction of lithology from well-logging measurements. It was used for a machine learning competition hosted by FORCE 2020 and XEEK.

In [None]:
# Columns that are not needed for this exercise
columns_to_drop = [
    'Unnamed: 0', 'wellName', 'MD', 'CALI',
    'RACEHM_l10', 'RPCEHM_l10', 'RACELM_l10', 'RPCELM_l10', 'VSH',
]

# Read the raw table from disk
data = pd.read_csv('UL_Features.csv')

# Remove the unneeded columns, then discard every row that still holds a null value
df = data.drop(columns = columns_to_drop).dropna()

In [None]:
df # Let's see what our dataset looks like now

In [None]:
df.describe() # Summary statistics (count, mean, std, min, quartiles, max) for every numeric column

#### 3. Transforming the Data

Standardise the data using the `StandardScaler` class from sklearn.

To account for variations in measurements and units, it is common practice to standardise the data before applying a machine learning model.

The formula used to do this is as follows.

$$z = \frac{x_i - \mu}{\sigma}$$

where,

$x_i$ is each individual value (e.g., $72.7456$ in the $1^{st}$ row of the $1^{st}$ column, which can be viewed with df)

$\mu$ is the mean of that column (e.g., $39.781249$ in the $2^{nd}$ row of the $1^{st}$ column, which can be viewed with df.describe())

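$\sigma$ is the standard deviation of that column (e.g., $18.378084$ in the $3^{rd}$ row of the $1^{st}$ column, which can be viewed with df.describe()). Note that df.describe() reports the sample standard deviation (ddof = 1), whereas StandardScaler divides by the population standard deviation (ddof = 0), so the two values differ very slightly.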

In [None]:
scaler = StandardScaler() # Creates the scaler object; nothing is standardised until fit_transform is called on it (next cell), which applies the formula given above

In [None]:
# Append a standardised copy of each feature column, named with a "_T" suffix

feature_cols = ['GR', 'RHOB', 'NPHI', 'PHIF', 'SW']
scaled_cols = [f'{col}_T' for col in feature_cols]

df[scaled_cols] = scaler.fit_transform(df[feature_cols])

In [None]:
df # Viewing the data, which now also holds the standardised "_T" columns

### 4. Clustering the Data
#### Identifying the Optimum Number of Clusters: The Elbow Method

There are many different ways to find the optimum number of clusters to divide the data points into. The method we are going to use is called the Elbow Method.

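It plots the inertia (the sum of squared distances of the samples to their closest cluster centre, a measure of how well the data is clustered by the K-Means algorithm) against the number of clusters. We are looking for the point where the decrease in inertia begins to slow down, i.e. the "elbow" of the curve.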

In [None]:
# Creating a function to find the optimum number of clusters

def optimise_k_means(data, max_k, random_state = None):
    """Fit K-Means for a range of cluster counts and show the elbow plot.

    Parameters
    ----------
    data : DataFrame or array-like
        Feature values, passed unchanged to ``KMeans.fit``.
    max_k : int
        Exclusive upper bound on the number of clusters: one model is fitted
        for each k in ``range(1, max_k)``, i.e. k = 1, ..., max_k - 1.
    random_state : int or None, optional
        Forwarded to every ``KMeans`` instance. Pass an integer to make the
        curve reproducible between runs; the default ``None`` leaves the
        fits unseeded.

    Returns
    -------
    None
        The function only displays the plot.
    """
    k_values = list(range(1, max_k))
    inertias = []

    for k in k_values:
        kmeans = KMeans(n_clusters = k, random_state = random_state)
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)

    # Generating Elbow plot
    # plt.subplots returns a (figure, axes) pair, so unpack both and draw on
    # the axes explicitly instead of relying on pyplot's "current axes" state.
    fig, ax = plt.subplots(figsize = (10, 5)) # Setting a suitable size of graph. Can be experimented with.
    ax.plot(k_values, inertias, 'o-') # This is the main line of code to plot the graph, rest are just decorations
    ax.set_xlabel('Number of Clusters') # Labelling the x axis
    ax.set_ylabel('Inertia') # Labelling the y axis
    ax.grid(True) # Introducing the grid
    plt.show()

In [None]:
# Note that we are using the data transformed by StandardScaler.
# With max_k = 10 the function tries k = 1 to 9, because its loop is range(1, max_k).
optimise_k_means(df[['RHOB_T', 'NPHI_T']], 10)

In [None]:
# optimise_k_means(df[['PHIF_T', 'GR_T']], 10) K = 4

#### 5. Applying K-Means Clustering

In [None]:
# Fit K-Means with k = 3 on the two standardised columns.
# K-Means starts from randomly chosen centroids, so random_state is fixed to
# make the cluster labels (and therefore the plot colours) identical on every run.
kmeans = KMeans(n_clusters = 3, random_state = 42) # Setting k=3
kmeans.fit(df[['RHOB_T', 'NPHI_T']]) # Fitting the data of two columns into the algorithm

# Creating a new column holding the cluster index assigned to each row
df['kmeans_3'] = kmeans.labels_

In [None]:
df # Viewing the data, now with the 'kmeans_3' cluster-label column

#### 6. Plotting the Results

In [None]:
# Scatter of the original (unscaled) NPHI and RHOB values, coloured by the k = 3 cluster label
fig, ax = plt.subplots()
ax.scatter(x = df['NPHI'], y = df['RHOB'], c = df['kmeans_3'])
ax.set_xlim(-0.1, 0.5)
ax.set_ylim(3, 2.0) # Limits are given high-to-low, so the y axis is drawn inverted
ax.set_xlabel('NPHI') # Labelling the x axis
ax.set_ylabel('RHOB') # Labelling the y axis
ax.set_title('K-Means Clusters (k = 3)')
plt.show()

#### 7. Creating Multiple Clusters

In [None]:
# Fit one K-Means model for every k from 1 to 5 and keep each labelling in its
# own 'KMeans_<k>' column so the results can be compared side by side.
# random_state is fixed so the labels do not change between runs.
for k in range(1, 6):
    kmeans = KMeans(n_clusters = k, random_state = 42)
    kmeans.fit(df[['NPHI_T', 'RHOB_T']])
    df[f'KMeans_{k}'] = kmeans.labels_

In [None]:
# One panel per cluster count, laid out in a single row
fig, axs = plt.subplots(nrows = 1, ncols = 5, figsize = (20, 5))

for position, panel in enumerate(axs):
    n_clusters = position + 1 # panels are numbered from 1, matching the 'KMeans_<k>' columns
    panel.scatter(x = df['NPHI'], y = df['RHOB'], c = df[f'KMeans_{n_clusters}'])
    panel.set(
        ylim = (3, 2),
        xlim = (-0.1, 0.5),
        title = f'N Cluster: {n_clusters}',
    )