# Bearing Clustering Project

## IMPORTING LIBRARIES AND DATAFRAME

In [None]:
#Data analysis libraries
import numpy as np 
import pandas as pd 

#Visualization and statistics libraries
import matplotlib.pyplot as plt
from matplotlib.pyplot import style
from scipy import fftpack
import seaborn as sns
# matplotlib 3.6 renamed the seaborn style sheets to 'seaborn-v0_8*' and 3.8
# removed the old names, so use the new name whenever this install offers it.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')

# Model related libraries
import statsmodels.api as sm
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.vq import kmeans, vq
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AffinityPropagation
import matplotlib.pyplot as plt
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# make_blobs is exported by sklearn.datasets itself.
from sklearn.datasets import make_blobs
from sklearn.cluster import Birch
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering


In [None]:
# Setting to display all columns
pd.set_option("Display.max_columns", None)

In [None]:
#To load the data 
target_set = pd.read_csv("bearing_classes.csv", sep=";", skipinitialspace=True)
origin_set = pd.read_csv("bearing_signals.csv", skipinitialspace=True)

In [None]:
origin_set.shape

In [None]:
target_set.shape

## EDA

In [None]:
origin_set.head()

In [None]:
target_set.head()

In [None]:
# Call the method: without the parentheses the cell only displays the bound
# method object instead of the number of distinct experiment ids.
origin_set['experiment_id'].nunique()

In [None]:
# Call the method: without the parentheses the cell only displays the bound
# method object instead of the number of distinct bearing ids.
origin_set['bearing_2_id'].nunique()

In [None]:
target_set['status'].value_counts()

In [None]:
origin_set.info()

### Functions (To Do)

In [None]:
# Helper that puts a prefix in front of every column label of a dataframe
def rename_column(df, prefix):
    """Return ``df`` with ``prefix`` prepended to each of its column names.

    The new labels are built as ``prefix + name`` for every existing column
    and applied along the column axis with ``DataFrame.set_axis``; the result
    of that call is returned.
    """
    prefixed_labels = []
    for label in df.columns:
        prefixed_labels.append(prefix + label)
    return df.set_axis(prefixed_labels, axis=1)

In [None]:
# To find the dominant frequency bin of each acceleration signal with a function
def by_axis_bearing(bearing_feature, i):
    """Return the dominant FFT bin of each frame's acceleration signal for one bearing.

    Parameters
    ----------
    bearing_feature : list of DataFrame
        Each frame must contain a ``bearing_2_id`` column and exactly one
        acceleration column, i.e. one whose name starts with ``a1_`` or
        ``a2_``. Any other columns are ignored.
    i : int
        Value of ``bearing_2_id`` whose rows are analysed.

    Returns
    -------
    list
        One entry per frame, in input order: the index (>= 1) of the FFT bin
        with the largest magnitude among the positive-frequency bins, or
        ``nan`` when the bearing has fewer than two samples in that frame.

    Raises
    ------
    ValueError
        If a frame does not contain exactly one acceleration column.
    """
    max_list = []
    for frame in bearing_feature:
        signal_columns = [name for name in frame.columns
                          if str(name).startswith(('a1_', 'a2_'))]
        if len(signal_columns) != 1:
            raise ValueError(
                "expected exactly one acceleration column (a1_*/a2_*), found: "
                + repr(signal_columns))

        # Transform only the signal, along its rows. fftpack.fft works on the
        # last axis, so passing the whole frame would transform each row across
        # its columns (ids, timestamp, rpm, ...) instead of the signal in time.
        # NOTE(review): assumes the rows of each bearing are already in time order.
        is_bearing = frame['bearing_2_id'] == i
        signal = frame.loc[is_bearing, signal_columns[0]].to_numpy(dtype=float)

        if signal.size < 2:
            # No non-constant frequency component can be estimated.
            max_list.append(np.nan)
            continue

        amplitudes = np.abs(fftpack.fft(signal))
        # Skip bin 0 (the signal's offset) and the mirrored upper half of the
        # spectrum, which repeats the lower half for a real-valued signal.
        half = signal.size // 2
        max_list.append(int(np.argmax(amplitudes[1:half + 1])) + 1)
    return max_list

## TO DO

In [None]:
# create new columns with the per-group min / mean / max of rpm, hz, w and timestamp

def extract_basic_feats(df):
    """Add per-group ``min``/``mean``/``max`` columns for the basic features.

    For each of ``rpm``, ``hz``, ``w`` and ``timestamp`` and each aggregation,
    a column named ``<aggregation>_<feature>`` (e.g. ``max_rpm``) is written to
    ``df``. Values are computed per ``(experiment_id, bearing_2_id)`` group and
    broadcast back to every row with ``transform``. The raw feature columns
    are left untouched; previously every aggregation was assigned to the
    feature's own column, so only the last one (``max``) survived.

    Parameters
    ----------
    df : DataFrame
        Frame that receives the new columns; it is modified in place. When it
        contains both grouping columns it is grouped itself, otherwise the
        notebook-level ``origin_set`` is grouped (as before) and the result is
        aligned to ``df`` by index.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the new columns added.
    """
    group_keys = ['experiment_id', 'bearing_2_id']
    list_of_feats = ['rpm', 'hz', 'w', 'timestamp']
    aggregations = ['min', 'mean', 'max']

    has_keys = all(key in df.columns for key in group_keys)
    source = df if has_keys else origin_set
    grouped = source.groupby(group_keys)

    for feat in list_of_feats:
        for aggs in aggregations:
            df[aggs + '_' + feat] = grouped[feat].transform(aggs)
    return df

# NOTE(review): in this notebook df_train is assigned in the
# "EDA AND FEATURE ENGINEERING" cell further down, so that cell has to be
# executed before this call.
extract_basic_feats(df_train)


## EDA AND FEATURE ENGINEERING

In [None]:
# To drop some columns from the dataframe
df_train = origin_set.drop(['experiment_id','bearing_1_id'], axis=1)
df_good = origin_set.drop(['experiment_id', 'bearing_2_id'], axis = 1)

In [None]:
# Per-bearing summary statistics: maximum, minimum, mean, standard deviation,
# median, range, kurtosis, skewness and variance. Each result gets the name of
# its statistic as a column prefix through rename_column.
per_bearing = df_train.groupby(['bearing_2_id'])

max_set = rename_column(per_bearing.max(), "max_")
min_set = rename_column(per_bearing.min(), "min_")
mean_set = rename_column(per_bearing.mean(), "mean_")
std_set = rename_column(per_bearing.std(), "std_")
median_set = rename_column(per_bearing.median(), "median_")
range_set = rename_column(per_bearing.max() - per_bearing.min(), "range_")
# Kurtosis is computed by applying DataFrame.kurtosis to every group.
kurtosis_set = rename_column(per_bearing.apply(pd.DataFrame.kurtosis), 'kurtosis_')
skew_set = rename_column(per_bearing.skew(), 'skew_')
var_set = rename_column(per_bearing.var(), 'var_')

In [None]:
# To make dataframe of individual features and make list of new dataframe and list of new column names
df_bearing_2_x = df_train.drop(['a1_x','a1_y','a1_z','a2_y','a2_z'], axis = 1)
df_bearing_2_y = df_train.drop(['a1_x','a1_y','a1_z','a2_x','a2_z'], axis = 1)
df_bearing_2_z = df_train.drop(['a1_x','a1_y','a1_z','a2_x','a2_y'], axis = 1)

bearing_feature = [df_bearing_2_x,df_bearing_2_y,df_bearing_2_z]
list_column = ['fft_a2_x','fft_a2_y','fft_a2_z']

In [None]:
# To collect the by_axis_bearing result of every bearing into one dataframe
number_bearing = df_train['bearing_2_id'].max()

# DataFrame.append was removed in pandas 2.0, so gather one row per bearing id
# (1 .. number_bearing) and build the frame with a single constructor call.
bearing_ids = list(range(1, number_bearing + 1))
new_set = pd.DataFrame(
    [by_axis_bearing(bearing_feature, i) for i in bearing_ids],
    columns=list_column,
    index=bearing_ids,
)

In [None]:
df = pd.concat([max_set, min_set, mean_set, std_set, median_set, range_set, kurtosis_set, skew_set, new_set, var_set], axis=1)
#df['target']= target_set.iloc[1:,1]

In [None]:
df.info()

In [None]:
df.columns

In [None]:
#To drop some features from the dataframe
df1 = df.drop(['max_timestamp','min_timestamp','mean_timestamp','std_timestamp','median_timestamp',
              'range_timestamp','kurtosis_bearing_2_id','kurtosis_timestamp','skew_timestamp','var_timestamp',
              'max_a1_x', 'max_a1_y', 'max_a1_z','min_a1_x', 'min_a1_y', 'min_a1_z','mean_a1_x',
       'mean_a1_y', 'mean_a1_z','std_a1_x','std_a1_y', 'std_a1_z','median_a1_x', 'median_a1_y',
       'median_a1_z','range_a1_x','range_a1_y', 'range_a1_z','kurtosis_a1_x', 'kurtosis_a1_y', 'kurtosis_a1_z',
               'skew_a1_x', 'skew_a1_y','skew_a1_z','var_a1_x', 'var_a1_y',
       'var_a1_z','min_rpm','max_rpm','max_hz','max_w','mean_rpm','median_hz','median_w','range_rpm','range_hz',
               'range_w','kurtosis_rpm','kurtosis_hz','kurtosis_w','skew_rpm','skew_hz','skew_w','var_rpm','var_hz',
               'min_hz', 'min_w','min_timestamp','var_w', 'std_rpm', 'std_hz', 'std_w', 'mean_hz', 'mean_w','median_rpm'
              ], axis=1)





In [None]:
df1.columns

In [None]:
X = df1.copy()

In [None]:
X1 = df1.copy()

In [None]:
X2 = df1.copy()

In [None]:
X3 = df1.copy()

In [None]:
X.head()

### Normalizing & Standardizing Data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df1_scaled = scaler.fit_transform(df1)

In [None]:
# Import the whiten function
from scipy.cluster.vq import whiten

# Use the whiten() function to standardize the data
df1_whiten = whiten(df1)
print(df1_whiten)

## PLOTS

In [None]:
import seaborn as sns
sns.set_theme(style="ticks")
sns.pairplot(df1)

In [None]:
# pairplot draws a grid of axes, so plt.title would only label the last
# subplot; put the heading on the grid's figure instead.
g = sns.pairplot(df1)
g.fig.suptitle('Pairplot for the Data', fontsize = 20, y = 1.02)
plt.show()

In [None]:
# NOTE(review): the 'cluster_labels' column is only added to df1 by the
# clustering cells further down; one of them has to be run before this cell.
g = sns.pairplot(df1, hue='cluster_labels',height=1.5)
# Title the pairplot's own figure (no `fig` exists at this point) and keep `g`
# bound to the grid rather than to the title text.
g.fig.suptitle("Co-relation between features", y=1.08)

## PCA

In [None]:
from sklearn.decomposition import PCA
pca_2 = PCA(n_components=2)
pca_2_result = pca_2.fit_transform(df1_scaled)
print('Explained variation per principal component: {}'.format(pca_2.explained_variance_ratio_))


print('Cumulative variance explained by 2 principal components: {:.2%}'.format(np.sum(pca_2.explained_variance_ratio_)))



## ELBOW SCORE

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib.style as style

range_n_clusters = [1, 2, 3, 4, 5, 6]
avg_distance=[]
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=42).fit(df1)
    avg_distance.append(clusterer.inertia_)

style.use("fivethirtyeight")
plt.plot(range_n_clusters, avg_distance)
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Distance")
plt.show()

### Scaled data Elbow plot

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib.style as style

range_n_clusters = [1, 2, 3, 4, 5, 6]
avg_distance=[]
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=42).fit(df1_scaled)
    avg_distance.append(clusterer.inertia_)

style.use("fivethirtyeight")
plt.plot(range_n_clusters, avg_distance)
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Distance")
plt.show()

## Dendrogram

In [None]:
# Import the dendrogram and linkage functions
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the linkage matrix in this cell: `distance_matrix` is only assigned by
# the hierarchical-clustering cells at the end of the notebook.
# NOTE(review): using the standardized feature matrix here is an assumption;
# swap in the columns of interest if a narrower view is wanted.
linkage_matrix = linkage(df1_scaled, method = 'ward', metric = 'euclidean')

# Create a dendrogram
dn = dendrogram(linkage_matrix)

# Display the dendrogram
plt.show()

## Inertia Plots

In [None]:

# How do we find the best number of clusters (k)?

# Run K-means for several values of k and record the inertia of every fit

no_of_clusters = range(2,20)  # candidate k values 2..19
inertia = []


# NOTE(review): df1_metrics is assigned in the "WITH FITTING" cells further
# down, so one of them has to be executed before this loop.
for f in no_of_clusters:
    # NOTE(review): rebinding `kmeans` hides the scipy.cluster.vq.kmeans
    # function imported at the top of the notebook until it is imported again.
    kmeans = KMeans(n_clusters=f, random_state=2)
    kmeans = kmeans.fit(df1_metrics)
    u = kmeans.inertia_
    inertia.append(u)
    print("The innertia for :", f, "Clusters is:", u)

In [None]:

# Creating the scree plot for Intertia - elbow method
fig, (ax1) = plt.subplots(1, figsize=(16,6))
xx = np.arange(len(no_of_clusters))
ax1.plot(xx, inertia)
ax1.set_xticks(xx)
ax1.set_xticklabels(no_of_clusters, rotation='vertical')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia Score')
plt.title("Inertia Plot per k")

## Silhouette Score 2 Features

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import matplotlib.style as style
import itertools

# Silhouette analysis for every pair of feature columns of df1.
# The values of the current pair are kept in `pair_values` rather than `X`:
# `X` is the notebook-level copy of df1 that a later K-means cell indexes by
# column name, and rebinding it to a 2-column array here broke that cell.
combi = list(itertools.combinations(df1.columns, 2))
for feat1, feat2 in combi:
    pair_values = df1[[feat1, feat2]].values
    range_n_clusters = [2, 3, 4, 5, 6]
    silhouette_avg_n_clusters = []

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1 to 1; the x axis is
        # limited to [-0.1, 1] here
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(pair_values) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 42 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = clusterer.fit_predict(pair_values)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(pair_values, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        silhouette_avg_n_clusters.append(silhouette_avg)
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(pair_values, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(pair_values[:, 0], pair_values[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel(feat1)
        ax2.set_ylabel(feat2)

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

    # Show the figures of all cluster counts for this feature pair together
    plt.show()


    # Average silhouette score against the number of clusters for this pair
    style.use("fivethirtyeight")
    plt.plot(range_n_clusters, silhouette_avg_n_clusters)
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("silhouette score")
    plt.show()

### Silhouette Score with 3 Feats

In [None]:
df1.info()

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import matplotlib.style as style
import itertools


def score_combination(df):
    """Score K-means clusterings for every 3-column combination of ``df``.

    For each combination of three columns and each cluster count in 2..4 a
    ``KMeans(random_state=42)`` model is fitted on those columns and its
    silhouette score is computed.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are combined. It is the frame that gets clustered;
        previously the argument was ignored and the notebook-level ``df1`` was
        read instead.

    Returns
    -------
    tuple
        ``(score_dict, score_dict_keys)``. ``score_dict`` maps a silhouette
        score to ``(feature1, feature2, number_of_clusters)``;
        ``score_dict_keys`` lists the scores from highest to lowest.
    """
    score_dict = dict()
    for feature1, feature2, feature3 in itertools.combinations(df.columns, 3):
        X = df[[feature1, feature2, feature3]]
        for number_of_clusters in range(2, 5):
            km = KMeans(random_state=42, n_clusters=number_of_clusters)
            km.fit(X)
            sil_score = silhouette_score(X, km.labels_)
            # NOTE(review): the entry keeps its original three-field layout, so
            # feature3 is not recorded, and two runs with exactly the same
            # score overwrite each other because the score is the key.
            score_dict[sil_score] = (feature1, feature2, number_of_clusters)
    score_dict_keys = sorted(score_dict, reverse=True)
    return score_dict, score_dict_keys

def select_plot_score(df1,min_cluster,min_by_cluster,score_dict,score_dict_keys):
    """Plot the scored feature pairs whose clusters are all large enough.

    Walks ``score_dict_keys`` in the given order. Entries of ``score_dict``
    with fewer than ``min_cluster`` clusters are skipped; for the others a
    ``KMeans(random_state=42)`` model is fitted on the entry's two feature
    columns of ``df1``, and a scatter plot coloured by cluster label is shown
    when the smallest cluster has more than ``min_by_cluster`` members.
    """
    for score in score_dict_keys:
        entry = score_dict[score]
        first_feature, second_feature, cluster_count = entry[0], entry[1], entry[2]
        if cluster_count < min_cluster:
            continue

        model = KMeans(random_state=42, n_clusters=cluster_count)
        model.fit(df1[[first_feature, second_feature]])

        _, members_per_cluster = np.unique(model.labels_, return_counts=True)
        if members_per_cluster.min() > min_by_cluster:
            sns.scatterplot(data=df1, x=first_feature, y=second_feature, hue=model.labels_)
            plt.show()

score_dict,score_dict_keys = score_combination(df1)

select_plot_score(df,3,30,score_dict,score_dict_keys)

In [None]:
sns.set(style = "darkgrid")

fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(111, projection='3d')

# One scatter series per cluster label: max (x), mean (y) and FFT (z) of a2_x
for cluster in df1.cluster_labels.unique():
    in_cluster = df1.cluster_labels == cluster
    ax.scatter(df1.max_a2_x[in_cluster], df1.mean_a2_x[in_cluster], df1.fft_a2_x[in_cluster], label=cluster)

ax.set_ylabel("MEAN")
ax.set_xlabel("MAX ACC (hz)")
ax.set_zlabel("FFT") 
# The y axis shows the mean, not the standard deviation, so the title says MEAN
plt.title('Clustering MAX ACC, MEAN, FFT on X-Axis', fontsize = 20)
ax.legend()

In [None]:
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(111, projection='3d')

for cluster in df1.cluster_labels.unique():
    ax.scatter(df1.max_a2_x[df1.cluster_labels==cluster],df1.std_a2_x[df1.cluster_labels==cluster],df1.fft_a2_x[df1.cluster_labels==cluster],label=cluster)

ax.set_xlabel("MAX (hz)")
ax.set_ylabel("STD")
ax.set_zlabel("FFT") 
plt.title('Clustering MAX ACC, STD, FFT on X-Axis', fontsize = 20)
ax.legend()



## CLUSTERING TECHNIQUES

## K-MEANS

### Mean and Skew X-axis

In [None]:
# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Generate cluster centers
cluster_centers, distortion = kmeans(df1[['mean_a2_x','skew_a2_x']],3)

# Assign cluster labels
df1['cluster_labels'], distortion_list = vq(df1[['mean_a2_x','skew_a2_x']], cluster_centers)

# Plot clusters
sns.scatterplot(x='mean_a2_x', y='skew_a2_x', 
                hue='cluster_labels', data = df1)
plt.show()

### KMEANS - VAR AND RANGE X-AXIS

In [None]:
# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Generate cluster centers
cluster_centers, distortion = kmeans(df1[['var_a2_x','range_a2_x']],3)

# Assign cluster labels
df1['cluster_labels'], distortion_list = vq(df1[['var_a2_x','range_a2_x']], cluster_centers)

# Plot clusters
sns.scatterplot(x='var_a2_x', y='range_a2_x', 
                hue='cluster_labels', data = df1)
plt.show()

### KMEANS Max Acc & Mean Acc X-axis

In [None]:
# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Generate cluster centers
cluster_centers, distortion = kmeans(df1[['mean_a2_x','max_a2_x']],3)

# Assign cluster labels
df1['cluster_labels'], distortion_list = vq(df1[['mean_a2_x','max_a2_x']], cluster_centers)

# Plot clusters
sns.scatterplot(x='mean_a2_x', y='max_a2_x', 
                hue='cluster_labels', data = df1)
plt.show()

### Kmeans with mean & std X-axis 3 clusters

In [None]:
# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Generate cluster centers
cluster_centers, distortion = kmeans(df1[['mean_a2_x','std_a2_x']],3)

# Assign cluster labels
df1['cluster_labels'], distortion_list = vq(df1[['mean_a2_x','std_a2_x']], cluster_centers)

# Plot clusters
sns.scatterplot(x='mean_a2_x', y='std_a2_x', 
                hue='cluster_labels', data = df1)
plt.show()

### K-Means with Mean, Skew, FFT

In [None]:
# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Generate cluster centers
cluster_centers, distortion = kmeans(df1[['mean_a2_x','skew_a2_x','fft_a2_x']],3)

# Assign cluster labels
df1['cluster_labels'], distortion_list = vq(df1[['mean_a2_x','skew_a2_x', 'fft_a2_x']], cluster_centers)

# Plot clusters
sns.scatterplot(x='mean_a2_x', y='skew_a2_x', 
                hue='cluster_labels', data = df1)
plt.show()

In [None]:
kmeans = KMeans(n_clusters=3)

y = kmeans.fit_predict(X[['mean_a2_x','skew_a2_x','fft_a2_x']])

X['Cluster'] = y

X.head()

In [None]:


# Search random_state (1..99) and n_clusters (1..5) for the K-means fit with
# the lowest inertia, then attach that fit's labels to X2.
inertia_list = []
run_params = []  # (random_state, n_clusters) of the inertia at the same position

# Cluster on the feature columns only, so a 'cluster' column left behind by an
# earlier run of this cell is never used as an input feature.
features = X2.drop(columns=['cluster'], errors='ignore')

for state in range(1, 100):
    for cluster in range(1, 6):
        # fit a kmeans object to the dataset
        kmeans = KMeans(n_clusters=cluster, init='k-means++', random_state=state).fit(features)
        inertia_list.append(kmeans.inertia_)
        run_params.append((state, cluster))

# Translate the position of the minimum back into the parameters of that run;
# the bare list position is not a random_state.
best_position = inertia_list.index(min(inertia_list))
best_state, best_n_clusters = run_params[best_position]

# Refit the best run once. Joining labels inside the loop raised on the second
# pass because the 'cluster' column already existed.
kmeans = KMeans(n_clusters=best_n_clusters, init='k-means++', random_state=best_state).fit(features)

# clusters is an attribute of the object
cluster_centers = kmeans.cluster_centers_

# add cluster index to dataframe; labels are positional, so they are given the
# frame's own index before joining
cluster_labels = pd.Series(kmeans.labels_, index=features.index, name='cluster')
X2 = features.join(cluster_labels.to_frame())

print("Min inertia:", min(inertia_list), "at random_state = ", best_state,
      "with n_clusters = ", best_n_clusters)
    

### WITH FITTING

In [None]:
from sklearn.cluster import KMeans

# make new dataframe with relevant metrics
df1_metrics = df1[['mean_a2_x', 'skew_a2_x', 'median_a2_x']]

# z-score normalisation
df1_metrics_normalized=(df1_metrics-df1_metrics.mean())/df1_metrics.std()
df1_metrics_normalized = df1_metrics_normalized.rename(columns={'mean_a2_x': 'mean_zscore',
                                                                        'skew_a2_x':'skew_zscore',
                                                               'median_a2_x' : 'median_zscore'})

# fit a kmeans object to the dataset
kmeans = KMeans(n_clusters=2, init='k-means++').fit(df1_metrics_normalized)

# clusters is an attribute of the object
cluster_centers = kmeans.cluster_centers_

# add cluster index to dataframe
# kmeans.labels_ is positional, so the Series is given the frame's own index;
# with the default 0..n-1 index the join pairs labels with the wrong rows
# whenever the frame is not indexed 0..n-1.
cluster_labels = pd.Series(kmeans.labels_, index=df1_metrics_normalized.index, name='cluster')
df1_metrics_normalized = df1_metrics_normalized.join(cluster_labels.to_frame())

In [None]:
sns.lmplot(x='mean_zscore', y='skew_zscore',  height=10, data=df1_metrics_normalized,
           fit_reg=False, hue='cluster')

In [None]:

# make new dataframe with relevant metrics
df1_metrics = df1[['range_a2_y', 'var_a2_y']]

# z-score normalisation
df1_metrics_normalized=(df1_metrics-df1_metrics.mean())/df1_metrics.std()
df1_metrics_normalized = df1_metrics_normalized.rename(columns={'range_a2_y': 'range_zscore',
                                                                        'var_a2_y':'var_zscore'})

# fit a kmeans object to the dataset
kmeans = KMeans(n_clusters=2, init='k-means++').fit(df1_metrics_normalized)

# clusters is an attribute of the object
cluster_centers = kmeans.cluster_centers_

# add cluster index to dataframe
# kmeans.labels_ is positional, so the Series is given the frame's own index;
# with the default 0..n-1 index the join pairs labels with the wrong rows
# whenever the frame is not indexed 0..n-1.
cluster_labels = pd.Series(kmeans.labels_, index=df1_metrics_normalized.index, name='cluster')
df1_metrics_normalized = df1_metrics_normalized.join(cluster_labels.to_frame())

In [None]:
sns.lmplot(x='range_zscore', y='var_zscore',  height=10, data=df1_metrics_normalized,
           fit_reg=False, hue='cluster')

### AFFINITY PROPAGATION

In [None]:
# affinity propagation clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
# define dataset
X2, _ = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=4)
# define the model
model = AffinityPropagation(damping=0.9)
# fit the model
model.fit(X2)
# assign a cluster to each example
yhat = model.predict(X2)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    # create scatter of these samples
    pyplot.scatter(X2[row_ix, 0], X2[row_ix, 1])
# show the plot
pyplot.show()

### Agglomerative Clustering

In [None]:
# agglomerative clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot
# define dataset
X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=4)
# define the model
model = AgglomerativeClustering(n_clusters=2)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    # create scatter of these samples
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()

In [None]:

# Trying with dimensionality reduction and then K-means

n_components = X.shape[1]

# Running PCA with all components
pca = PCA(n_components=n_components, random_state = 453)
X_r = pca.fit(X).transform(X)


# Calculating the 95% Variance
total_variance = sum(pca.explained_variance_)
print("Total Variance in our dataset is: ", total_variance)
var_95 = total_variance * 0.95
print("The 95% variance we want to have is: ", var_95)
print("")

# Creating a df with the components and explained variance
a = list(zip(range(0,n_components), pca.explained_variance_))
a = pd.DataFrame(a, columns=["PCA Comp", "Explained Variance"])

# Smallest number of leading components whose summed variance reaches 95%.
# It is derived from the fitted model instead of being hard-coded, so the cell
# works for any number of input columns (fixed indices such as 53 fail with an
# IndexError when X has fewer columns).
cumulative_variance = np.cumsum(pca.explained_variance_)
n_95 = min(int(np.searchsorted(cumulative_variance, var_95)) + 1, n_components)
print("Variance explained with", n_95, "n_components: ", cumulative_variance[n_95 - 1])

# Plotting the Data (x axis = 1-based component number)
component_numbers = np.arange(1, n_components + 1)
plt.figure(1, figsize=(34, 18))
plt.plot(component_numbers, pca.explained_variance_ratio_, linewidth=2, c="r")
plt.xlabel('n_components')
plt.ylabel('explained_ratio_')

# Plotting line with 95% e.v.
plt.axvline(n_95,linestyle=':', label='n_components - 95% explained', c ="blue")
plt.legend(prop=dict(size=12))

# adding arrow
plt.annotate('{} eigenvectors used to explain 95% variance'.format(n_95),
             xy=(n_95, pca.explained_variance_ratio_[n_95 - 1]),
             xytext=(n_95 + 0.05 * n_components, pca.explained_variance_ratio_.max()),
            arrowprops=dict(facecolor='blue', shrink=0.05))

plt.show()

## BIRCH

In [None]:
import matplotlib.pyplot as plt
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# make_blobs is exported by sklearn.datasets itself.
from sklearn.datasets import make_blobs
from sklearn.cluster import Birch

In [None]:
df1, clusters = make_blobs(n_samples = 1000, centers = 12, cluster_std = 0.50, random_state = 0)
df1.shape

In [None]:
model = Birch(branching_factor = 50, n_clusters = 3, threshold = 1.5)


In [None]:
model.fit(df1)

In [None]:
pred = model.predict(df1)

In [None]:
plt.scatter(df1[:, 0], df1[:, 1], c = pred)

## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.4, min_samples=20)
db.fit(df1)

In [None]:
y_pred = db.fit_predict(df1)
plt.figure(figsize=(10,6))
plt.scatter(df1['mean_a2_x'], df1['skew_a2_x'],c=y_pred, cmap='Paired')
plt.title("Clusters determined by DBSCAN")

## HIERARCHICAL CLUSTERING

In [None]:
# Import the fcluster and linkage functions
from scipy.cluster.hierarchy import fcluster, linkage

# Use the linkage() function
distance_matrix = linkage(df1[['mean_a2_x', 'max_a2_x']], method = 'ward', metric = 'euclidean')

# Assign cluster labels
df1['cluster_labels'] = fcluster(distance_matrix, 3, criterion='maxclust')

# Plot clusters
sns.scatterplot(x='mean_a2_x', y='max_a2_x', 
                hue='cluster_labels', data = df1)
plt.show()

In [None]:
df1.columns

In [None]:
# Import the fcluster and linkage functions
from scipy.cluster.hierarchy import fcluster, linkage

# Use the linkage() function
distance_matrix = linkage(df1[['max_a2_x', 'mean_a2_y']], method = 'ward', metric = 'euclidean')

# Assign cluster labels
df1['cluster_labels'] = fcluster(distance_matrix, 2, criterion='maxclust')

# Plot clusters on the same two columns that were clustered (the y axis
# previously showed mean_a2_x, which was not part of this linkage)
sns.scatterplot(x='max_a2_x', y='mean_a2_y', 
                hue='cluster_labels', data = df1)
plt.show()