## Description

This notebook tries to separate two groups of readings (ecd contacts and unsuccessful readings) by clustering the frequency-domain representation of each reading's calibration window. The first attempt uses agglomerative clustering with 3 clusters, Ward linkage and Euclidean distance. The attempt was then repeated, this time with a 2-component Gaussian mixture model, on features extracted from the frequency domain with TSFresh. ecd contacts were not separated from unsuccessful readings in either case. Plotting the first two components of a PCA further demonstrated the lack of separation between these two groups. We also tried this on the post ad sample windows, with similar results.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.fft import rfft, rfftfreq
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

In [None]:
# These files can be generated by following the preprocessing steps outlined in the glossary.
# Each CSV is read into a frame with a 'TestId' column plus one column per sample of the window.
# (The StandardScaler import that used to sit in this cell now lives in the import cell.)
WINDOW_DIR = 'Data/Windowed Time Series'
un_ts = pd.read_csv(f'{WINDOW_DIR}/un_cal.csv')
ecd_ts = pd.read_csv(f'{WINDOW_DIR}/ecd_cal.csv')

In [None]:
# Scale every waveform by its own maximum so it fills the int16 range before the FFT.
# Vectorised: the per-row comprehension rebuilt the TestId-free frame twice for each row.
# NOTE(review): this divides by the row max, not the max absolute value, so a row with a
# non-positive max or a large negative excursion would overflow int16 — confirm the data range.
un_samples = un_ts.drop(columns='TestId')
un_normalized = (
    un_samples.div(un_samples.max(axis=1), axis=0)
    .mul(32767)
    .to_numpy()
    .astype(np.int16)
)

In [None]:
# Frequency-domain view of the unsuccessful calibration windows.

# Samples per window, read from the data so the frequency axis always matches the FFT width
# (previously hard-coded to 50).
N = un_normalized.shape[1]

# Real FFT along each row (one spectrum per reading).
un_yf = rfft(un_normalized)
# A sample spacing of 1/5 presumably corresponds to a 5 Hz sampling rate — TODO confirm.
xf = rfftfreq(N, 1 / 5)

# Overlay at most 7000 magnitude spectra. Slicing (rather than indexing rows 0..6999)
# avoids an IndexError when fewer readings are available.
fig, ax = plt.subplots()
ax.plot(xf, np.abs(un_yf[:7000]).T)
ax.set(title='Unsuccessful readings: calibration-window spectra',
       xlabel='Frequency', ylabel='Magnitude')
plt.show()

In [None]:
# Scale every waveform by its own maximum so it fills the int16 range before the FFT.
# Vectorised: the per-row comprehension rebuilt the TestId-free frame twice for each row.
# NOTE(review): this divides by the row max, not the max absolute value, so a row with a
# non-positive max or a large negative excursion would overflow int16 — confirm the data range.
ecd_samples = ecd_ts.drop(columns='TestId')
ecd_normalized = (
    ecd_samples.div(ecd_samples.max(axis=1), axis=0)
    .mul(32767)
    .to_numpy()
    .astype(np.int16)
)

In [None]:
# Frequency-domain view of the ecd calibration windows.
# The frequency axis `xf` computed for the unsuccessful readings is reused here, so both
# groups must share the same window length.
ecd_yf = rfft(ecd_normalized)
assert ecd_yf.shape[1] == len(xf), "ecd windows do not match the length of the frequency axis xf"

# Overlay at most 200 magnitude spectra. Slicing (rather than indexing rows 0..199)
# avoids an IndexError when fewer readings are available.
fig, ax = plt.subplots()
ax.plot(xf, np.abs(ecd_yf[:200]).T)
ax.set(title='ecd contacts: calibration-window spectra',
       xlabel='Frequency', ylabel='Magnitude')
plt.show()

In [None]:
# Replace each complex spectrum with its magnitude; later cells work on magnitudes only.
un_yf, ecd_yf = np.abs(un_yf), np.abs(ecd_yf)

In [None]:
# One ground-truth label per reading, in the order the spectra are stacked below:
# every unsuccessful reading first, then every ecd contact.
# Built directly so the Series has a clean 0..n-1 index (Series.repeat left every entry
# with index 0) and no throwaway globals such as `x` / `y` are left behind.
labs = pd.Series(['un'] * len(un_ts) + ['ecd'] * len(ecd_ts))

In [None]:
# Stack both groups' magnitude spectra (unsuccessful rows first) with one column per
# frequency bin and a fresh 0..n-1 row index.
un_spectra = pd.DataFrame(un_yf, columns=xf)
ecd_spectra = pd.DataFrame(ecd_yf, columns=xf)
yf = pd.concat([un_spectra, ecd_spectra], ignore_index=True)

In [None]:
# Standardise every frequency bin, then keep as many principal components as are needed
# to explain 95% of the variance.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(yf)
# The estimator must be named `pca`: it was previously bound to `ecda`, so the
# fit below (and the later `pca.n_components_` lookup) raised NameError on a fresh kernel.
pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(scaled_features)

In [None]:
# Wrap the PCA scores in a frame with columns 'Component 1', 'Component 2', ...
component_names = [f'Component {k}' for k in range(1, pca.n_components_ + 1)]
pcadf = pd.DataFrame(data=principalComponents, columns=component_names)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
pcadf.head()

In [None]:
# First two principal components, coloured by ground-truth label.
cmap = {'ecd':"red", 'un':"blue"}

fig, ax = plt.subplots()
# Draw 'un' first and 'ecd' second so ecd points stay on top, matching the row order
# (unsuccessful rows first) that a single scatter call would draw in.
for group in ('un', 'ecd'):
    in_group = (labs == group).to_numpy()
    ax.scatter(pcadf.loc[in_group, 'Component 1'],
               pcadf.loc[in_group, 'Component 2'],
               c=cmap[group], label=group)
ax.set(title='PCA of calibration-window spectra',
       xlabel='Component 1', ylabel='Component 2')
ax.legend()
plt.show()

In [None]:
# Build a Ward-linkage hierarchy directly on the spectra in `yf`.
linked = linkage(yf, method='ward')

# Each leaf is labelled with its row position in `yf`.
labelList = range(len(yf))

dendrogram_options = dict(
    orientation='top',
    labels=labelList,
    distance_sort='descending',
    show_leaf_counts=True,
)

plt.figure(figsize=(10, 7))
dendrogram(linked, **dendrogram_options)
plt.show()

In [None]:
# Agglomerative clustering of the PCA scores into 3 clusters.
# Ward linkage only supports Euclidean distance, which is also the default, so the distance
# argument is left out: `affinity=` was renamed to `metric=` in scikit-learn 1.2 and removed
# in 1.4, and omitting it behaves identically on every version.
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
res = cluster.fit_predict(pcadf)

In [None]:
def results(labels, clusters, ecd, un):
    """Print, for every cluster, how many ecd and unsuccessful readings it contains.

    Parameters
    ----------
    labels : array-like of str
        Ground-truth label per reading ('ecd' or 'un'), positionally aligned with
        ``clusters``.
    clusters : array-like
        Cluster id assigned to each reading.
    ecd : int
        Total number of ecd contacts, used as the denominator for percentages.
    un : int
        Total number of unsuccessful readings, used as the denominator for percentages.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # Work on plain arrays so the label/cluster pairing is purely positional,
    # whatever index the inputs carry.
    label_arr = np.asarray(labels)
    cluster_arr = np.asarray(clusters)

    # Iterate over the ids that actually occur (sorted) instead of assuming 0..k-1,
    # so 1-based or non-contiguous cluster ids are reported correctly.
    cluster_ids = np.unique(cluster_arr)

    def _pct(count, total):
        # A zero total has no meaningful percentage; report nan rather than dividing by zero.
        return round(count / total * 100, 2) if total else float('nan')

    print("There are", str(len(cluster_ids)), "clusters:", "\n")

    unl = cluster_arr[label_arr == 'un']
    ecdl = cluster_arr[label_arr == 'ecd']
    for position, cluster_id in enumerate(cluster_ids, start=1):
        ecd_count = int(np.count_nonzero(ecdl == cluster_id))
        un_count = int(np.count_nonzero(unl == cluster_id))
        print("Cluster",  str(position), ":")
        print("-----------------------------------")
        print(str(ecd_count), "of", str(ecd), "ecd contacts", '\t', _pct(ecd_count, ecd), '%')
        print(str(un_count), "of", str(un), "unsuccessful", '\t', _pct(un_count, un), '%')
        print("")

In [None]:
# Per-cluster breakdown of the agglomerative clustering against the ground-truth labels.
results(labels=labs, clusters=res, ecd=len(ecd_ts), un=len(un_ts))

# What about feature extraction then clustering?

In [None]:
# Display the TestId column of both groups stacked in un-then-ecd order with a fresh
# 0..n-1 index (the same expression is assigned to yf['TestId'] in the next cell).
pd.concat([un_ts['TestId'], ecd_ts['TestId']]).reset_index(drop = True)

In [None]:
# Attach each row's TestId to the spectra; ids are stacked un-then-ecd with a fresh
# 0..n-1 index so they line up with the rows of yf.
test_ids = pd.concat([un_ts['TestId'], ecd_ts['TestId']], ignore_index=True)
yf['TestId'] = test_ids

In [None]:
# Preview the first rows only instead of rendering the whole frame.
yf.head()

In [None]:
# Reshape to the long format tsfresh expects: one row per (TestId, frequency bin) with the
# magnitude in 'value'. The frequency bin is stored in the 'time' column and used for ordering.
melt_data = (
    yf.melt(id_vars='TestId', var_name='time')
    .assign(time=lambda frame: pd.to_numeric(frame['time']))
    .sort_values(by=['TestId', 'time'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
melt_data.head()

In [None]:
# Extract tsfresh features separately for every TestId, treating each magnitude spectrum
# as a series ordered by the 'time' (frequency bin) column.
extracted_features = extract_features(melt_data, column_id="TestId", column_sort="time", column_value = 'value')

# Remove every feature that is NaN for at least one series (it could not be calculated,
# e.g. because the series is too short). Reassigned rather than mutated in place so the
# cell is safe to re-run.
extracted_features = extracted_features.dropna(axis='columns')

# Replace any remaining non-finite values (e.g. +/-inf) so the scaler and PCA below accept
# the matrix. `impute` comes from tsfresh.utilities.dataframe_functions; it was never
# imported before, so this line raised NameError. No feature *selection* happens here.
extracted_features = impute(extracted_features)

extracted_features.head()

In [None]:
# Standardise the feature matrix, keeping the TestIds as the index and the tsfresh
# feature names as column labels.
scaler = StandardScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(extracted_features),
    columns=extracted_features.columns,
    index=extracted_features.index,
)

print(f"The total number of features created is: {len(extracted_features.columns)}")

In [None]:
# Reduce the standardised tsfresh features, keeping enough components to explain the
# requested share of the variance.
explained_variance_target = 0.95
pca = PCA(n_components=explained_variance_target)
principalComponents = pca.fit_transform(scaled_features)

In [None]:
# Wrap the PCA scores in a frame with columns 'Component 1', 'Component 2', ...
component_names = [f'Component {k}' for k in range(1, pca.n_components_ + 1)]
pcadf = pd.DataFrame(data=principalComponents, columns=component_names)

In [None]:
# Cluster the PCA-reduced tsfresh features with a 2-component Gaussian mixture.
# A fixed seed makes the randomly initialised fit reproducible across runs.
gmm = GaussianMixture(n_components = 2, random_state = 0).fit(pcadf)
clusters = gmm.predict(pcadf)

# tsfresh indexes its output by TestId, so the feature rows are not guaranteed to be in the
# un-then-ecd order of `labs`. Look each label up by TestId instead of pairing by position.
# NOTE(review): assumes TestIds are unique across both files; if an id repeats, the first
# label seen for it is used — confirm against the data.
label_by_id = pd.Series(labs.to_numpy(), index=yf['TestId'].to_numpy())
label_by_id = label_by_id[~label_by_id.index.duplicated()]
aligned_labs = label_by_id.reindex(extracted_features.index)

results(aligned_labs, clusters, len(ecd_ts), len(un_ts))