### Autoencoder 

This notebook contains the initial run of the autoencoder used to extract features from raw time series data, followed by clustering of those features using several techniques: agglomerative clustering, KMeans, and a Gaussian mixture model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that can occur
# when several libraries each load their own OpenMP copy — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from numpy.random import seed
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model

In [None]:
# Folder holding the windowed time-series CSV exports.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
windowed_dir = "/Users/neethug/Desktop/Neethu/Course/DATA599/Project/finaldata/Windowed Time Series"

un_pos = pd.read_csv(f"{windowed_dir}/un_cal.csv")
ECD_pos = pd.read_csv(f"{windowed_dir}/ECD_cal.csv")
# syn_pos = pd.read_csv(f"{windowed_dir}/syn_post.csv")  # third dataset, currently not loaded

In [None]:
# Tag every row of this frame with its class name.
un_pos = un_pos.assign(Label="Unsuccessful")

In [None]:
# Tag every row of this frame with its class name.
ECD_pos = ECD_pos.assign(Label="ECDcontacts")

In [None]:
# Stack the two labelled frames row-wise; the original row labels of each frame are kept.
labelled_frames = (un_pos, ECD_pos)
data = pd.concat(labelled_frames, axis=0)

In [None]:
# Display the concatenated frame.
data

In [None]:
# Replace the row labels with a fresh 0..n-1 range; the old labels are kept as an 'index' column.
data = data.reset_index(drop=False)

In [None]:
# Discard the 'index' column that reset_index() created from the old row labels.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell safe to
# re-run: a second execution no longer raises KeyError because the column is already gone.
data = data.drop(columns='index', errors='ignore')

In [None]:
### Autoencoders for dimensionality reduction

Dropping the TestID and Label columns from the dataset

In [None]:
# Feature matrix: every column except the first and the last one.
# NOTE(review): assumes the first column is the TestID and the last is 'Label' — confirm.
X = data.iloc[:, 1:data.shape[1] - 1]

In [None]:
# Target vector: the last column of the frame (the 'Label' column appended before concatenation).
Y = data.iloc[:, data.shape[1] - 1]

Printing the dimensions

In [None]:
# Report (rows, columns) of the feature matrix.
print(f"Data shape {X.shape}")

Splitting the data into training and test sets for the neural network (note: no scaling is applied in this cell)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the shuffled split repeatable.
# train_test_split is already imported in the notebook's first import cell, so it is not
# imported again here.
# NOTE(review): nothing is scaled before this split, yet the autoencoder defined further down
# ends in a sigmoid and is trained with binary cross-entropy — confirm the inputs lie in [0, 1].
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, shuffle=True
)

# Designing an autoencoder

An autoencoder is a type of artificial neural network used to learn efficient representations of data in an unsupervised manner. It consists of an encoder, which compresses the input into a lower-dimensional code, and a decoder, which reconstructs the input from that code.

In [None]:
# Number of input features = number of columns in the feature matrix.
_, n_col = X.shape

In [None]:
## Size of the encoded representation: the width of the last encoder layer (the bottleneck),
## i.e. how many features the encoder model produces per row.

encoding_dim = 50

In [None]:
# Widths of the hidden layers between the input and the bottleneck, widest first.
# The decoder mirrors them in reverse order.
hidden_units = [3000, 2750, 2500, 2250, 2000, 1750, 1500, 1250, 1000, 750, 500, 250]

input_dim = Input(shape=(n_col,))

# Encoder: funnel the input down through the hidden layers to the bottleneck.
layer_output = input_dim
for units in hidden_units:
    layer_output = Dense(units, activation='relu')(layer_output)
encoded13 = Dense(encoding_dim, activation='relu')(layer_output)

# Decoder: expand the bottleneck back out through the mirrored hidden layers.
layer_output = encoded13
for units in reversed(hidden_units):
    layer_output = Dense(units, activation='relu')(layer_output)
# Sigmoid output with one unit per input feature (reconstruction of the input).
decoded13 = Dense(n_col, activation='sigmoid')(layer_output)

In [None]:
# Full autoencoder: maps the raw input to its reconstruction (encoder followed by decoder).
autoencoder = Model(input_dim, decoded13)

In [None]:
# Configure training: Adadelta optimizer, binary cross-entropy reconstruction loss.
# NOTE(review): binary cross-entropy expects targets in [0, 1]; no scaling of X is visible
# before training — confirm the input data is already in that range.
autoencoder.compile(
    loss='binary_crossentropy',
    optimizer='adadelta',
)

In [None]:
# Print the Keras summary of the assembled model.
autoencoder.summary()

In [None]:
# Train the network to reproduce its own input; the held-out rows serve as validation data.
autoencoder.fit(
    X_train,
    X_train,
    validation_data=(X_test, X_test),
    epochs=10,
    batch_size=32,
    shuffle=False,
)

In [None]:
# Encoder-only model: shares the trained layers and stops at the bottleneck output.
encoder = Model(input_dim, encoded13)
# Placeholder input with the bottleneck's width (not used by the cells shown in this notebook).
encoded_input = Input(shape=(encoding_dim,))

In [None]:
# Encode every row of X (training and test rows alike) and name the columns feature_0, feature_1, ...
encoded_train = pd.DataFrame(encoder.predict(X)).add_prefix('feature_')


In [None]:
# Attach the class labels as a 'target' column (aligned on the row index).
encoded_train = encoded_train.assign(target=Y)

In [None]:
# Display the encoded features together with their labels.
encoded_train

In [None]:
# Total number of missing cells in the encoded frame (column-wise counts, then summed).
encoded_train.isna().sum(axis=0).sum()

In [None]:
# Per column: True when every value in that column equals zero.
encoded_train.eq(0).all(axis=0)

In [None]:
# Keep only the columns that contain at least one non-zero value, dropping encoded features
# that are zero for every row. The explicit copy makes the result an independent frame, so
# adding the 'cluster' column to it later does not trigger pandas' SettingWithCopyWarning.
nonzero_columns = (encoded_train != 0).any(axis=0)
data_without_zero_features = encoded_train.loc[:, nonzero_columns].copy()

In [None]:
# Display the frame after removing the all-zero feature columns.
data_without_zero_features

# Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

In [None]:
# Ward-linkage dendrogram of the encoded features.
# The feature columns are selected by their 'feature_' prefix instead of by position, so the
# cell keeps working after the 'cluster' column has been appended to the frame (a positional
# "all but the last column" slice would then pull in the string-valued 'target' column).
encoded_features = data_without_zero_features.filter(regex=r'^feature_')

plt.figure(figsize=(6, 6))
plt.title('Visualising the data')
Dendrogram = shc.dendrogram(shc.linkage(encoded_features, method='ward'))

In [None]:
# Two-cluster agglomerative clustering of the encoded features with Ward linkage.
# The `affinity` argument is omitted on purpose: it was deprecated and later removed from
# scikit-learn, and Euclidean distance (the only option Ward supports) is the default anyway.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
# Select the feature columns by name so re-running the cell after 'cluster' has been added
# does not feed the string-valued 'target' column into the clustering.
y = cluster.fit_predict(data_without_zero_features.filter(regex=r'^feature_'))

In [None]:
# Store the agglomerative cluster assignment next to the features and labels.
data_without_zero_features = data_without_zero_features.assign(cluster=y)

In [None]:
# Class composition of cluster 0: how many rows of each label ended up in it.
in_cluster_zero = data_without_zero_features['cluster'].eq(0)
data_without_zero_features.loc[in_cluster_zero, 'target'].value_counts()

In [None]:
import altair as alt
# Lift Altair's default row limit so the whole frame can be plotted.
alt.data_transformers.disable_max_rows()

# Scatter of the first two encoded features, coloured by the true label.
alt.Chart(data_without_zero_features).mark_circle(size=60).encode(
    alt.X('feature_0'),
    alt.Y('feature_1'),
    alt.Color('target'),
)

## Applying PCA on these extracted features

In [None]:
# Feature matrix for the next stage: only the encoded 'feature_*' columns.
# By this point the frame ends with ['target', 'cluster'], so the previous positional slice
# "all but the last column" kept the string-valued 'target' column, which makes the numeric
# scaling/decomposition steps that follow fail. Selecting by name avoids that.
X = data_without_zero_features.filter(regex=r'^feature_')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature to zero mean and unit variance.
# NOTE(review): X_std is not consumed by the decomposition cells that follow (they use X) —
# confirm which of the two is intended.
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)

In [None]:
# Display the standardised feature array.
X_std

In [None]:
from sklearn.decomposition import ECDA
from sklearn.preprocessing import scale


ECDa = ECDA().fit(X)
plt.plot(np.cumsum(ECDa.explained_variance_ratio_))
plt.xlim(0,20,1)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')

In [None]:
ECDa = ECDA(n_components=0.95)
ECD = ECDa.fit_transform(X)

In [None]:
# One column name per retained component, numbered from 1.
component_names = [f"ECD{number}" for number in range(1, ECD.shape[1] + 1)]

In [None]:
# Wrap the projected array in a frame with one named column per component.
newdata = pd.DataFrame(data=ECD, columns=component_names)

In [None]:
# Display the component scores.
newdata

In [None]:
# Attach the class labels as a 'Label' column (aligned on the row index).
newdata = newdata.assign(Label=Y)

In [None]:
# Scatter of the first two components, coloured by the true label.
alt.Chart(newdata).mark_circle(size=60).encode(
    alt.X('ECD1'),
    alt.Y('ECD2'),
    alt.Color('Label'),
)

In [None]:
# Two-cluster agglomerative clustering of the component scores with Ward linkage.
# The `affinity` argument is omitted on purpose: it was deprecated and later removed from
# scikit-learn, and Euclidean distance (the only option Ward supports) is the default anyway.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
# Select the component columns by name rather than "all but the last column", so the input
# stays the same no matter how many label/cluster columns have been appended to the frame.
y = cluster.fit_predict(newdata[component_names])

In [None]:
# Store the agglomerative cluster assignment next to the components and labels.
newdata = newdata.assign(cluster=y)

In [None]:
# Class composition of agglomerative cluster 0.
in_cluster_zero = newdata['cluster'].eq(0)
newdata.loc[in_cluster_zero, 'Label'].value_counts()

In [None]:
from sklearn.cluster import KMeans

# Two-cluster k-means on the component scores.
# - `algorithm='auto'` is dropped: that value was deprecated and later removed from
#   scikit-learn, so passing it raises on current versions; the library default is used.
# - `n_init=10` pins the number of restarts, whose default changed between releases.
# - `random_state` makes the otherwise random initialisation repeatable (same seed as the
#   train/test split).
kmeans_test = KMeans(n_clusters=2, init="k-means++", n_init=10, max_iter=500, random_state=42)

# Select the component columns by name instead of by counting trailing columns, so the cell
# does not depend on how many label/cluster columns the frame currently has.
component_values = newdata[component_names]
fitted = kmeans_test.fit(component_values)
prediction = kmeans_test.predict(component_values)

In [None]:
# Store the k-means cluster assignment.
newdata = newdata.assign(kcluster=prediction)

In [None]:
# Class composition of k-means cluster 0.
in_kcluster_zero = newdata['kcluster'].eq(0)
newdata.loc[in_kcluster_zero, 'Label'].value_counts()

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# Two-component Gaussian mixture fitted on the component scores.
# The columns are selected by name instead of "all but the last three", so the input does not
# depend on how many label/cluster columns have been appended; `random_state` makes the
# random initialisation repeatable (same seed as the train/test split).
gmm = GaussianMixture(n_components=2, random_state=42).fit(newdata[component_names])

In [None]:
# Most likely mixture component for every row. The component columns are selected by name
# instead of "all but the last three", so re-running this cell after 'gcluster' has been
# appended still passes exactly the columns the model was fitted on.
labels = gmm.predict(newdata[component_names])

In [None]:
# Store the Gaussian-mixture component assignment.
newdata = newdata.assign(gcluster=labels)

In [None]:
# Class composition of mixture component 1.
in_gcluster_one = newdata['gcluster'].eq(1)
newdata.loc[in_gcluster_one, 'Label'].value_counts()

In [None]:
# Folder holding the full (non-windowed) time-series CSV exports.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
timeseries_dir = '/Users/neethug/Desktop/Neethu/Course/DATA599/Project/finaldata/TimeSeries'

ECD_ts = pd.read_csv(f'{timeseries_dir}/ECD_TS.csv')
un_ts = pd.read_csv(f'{timeseries_dir}/US_TS.csv')

In [None]:
# Tag every row of this frame with its class name.
# NOTE(review): the windowed data earlier in the notebook uses "ECDcontacts" (with an 's') —
# confirm whether the two spellings are meant to differ.
ECD_ts = ECD_ts.assign(Label="ECDcontact")

In [None]:
# Tag every row of this frame with its class name.
un_ts = un_ts.assign(Label='Unsuccessful')

In [None]:
# Stack the two labelled time-series frames row-wise; each frame's own row labels are kept.
labelled_series = (ECD_ts, un_ts)
data1 = pd.concat(labelled_series, axis=0)

In [None]:
# Keep the first 1005 columns of the combined frame.
# NOTE(review): 1005 is a magic number — presumably the id column plus the time steps of
# interest; confirm and consider naming it.
# The explicit copy makes `caldata` an independent frame, so adding the 'Label' column to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
caldata = data1.iloc[:, :1005].copy()

In [None]:
# Re-attach the class labels, which fall outside the column slice taken above.
caldata = caldata.assign(Label=data1['Label'])

In [None]:
# Drop every row that has a missing value, then renumber the remaining rows from 0.
caldata = caldata.dropna().reset_index(drop=True)

In [None]:
# Display the cleaned time-series frame.
caldata

In [None]:
# Features: every column except the first and the last; target: the last column ('Label').
# NOTE(review): assumes the first column is an identifier — confirm.
last_col = caldata.shape[1] - 1
X = caldata.iloc[:, 1:last_col]
Y = caldata.iloc[:, last_col]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each time-series column to zero mean and unit variance.
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)

In [None]:
# Rescale every column of X independently to the [0, 1] range.
a = minmax_scale(X, feature_range=(0, 1), axis=0)

In [None]:
# Wrap the scaled array in a DataFrame (default integer column names).
a = pd.DataFrame(data=a)

In [None]:
# Display the min-max scaled frame.
a