In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from ripser import ripser
from persim import plot_diagrams
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn import mixture
from gtda.time_series import TakensEmbedding
from PyEMD import EMD
from statsmodels.tsa.stattools import adfuller
from pylab import mpl
from sklearn import preprocessing
from scipy.io import arff
import scipy
import sklearn
import torch
from tqdm import tqdm
from gtda.time_series import TakensEmbedding
from gtda.diagrams import BettiCurve

%matplotlib qt

In [2]:
filepath = "../dataset/data_akbilgic.csv"
data = pd.read_csv(filepath)
data = data.iloc[:, 1:]
data = data.to_numpy()
data = data.transpose()

X = np.zeros(10)
Y = np.zeros(1)

for i in range(data.shape[0]):
    line = data[i, :]
    for j in range(line.shape[0]-10):
        if np.any(line[j:j+11] == 0):
            continue
        else:
            X = np.append(X, line[j:j+10])
            Y = np.append(Y, line[j+10])

X = X.reshape(-1, 10)
Y = Y.reshape(-1, 1)

X = np.delete(X, 0, axis=0)
Y = np.delete(Y, 0, axis=0)

f = np.frompyfunc(lambda x:1 if x>0 else 0, 1, 1)

Y = f(Y)
Y = Y.reshape(-1)

X = sklearn.preprocessing.scale(X, axis=1)

X = X.astype(float)
Y = Y.astype(float)

print(X, X.shape)
print(Y, Y.shape)

[[ 1.31414157  0.98293484 -0.75803213 ... -1.1526735   0.18877102
   0.87426702]
 [ 1.28966392 -0.62876061 -1.80715955 ...  0.41455061  1.16991954
  -0.41075008]
 [-0.54314776 -1.84812     0.97218031 ...  1.44873086 -0.30172045
   0.04984904]
 ...
 [-0.14901389 -0.16925385 -0.31443043 ...  0.07184478  0.63446649
   0.38265176]
 [-0.28183117 -0.41974078 -1.5856026  ...  0.48165807  0.24244821
   0.94789071]
 [-0.45027235 -1.62127262 -1.68265706 ...  0.21483519  0.92338687
  -0.02493894]] (3838, 10)
[0. 0. 1. ... 1. 0. 0.] (3838,)


In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
print(X_train)
print(Y_train)

[[ 0.48611785 -1.34224483 -1.60439486 ... -0.87451141  0.82018893
  -0.67706028]
 [ 0.2858818  -1.49271506 -1.07783688 ... -0.22550762  1.31583918
   1.40890194]
 [-1.56019078  1.6810084  -0.11390562 ... -0.58669957  0.38397629
   0.10598511]
 ...
 [ 1.45388763 -0.32291228 -2.35062325 ...  0.70221519 -0.20908321
   0.70952613]
 [-0.21742435  1.65280399 -0.05275826 ... -0.84851823 -1.89938468
   0.38584169]
 [ 2.46643456  1.16124943 -0.36838254 ...  0.14220902 -0.75339036
  -0.62329881]]
[1. 0. 0. ... 0. 0. 0.]


In [4]:
# DL part

class VanillaRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layer=2, batch_first=True):
        super(VanillaRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layer = num_layer
        self.batch_first = batch_first

        self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, \
            num_layers=num_layer, batch_first=batch_first)
        self.mlp = torch.nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        # x: [batch_size, seq_len, input_size=1] need check
        x = x.unsqueeze(2)    
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.mlp(x)
        x = x.view(-1, 1)
        x = self.sigmoid(x)
        return x

In [None]:
input_size = 1
seq_len = 10
batch_size = 50
hidden_size = 32
epochs = 180
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader = torch.utils.data.DataLoader(
    dataset = torch.utils.data.TensorDataset(torch.Tensor(X_train).to(device), torch.Tensor(Y_train).to(device)),
    batch_size = batch_size,
    shuffle = True
)

model = VanillaRNN(input_size=input_size, hidden_size=hidden_size)
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model.to(device)

model.train()
for i in range(epochs):
    for seq, labels in train_loader:
        optimizer.zero_grad()
        y_pred = model(seq).squeeze()
        single_loss = loss_function(y_pred, labels)
        single_loss.backward()
        optimizer.step()
    print("Train Step:", i, " loss: ", single_loss)


In [11]:
model.eval()
test_loader = torch.utils.data.DataLoader(
    dataset = torch.utils.data.TensorDataset(torch.Tensor(X_test).to(device), torch.Tensor(Y_test).to(device)),
    batch_size = batch_size,
    shuffle = False
)

pred = np.zeros(batch_size)
true = np.zeros(batch_size)

for seq, labels in test_loader:
    y_pred = model(seq).squeeze()
    pred = np.append(pred, np.round(y_pred.cpu().detach().numpy()))
    true = np.append(true, labels.cpu().detach().numpy())

print(accuracy_score(true, pred))
print(precision_score(true, pred))
print(recall_score(true, pred))
print(f1_score(true, pred))
print(sklearn.metrics.confusion_matrix(y_pred=pred, y_true=true))

0.6099009900990099
0.6312625250501002
0.6
0.615234375
[[301 184]
 [210 315]]


In [4]:
te = TakensEmbedding(time_delay=1, dimension=5)
sample_train_tda = te.fit_transform(X_train)
sample_test_tda = te.fit_transform(X_test)

feature_train_list = []
feature_test_list = []

for i in tqdm(range(sample_train_tda.shape[0])):
    r = ripser(sample_train_tda[i, :, :])
    feature = np.delete(r['dgms'][0], -1, axis=0)[:, 1]
    feature2 = r['dgms'][1]
    l = []
    for i in range(feature2.shape[0]):
        l.append(feature2[i][1]-feature2[i][0])
    if feature2.shape[0]==0:
        f_min = 0
        f_max = 0
        f_2ave = 0
        f_2std = 0
    else:
        f_min = min(l)
        f_max = max(l)
        f_2ave = np.mean(feature2.flatten())
        f_2std = np.std(feature2.flatten())
    f = [feature.shape[0], np.sum(feature), np.mean(feature), np.std(feature), np.max(feature), np.min(feature), len([_ for _ in feature if _>0.5*np.max(feature)]), len([_ for _ in feature if _>np.mean(feature)]), f_min, f_max, f_2ave, f_2std]
    f = [np.round(_, 2) for _ in f]
    feature_train_list.append(np.array(f))

for i in tqdm(range(sample_test_tda.shape[0])):
    r = ripser(sample_test_tda[i, :, :])
    feature = np.delete(r['dgms'][0], -1, axis=0)[:, 1]
    f = [feature.shape[0], np.sum(feature), np.mean(feature), np.std(feature), np.max(feature), np.min(feature), len([_ for _ in feature if _>0.5*np.max(feature)]), len([_ for _ in feature if _>np.mean(feature)]), f_min, f_max, f_2ave, f_2std]
    f = [np.round(_, 2) for _ in f]
    feature_test_list.append(np.array(f))

100%|██████████| 2878/2878 [00:01<00:00, 2408.39it/s]
100%|██████████| 960/960 [00:00<00:00, 2774.49it/s]


In [13]:
# for i in range(20, 50):
#     r = ripser(sample_train_tda[i, :, :])
#     feature2 = r['dgms'][1]
#     print(feature2)
#     print(Y_train[i])

i = 10
r = ripser(sample_train_tda[i, :, :])
feature2 = r['dgms'][1]
print(feature2)
print(Y_train[i])
l = []
for j in range(feature2.shape[0]):
    l.append(feature2[j][1]-feature2[j][0])
if feature2.shape[0]==0:
    f_min = 0
    f_max = 0
    f_2ave = 0
    f_2std = 0
else:
    f_min = min(l)
    f_max = max(l)
    f_2ave = np.mean(feature2.flatten())
    f_2std = np.std(feature2.flatten())

[]
0.0


In [309]:
for i in range(20, 40):
    print(ripser(sample_train_tda[i, :, :])['dgms'][0])
    print(Y_train[i])

[[0.         0.07811994]
 [0.         0.07915831]
 [0.         0.08569729]
 [0.         0.0934767 ]
 [0.         0.10649204]
 [0.                inf]]
1.0
[[0.         0.01674768]
 [0.         0.02128126]
 [0.         0.02933499]
 [0.         0.0295048 ]
 [0.         0.03006888]
 [0.                inf]]
0.0
[[0.         0.03405164]
 [0.         0.0353781 ]
 [0.         0.03558036]
 [0.         0.0360937 ]
 [0.         0.03831566]
 [0.                inf]]
1.0
[[0.         0.02493963]
 [0.         0.03632173]
 [0.         0.03670402]
 [0.         0.03862363]
 [0.         0.04017391]
 [0.                inf]]
1.0
[[0.         0.01344275]
 [0.         0.01570959]
 [0.         0.01663523]
 [0.         0.01890967]
 [0.         0.03240649]
 [0.                inf]]
1.0
[[0.         0.0107632 ]
 [0.         0.01768062]
 [0.         0.01897987]
 [0.         0.01930363]
 [0.         0.02363755]
 [0.                inf]]
1.0
[[0.         0.02328856]
 [0.         0.02670768]
 [0.         0.02882

In [5]:
feature_train_pd = pd.DataFrame(np.concatenate((np.array(feature_train_list), Y_train.reshape(-1, 1)), axis=1), \
    columns=['Number of TDA barcode points', 'Sum of lifetime', 'Average of lifetime', 'Std of lifetime', 'Max of lifetime', \
        'Min of lifetime', 'Number of points bigger than 0.5*max', 'Number of points bigger than average', 'fmin', 'fmax', 'f2ave', 'f2std', 'Class'])

feature_test_pd = pd.DataFrame(np.concatenate((np.array(feature_test_list), Y_test.reshape(-1, 1)), axis=1), \
    columns=['Number of TDA barcode points', 'Sum of lifetime', 'Average of lifetime', 'Std of lifetime', 'Max of lifetime', \
        'Min of lifetime', 'Number of points bigger than 0.5*max', 'Number of points bigger than average', 'fmin', 'fmax', 'f2ave', 'f2std', 'Class'])

# print(feature_train_pd.corr())

In [6]:
X_train = feature_train_pd[['Number of TDA barcode points', 'Sum of lifetime', 'Average of lifetime', 'Std of lifetime', 'Max of lifetime', \
        'Min of lifetime', 'Number of points bigger than 0.5*max', 'Number of points bigger than average', 'fmin', 'fmax', 'f2ave', 'f2std']]
# X_train = feature_train_pd[['Sum of lifetime', 'Average of lifetime', 'Std of lifetime', 'Max of lifetime', 'fmin', 'fmax']]
Y_train = feature_train_pd[['Class']]

X_test = feature_test_pd[['Number of TDA barcode points', 'Sum of lifetime', 'Average of lifetime', 'Std of lifetime', 'Max of lifetime', \
        'Min of lifetime', 'Number of points bigger than 0.5*max', 'Number of points bigger than average', 'fmin', 'fmax', 'f2ave', 'f2std']]
# X_test = feature_test_pd[['Sum of lifetime', 'Average of lifetime', 'Std of lifetime', 'Max of lifetime', 'fmin', 'fmax']]
Y_test = feature_test_pd[['Class']]

In [26]:
clf = SVC(C=0.1, kernel='poly')
clf.fit(X_train, np.ravel(Y_train))

Y_pred = clf.predict(X_test)
print(accuracy_score(y_pred=Y_pred, y_true=Y_test))
print(precision_score(y_pred=Y_pred, y_true=Y_test))
print(recall_score(y_pred=Y_pred, y_true=Y_test))
print(f1_score(y_pred=Y_pred, y_true=Y_test))

print(sklearn.metrics.confusion_matrix(y_pred=Y_pred, y_true=Y_test))

0.5364583333333334
0.5364583333333334
1.0
0.6983050847457627
[[  0 445]
 [  0 515]]


In [315]:
clf = mixture.GaussianMixture(n_components=2)
clf.fit(X_train)

Y_pred = clf.predict(X_test)
print(accuracy_score(y_pred=Y_pred, y_true=Y_test))
print(precision_score(y_pred=Y_pred, y_true=Y_test))
print(recall_score(y_pred=Y_pred, y_true=Y_test))
print(f1_score(y_pred=Y_pred, y_true=Y_test))

print(sklearn.metrics.confusion_matrix(y_pred=Y_pred, y_true=Y_test))

0.5614583333333333
0.5614583333333333
1.0
0.7191460973982654
[[  0 421]
 [  0 539]]


In [316]:
X_np = X_train.to_numpy()
Y_np = Y_train.to_numpy()
zeros = np.where(Y_np == 0)[0]
ones = np.where(Y_np == 1)[0]

pca = PCA(n_components=2)

plt.scatter(pca.fit_transform(X_np[zeros])[:, 0], pca.fit_transform(X_np[zeros])[:, 1] ,color='red')
plt.scatter(pca.fit_transform(X_np[ones])[:, 0], pca.fit_transform(X_np[ones])[:, 1], color='blue')

<matplotlib.collections.PathCollection at 0x21e9faec3c8>

In [263]:
Y_pred = clf.predict(X_test)
print(accuracy_score(y_pred=Y_pred, y_true=Y_test))
print(precision_score(y_pred=Y_pred, y_true=Y_test))
print(recall_score(y_pred=Y_pred, y_true=Y_test))
print(f1_score(y_pred=Y_pred, y_true=Y_test))

print(sklearn.metrics.confusion_matrix(y_pred=Y_pred, y_true=Y_test))

0.5479166666666667
0.5479166666666667
1.0
0.7079407806191118
[[  0 434]
 [  0 526]]


In [23]:
class VanillaCLF(torch.nn.Module):
    def __init__(self, input_size, hidden_size, batch_first=True):
        super(VanillaCLF, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_first = batch_first
        self.mlp1 = torch.nn.Linear(in_features=input_size, out_features=hidden_size)
        self.mlp2 = torch.nn.Linear(in_features=hidden_size, out_features=1)
        self.ReLU = torch.nn.ReLU()
        self.Sigm = torch.nn.Sigmoid()

    def forward(self, x):
        # x: [batch_size, seq_len, input_size=1] need check   
        x = self.mlp1(x)
        x = self.Sigm(x)
        x = self.mlp2(x)
        
        return self.Sigm(x)

In [None]:
print(torch.Tensor(X_train.to_numpy()).to(device))
print(torch.Tensor(Y_train.to_numpy()).to(device))

In [24]:
input_size = 12
batch_size = 50
hidden_size = 256
epochs = 180
# device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
device="cpu"

train_loader = torch.utils.data.DataLoader(
    dataset = torch.utils.data.TensorDataset(torch.Tensor(X_train.to_numpy()).to(device), torch.Tensor(Y_train.to_numpy()).to(device)),
    batch_size = batch_size,
    shuffle = True
)

model = VanillaCLF(input_size=input_size, hidden_size=hidden_size)
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model.to(device)

model.train()
for i in range(epochs):
    for seq, labels in train_loader:
        optimizer.zero_grad()
        y_pred = model(seq).squeeze()
        # print(y_pred, labels)
        single_loss = loss_function(y_pred, labels.squeeze())
        single_loss.backward()
        optimizer.step()
    print("Train Step:", i, " loss: ", single_loss)

Train Step: 0  loss:  tensor(0.7032, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 1  loss:  tensor(0.6940, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 2  loss:  tensor(0.6893, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 3  loss:  tensor(0.6850, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 4  loss:  tensor(0.6467, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 5  loss:  tensor(0.6728, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 6  loss:  tensor(0.6634, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 7  loss:  tensor(0.6995, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 8  loss:  tensor(0.6923, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 9  loss:  tensor(0.6998, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 10  loss:  tensor(0.6666, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 11  loss:  tensor(0.6969, grad_fn=<BinaryCrossEntropyBackward0>)
Train Step: 12  loss:  tensor(0.6673, grad_fn=<BinaryCrossEntropyBackward0>)
Train Ste

In [10]:
yt = Y_train.to_numpy()
print(np.where(yt>1))
print(np.where(yt<0))

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))
