In [33]:
# !pip install liac-arff
# !pip install scikit-multilearn
# !pip uninstall scikit-learn -y
# !pip install scikit-learn==0.24.1

In [34]:
from warnings import filterwarnings
filterwarnings('ignore')

In [35]:
import tqdm
import time
import random
import numpy as np
from scipy import spatial
from scipy.stats import pearsonr
from skmultilearn.dataset import load_from_arff

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance

In [36]:
X, y = load_from_arff("Emotions.arff", label_count = 6)
X = X.toarray()
y = y.toarray()

index = np.arange(len(X))
np.random.shuffle(index)
X, y = X[index], y[index]

print(X.shape, y.shape)

(593, 72) (593, 6)


In [37]:
print(X)
print(y)

[[0.056494 0.121974 0.076265 ... 0.223948 0.158557 0.527927]
 [0.029022 0.553287 0.094462 ... 0.427735 0.164804 0.970506]
 [0.081374 0.272747 0.085733 ... 0.343547 0.276366 0.710924]
 ...
 [0.036299 0.064986 0.082104 ... 0.095982 0.520006 0.677943]
 [0.146109 0.304348 0.096113 ... 0.557821 0.244548 0.802369]
 [0.069209 0.133291 0.075974 ... 0.394914 0.255537 1.223636]]
[[0 0 1 1 1 0]
 [0 0 1 0 1 0]
 [1 0 0 0 0 1]
 ...
 [0 0 1 1 1 0]
 [0 0 0 0 0 1]
 [0 0 1 1 1 0]]


In [38]:
train_samples = int(X.shape[0] * 0.6)

X_train = X[:train_samples, :]
y_train = y[:train_samples]
X_test = X[train_samples:, :]
y_test = y[train_samples:]

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

X_train: (355, 72) y_train: (355, 6)
X_test: (238, 72) y_test: (238, 6)


In [39]:
d = X_train.shape[1] 
l = y_train.shape[1]
print("d:", d, "l:", l)

d: 72 l: 6


In [40]:
def score(y, pred):
    pred = pred.toarray()
    score = 0
    for i in range(y.shape[0]):
        score += np.sum(y[i] * pred[i]) / (np.sum(y[i]) + np.sum(pred[i]) - np.sum(y[i] * pred[i]))
    score /= y.shape[0]
    return score

## Without FS, taking the whole dataset

In [41]:
X_train_subset = X_train[:, :]
X_test_subset = X_test[:, :]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

X_train_subset: (355, 72) y_train: (355, 6)
X_test_subset: (238, 72) y_test: (238, 6)
lr train accu: 0.539
lr test_accu: 0.4634
nb train accu: 0.5764
nb test_accu: 0.5083
knn train accu: 0.5526
knn test_accu: 0.382
avg train accu: 0.556
avg test_accu: 0.4512


In [42]:
beg = time.time()
fcorr = np.corrcoef(X_train.T)
end = time.time()
fcorr = np.absolute(fcorr)
print(fcorr.shape)
print(fcorr)

(72, 72)
[[1.         0.60523737 0.15438141 ... 0.36558786 0.42313713 0.42783413]
 [0.60523737 1.         0.54975912 ... 0.49352091 0.52088132 0.51621682]
 [0.15438141 0.54975912 1.         ... 0.33977035 0.32297097 0.31709236]
 ...
 [0.36558786 0.49352091 0.33977035 ... 1.         0.65495891 0.90126544]
 [0.42313713 0.52088132 0.32297097 ... 0.65495891 1.         0.83343408]
 [0.42783413 0.51621682 0.31709236 ... 0.90126544 0.83343408 1.        ]]


In [43]:
flcorr = np.zeros((d, l))
for i in tqdm.tqdm(range(d)):
    for j in range(l):
        flcorr[i][j] = pearsonr(X_train[:, i], y_train[:, j])[0] + 0.001
flcorr = np.absolute(flcorr)
flcorr = np.max(flcorr, axis = 1)
print(flcorr.shape)
print(flcorr)

100%|██████████| 72/72 [00:00<00:00, 1389.26it/s]

(72,)
[0.39259245 0.455105   0.34080416 0.52255398 0.62151315 0.20416367
 0.21451703 0.26735442 0.1720577  0.17349259 0.156361   0.14841458
 0.14877108 0.10786582 0.25290941 0.1753263  0.39613385 0.39797063
 0.19262437 0.17274502 0.15113918 0.18979349 0.19007153 0.20424536
 0.21615553 0.25301846 0.23782989 0.23412046 0.23563233 0.25976841
 0.29118163 0.29181905 0.14150283 0.08336406 0.08448966 0.41979579
 0.43910157 0.30092829 0.32700052 0.39160592 0.37015563 0.43423607
 0.41363906 0.4311351  0.3875007  0.38729401 0.44657843 0.44568236
 0.28439564 0.24726389 0.07533552 0.41272181 0.45762954 0.38034559
 0.33505082 0.36325075 0.41640118 0.44445472 0.38522017 0.41046871
 0.3472166  0.36475614 0.40501012 0.37030581 0.3014735  0.20094647
 0.3119753  0.15834238 0.16830341 0.29468065 0.27354969 0.31446164]





## Base paper ka ACO, 10 features ka subset

In [44]:
tou = []
for i in tqdm.tqdm(range(d)):
    tou0 = 0
    for j in range(l):
        tou0 = max(tou0, 1 - spatial.distance.cosine(X_train[:, i], y_train[:, j]))
    tou.append(tou0)
tou = np.array(tou)
tou = (tou - np.min(tou)) / (np.max(tou) - np.min(tou))

q0 = 0.7
rho = 0.1
ants = 25
iterations = 40

for it in tqdm.tqdm(range(iterations)):
    fc = [0] * d
    for ant in range(ants):

        visited = set()
        unvisited = set(range(d))
        i = random.sample(list(unvisited), 1)[0]
        visited.add(i)
        unvisited.remove(i)

        while len(visited) < 10:
            ni = list(unvisited)
            p = []
            for j in ni:
                p.append(tou[j] * flcorr[j] / fcorr[i][j])
            p = np.array(p)
            p /= np.sum(p)
            q = random.random()
            j = -1
            if q >= q0:
                j = np.random.choice(ni, p = p)
            else:
                j = ni[np.argmax(p)]
            unvisited.remove(j)
            visited.add(j)
            i = j
        for i in visited:
            fc[i]+=1

    for i in range(d):
        tou[i] = (1 - rho) * tou[i] + fc[i] / sum(fc)

fs_ind = []
for i in range(d):
    fs_ind.append([tou[i], i])
fs_ind.sort(reverse = True)
fs_ind = np.array(fs_ind)
fs_ind = fs_ind[:10, 1].astype(int)

print(fs_ind)

X_train_subset = X_train[:, fs_ind]
X_test_subset = X_test[:, fs_ind]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

100%|██████████| 72/72 [00:00<00:00, 1941.67it/s]
100%|██████████| 40/40 [00:01<00:00, 38.17it/s]


[ 8 31 68 42 13 30 33 44 24 23]
X_train_subset: (355, 10) y_train: (355, 6)
X_test_subset: (238, 10) y_test: (238, 6)
lr train accu: 0.261
lr test_accu: 0.2447
nb train accu: 0.4528
nb test_accu: 0.3905
knn train accu: 0.5304
knn test_accu: 0.3561
avg train accu: 0.4147
avg test_accu: 0.3304


## Base paper ke ACO ka solution further optimized with random restructure LS with KNN wrapper

In [45]:
tmp = fs_ind[:]

In [46]:
X_train_subset = X_train[:, fs_ind]

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)
best_acc = score(y_train, knn.predict(X_train_subset))

ants = 40

for ant in tqdm.tqdm(range(ants)):

    visited = set(random.sample(fs_ind.tolist(), 5))
    unvisited = set(range(d)) - visited
    i = random.sample(list(visited), 1)[0]

    while len(visited) < 10:
        ni = list(unvisited)
        p = []
        for j in ni:
            p.append(tou[j] * flcorr[j] / fcorr[i][j])
        p = np.array(p)
        p /= np.sum(p)
        j = ni[np.argmax(p)]
        unvisited.remove(j)
        visited.add(j)
        i = j

    ind = np.array(list(visited))
    X_train_subset = X_train[:, ind]

    knn = BinaryRelevance(classifier = KNeighborsClassifier())
    knn.fit(X_train_subset, y_train)
    acc = score(y_train, knn.predict(X_train_subset))

    if acc > best_acc:
        best_acc = acc
        fs_ind = ind[:]

print(fs_ind)

X_train_subset = X_train[:, fs_ind]
X_test_subset = X_test[:, fs_ind]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

100%|██████████| 40/40 [00:03<00:00, 10.92it/s]


[33  4 68  8 42 13 23 28 30 31]
X_train_subset: (355, 10) y_train: (355, 6)
X_test_subset: (238, 10) y_test: (238, 6)
lr train accu: 0.3844
lr test_accu: 0.3694
nb train accu: 0.5039
nb test_accu: 0.4457
knn train accu: 0.5737
knn test_accu: 0.3725
avg train accu: 0.4873
avg test_accu: 0.3959


## Base paper ke ACO ka solution further optimized with random restructure LS with KNN + NB + LR wrapper

In [47]:
fs_ind = tmp[:]

In [48]:
X_train_subset = X_train[:, fs_ind]

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)
lr_train_score = score(y_train, lr.predict(X_train_subset))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)
nb_train_score = score(y_train, nb.predict(X_train_subset))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)
knn_train_score = score(y_train, knn.predict(X_train_subset))

best_acc = (lr_train_score + nb_train_score + knn_train_score) / 3

ants = 40

for ant in tqdm.tqdm(range(ants)):

    visited = set(random.sample(fs_ind.tolist(), 5))
    unvisited = set(range(d)) - visited
    i = random.sample(list(visited), 1)[0]

    while len(visited) < 10:
        ni = list(unvisited)
        p = []
        for j in ni:
            p.append(tou[j] * flcorr[j] / fcorr[i][j])
        p = np.array(p)
        p /= np.sum(p)
        j = ni[np.argmax(p)]
        unvisited.remove(j)
        visited.add(j)
        i = j

    ind = np.array(list(visited))
    X_train_subset = X_train[:, ind]

    lr = BinaryRelevance(classifier = LogisticRegression())
    lr.fit(X_train_subset, y_train)
    lr_train_score = score(y_train, lr.predict(X_train_subset))

    nb = BinaryRelevance(classifier = GaussianNB())
    nb.fit(X_train_subset, y_train)
    nb_train_score = score(y_train, nb.predict(X_train_subset))

    knn = BinaryRelevance(classifier = KNeighborsClassifier())
    knn.fit(X_train_subset, y_train)
    knn_train_score = score(y_train, knn.predict(X_train_subset))

    acc = (lr_train_score + nb_train_score + knn_train_score) / 3

    if acc > best_acc:
        best_acc = acc
        fs_ind = ind[:]

print(fs_ind)

X_train_subset = X_train[:, fs_ind]
X_test_subset = X_test[:, fs_ind]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

100%|██████████| 40/40 [00:10<00:00,  3.76it/s]


[34  4  8 11 44 14 55 23 24 31]
X_train_subset: (355, 10) y_train: (355, 6)
X_test_subset: (238, 10) y_test: (238, 6)
lr train accu: 0.3913
lr test_accu: 0.3673
nb train accu: 0.4953
nb test_accu: 0.473
knn train accu: 0.609
knn test_accu: 0.3918
avg train accu: 0.4985
avg test_accu: 0.4107
