## Getting indices and labels for all data (train & test)

In [1]:
import pandas as pd, numpy as np, random

# One shared seed for every RNG this notebook touches, so reruns are reproducible
SEED = 42

random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Dataset: one pre-embedding vector per row of `pe`
from ast import literal_eval

pe = pd.read_csv(
    "../data/kagdata/pre_embeddings_std_combined.csv",
    converters={'pre_embed_vec': literal_eval},  # each cell is a stringified Python literal
)
# Turn every parsed literal into a NumPy array
pe['pre_embed_vec'] = pe['pre_embed_vec'].apply(np.array)

# Stack the per-row vectors into a single 2-D matrix (one row per entry of `pe`)
pre_embeddings = np.vstack(pe['pre_embed_vec'].to_list())
pre_embeddings.shape # NOTE: (1092, 981)

(1092, 981)

In [4]:
# Generate edge set & labels (y)
data = pd.read_csv("../data/kagdata/combined.csv")
train_edges = data[['pre requisite', 'concept']]
y = pd.read_csv("../data/kagdata/train.csv")['label'].values

# Using pe['title'] as node index and comparing with train_edges to generate edge_indices.
# Build the title -> index lookup once instead of scanning all of `pe` for every edge
# endpoint. If a title occurs more than once, keep its first index, which is what
# `pe[pe['title'] == x].index[0]` would have returned.
title_to_idx = pd.Series(pe.index, index=pe['title'])
title_to_idx = title_to_idx[~title_to_idx.index.duplicated(keep='first')]

src_idx = train_edges['pre requisite'].map(title_to_idx)
dst_idx = train_edges['concept'].map(title_to_idx)

# `.map` yields NaN for titles absent from `pe`; fail loudly and name them rather than
# letting NaNs (and a silent float upcast) leak into the index array.
unmatched = src_idx.isna() | dst_idx.isna()
if unmatched.any():
    raise KeyError(
        f"{int(unmatched.sum())} edge(s) reference titles missing from pe['title'], e.g. "
        f"{train_edges[unmatched].head().values.tolist()}"
    )

# Two columns: [prerequisite node index, concept node index], one row per edge
edge_indices = np.c_[src_idx.values, dst_idx.values]
edge_indices

array([[410,  31],
       [759,  31],
       [478,  31],
       ...,
       [957, 877],
       [804, 604],
       [845,  49]])

In [5]:
# (number of edges, 2)
np.shape(edge_indices)

(3411, 2)

In [6]:
# Sanity check: rebuild the (prerequisite, concept) title pairs from edge_indices and
# compare them element-wise against the original edge list; shapes are shown alongside
titles = pe["title"]
recons_train_edges = np.array([[titles[src], titles[dst]] for src, dst in edge_indices])
np.all(train_edges == recons_train_edges), train_edges.shape, recons_train_edges.shape

(True, (3411, 2), (3411, 2))

In [7]:
# Pad y with zeros so there is one label per row of edge_indices. The labels come from
# train.csv only, so the trailing edges (presumably the test split - confirm) receive a
# placeholder label of 0.
# The pad length is derived rather than hard-coded, which also makes this cell safe to
# re-run: once y is full length, n_unlabeled is 0 and nothing more is appended.
n_unlabeled = len(edge_indices) - len(y)
if n_unlabeled < 0:
    raise ValueError(f"y has {len(y)} labels but there are only {len(edge_indices)} edges")
y = np.concatenate([y, np.zeros(n_unlabeled)])
y.shape

(3411,)

In [8]:
# Generate [edge_indices, y]: append y as a third column next to the two index columns.
# The combined array is float because y is float after zero-padding.
labels = np.concatenate([edge_indices, y.reshape(-1, 1)], axis=1)
labels

array([[410.,  31.,   1.],
       [759.,  31.,   1.],
       [478.,  31.,   1.],
       ...,
       [957., 877.,   0.],
       [804., 604.,   0.],
       [845.,  49.,   0.]])

In [10]:
# (number of edges, 3): two node indices plus the label
np.shape(labels)

(3411, 3)

In [9]:
# NOTE: 1113 positive labels. The training set is the first 2797 rows (3411 total - 614
# zero-padded), so it holds 2797 - 1113 = 1684 negative labels <--- Training set
np.count_nonzero(labels[:, -1])

1113

In [53]:
# Save to disk (opt-in). Writing is off by default so Run All never touches the file;
# set SAVE_LABELS = True to export.
SAVE_LABELS = False
LABELS_PATH = "../data/kagdata/edge_indices_labels_combined.npy"

if SAVE_LABELS:
    # "xb" = exclusive binary create: raises FileExistsError instead of overwriting an
    # existing export, so a previous result cannot be clobbered by accident.
    with open(LABELS_PATH, "xb") as labels_file:
        np.save(labels_file, labels)