## To generate pre_embeddings for combined `train.csv` & `test.csv`

In [2]:
import pandas as pd, numpy as np

In [3]:
# Locations of the two input CSVs, relative to this notebook
data_path = "../data/kagdata/train.csv"   # NOTE: Train data
tdata_path = "../data/kagdata/test.csv"   # NOTE: Test data

# Load each split into its own frame: `data` (train) and `tdata` (test)
data = pd.read_csv(data_path)
tdata = pd.read_csv(tdata_path)

In [4]:
# Remove the label and the two taxonomy columns from the train frame.
# NOTE: `data` is rebound here, so re-running this cell without reloading the CSV
# raises a KeyError (the columns are already gone).
data = data.drop(
    ['label', 'pre requisite taxonomy', 'concept taxonomy'],
    axis=1,
)

In [5]:
# Remove the ID and the two taxonomy columns from the test frame
# (here 'ID' is dropped instead of the 'label' column removed from the train frame).
# NOTE: `tdata` is rebound here, so re-running this cell without reloading the CSV
# raises a KeyError (the columns are already gone).
tdata = tdata.drop(
    ['ID', 'pre requisite taxonomy', 'concept taxonomy'],
    axis=1,
)

* There are some common entries between "concept" & "pre requisite".

In [6]:
# Distinct titles on each side of the train edges
# (recorded from the original run: 666 concepts, 439 pre requisites)
cons = pd.unique(data['concept'])
prs = pd.unique(data['pre requisite'])

# NOTE: Sanity check - at least one title shows up both as a "concept" and as a "pre requisite"
assert any(c in prs for c in cons)

In [7]:
# Distinct titles on each side of the test edges
# (recorded from the original run: 475 concepts, 461 pre requisites)
tcons = pd.unique(tdata['concept'])
tprs = pd.unique(tdata['pre requisite'])

# NOTE: Sanity check - at least one title shows up both as a "concept" and as a "pre requisite"
assert any(c in tprs for c in tcons)

In [8]:
# NOTE: Some edges (entries) also occur with "concept" & "pre requisite" swapped
def get_rev_edge_count(dat):
    """
    Counts the rows of `dat` whose reversed edge is also a row of `dat`.

    A row (concept=a, pre requisite=b) is counted when some row
    (concept=b, pre requisite=a) exists. Both members of a swapped pair are
    counted, and a row with concept == pre requisite matches itself.

    dat: DataFrame with 'concept' and 'pre requisite' columns
         (values are compared after conversion to str).

    Returns a 1-tuple `(count,)`, not a bare int.

    NOTE: An (n, n, 2) boolean array is built, so memory grows quadratically
    with the number of rows.
    """
    forward = dat[['concept', 'pre requisite']].values.astype(str)
    backward = forward[:, ::-1]  # same rows with the two columns swapped

    # Pairwise comparison, as in https://stackoverflow.com/a/67113105/11922029:
    # pair_equal[i, j] is True when forward row i equals backward row j in both columns
    pair_equal = (forward[:, None, :] == backward[None, :, :]).all(axis=-1)
    has_reverse = pair_equal.any(axis=1)

    return (int(np.count_nonzero(has_reverse)),)

# NOTE: Number of entries whose permuted edge also exists, per split
# (recorded from the original run: 46 for train, 208 for test)
train_rev_count = get_rev_edge_count(data)
test_rev_count = get_rev_edge_count(tdata)

print("Training\tTesting")
print(train_rev_count, "\t\t", test_rev_count)

Training	Testing
(46,) 		 (208,)


## Merging the two dataframes

In [9]:
# Stack the train rows on top of the test rows and renumber the index from 0
dat = pd.concat((data, tdata), ignore_index=True)

# (rows, columns) of the combined frame - (3411, 983) in the original run
dat.shape

(3411, 983)

In [10]:
# NOTE: Merging dataframes and saving to disk - Used later to get node indices in inds_lbls.ipynb 
# dat.to_csv("../data/kagdata/combined.csv", index=False) # NOTE: Careful about overwriting

* Test `dat` in the same manner as `data` (train) & `tdata` (test) above.

In [11]:
# Distinct titles on each side of the combined edges
# (recorded from the original run: 881 concepts, 720 pre requisites)
# NOTE: `cons` / `prs` are rebound here and now describe `dat`, not the train frame
cons = pd.unique(dat['concept'])
prs = pd.unique(dat['pre requisite'])

# NOTE: Sanity check - at least one title shows up both as a "concept" and as a "pre requisite"
assert any(c in prs for c in cons)

# NOTE: Number of entries whose permuted edge also exists in the combined frame.
# 262 = 46 + 208 + 8, i.e. there are 8 permuted edges between train.csv & test.csv.
print(get_rev_edge_count(dat))

(262,)


### For each title (fixed), aggregate over every edge it appears in — as "concept", as "pre requisite", or both

In [12]:
# Partition the titles of `dat` into three disjoint groups
# (sizes recorded from the original run are given next to each line)
shared = np.intersect1d(cons, prs)         # 509 - appear on both sides
cons_uniq = np.setdiff1d(cons, shared)     # 372 - appear only as "concept"
prs_uniq = np.setdiff1d(prs, shared)       # 211 - appear only as "pre requisite"

# Every title exactly once, in the order: concept-only, pre-requisite-only, shared
overall = np.concatenate((cons_uniq, prs_uniq, shared))

# NOTE: 1 entry in metadata is not used at all in train.csv
# Displayed as a consistency check: the group sizes must add up to the length of `overall`
len(cons_uniq) + len(prs_uniq) + len(shared), overall.shape

(1092, (1092,))

In [13]:
def _summed_edge_features(role, title):
    """
    Sums, column by column, the non-title columns of every row of `dat`
    whose `role` column equals `title`.

    role: name of the column to match on ('concept' or 'pre requisite')
    title: value to look for in that column

    Returns a pandas Series indexed by the remaining column names.
    """
    matching_rows = dat[dat[role] == title]
    return matching_rows.drop(columns=['concept', 'pre requisite']).sum(axis=0)


# One [title, summed vector] entry per title, in the order:
# concept-only titles, pre-requisite-only titles, shared titles
title_pre_embed = []

for title in cons_uniq:
    # Simple summed aggregate over all "pre requisites" for this "concept"
    pre_embed = _summed_edge_features('concept', title)
    title_pre_embed.append([title, list(pre_embed.values)])

for title in prs_uniq:
    # Simple summed aggregate over all "concepts" for this "pre requisite"
    pre_embed = _summed_edge_features('pre requisite', title)
    title_pre_embed.append([title, list(pre_embed.values)])

for title in shared:
    # Rows where the title is the "concept": aggregate over its "pre requisites"
    con_pre_embed = _summed_edge_features('concept', title)
    # Rows where the title is the "pre requisite": aggregate over its "concepts"
    pr_pre_embed = _summed_edge_features('pre requisite', title)

    # A row with concept == pre requisite == title contributes to both sums
    title_pre_embed.append([title, list((con_pre_embed + pr_pre_embed).values)])

In [14]:
# One row per title: the title itself and its summed vector (stored as a Python list).
# Column names are given at construction, so an empty `title_pre_embed` produces an
# empty two-column frame instead of failing on the column assignment.
pre_embeddings = pd.DataFrame(title_pre_embed, columns=['title', 'pre_embed_vec'])

In [16]:
# NOTE: Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# scaler = MinMaxScaler()
scaler = StandardScaler()


def _scale_vector(vec):
    """
    Scales one vector on its own and returns it as a flat Python list.

    The vector is reshaped into a single column, so the scaler is re-fitted on
    (and applied to) the entries of this vector only, independently of every
    other row of `pre_embeddings`.
    """
    as_column = np.array(vec).reshape(-1, 1)
    return scaler.fit_transform(as_column).reshape(-1).tolist()


# Run scaler on each row in pre_embeddings['pre_embed_vec'] and store the result back as a list
pre_embeddings['pre_embed_vec'] = pre_embeddings['pre_embed_vec'].apply(_scale_vector)

# Save to csv | NOTE: 981-length un/scaled vector for each title
# pre_embeddings.to_csv("../data/kagdata/pre_embeddings_std_combined.csv", index=False) # NOTE: Careful about overwriting