# Feature Construction
Run this notebook to create the edge features that will be used to train a logistic regression model to predict edges. The edge features will be saved as CSV files to be loaded in the `predict_edges.ipynb` notebook.

In [None]:
import os
import numpy as np

# --- Run switches ------------------------------------------------------------
# Set either flag to True to also build features for that test set.
intermediate = False
final = True

# --- CHIP model size ---------------------------------------------------------
# Number of communities (K) passed to the CHIP fit for each dataset.
n_communities_A = 3
n_communities_B = 110

# --- Root directories --------------------------------------------------------
# Directories holding the dataset CSVs and receiving the generated features.
datasetA_path = '.'
datasetB_path = '.'
featureA_path = '.'
featureB_path = '.'


def _in_dir(directory):
    """Return a function that joins a file name onto ``directory``."""
    return lambda filename: os.path.join(directory, filename)


# One joiner per root directory keeps the path definitions below short.
_dataA, _dataB = _in_dir(datasetA_path), _in_dir(datasetB_path)
_featA, _featB = _in_dir(featureA_path), _in_dir(featureB_path)

# --- Dataset A inputs --------------------------------------------------------
dataA_train_path = _dataA('edges_train_A.csv')
dataA_node_features_path = _dataA('node_features.csv')
dataA_edge_type_features_path = _dataA('edge_type_features.csv')
dataA_init_path = _dataA('input_A_initial.csv')
dataA_inter_path = _dataA('input_A_intermediate.csv')
dataA_final_path = _dataA('input_A.csv')

# --- Dataset B inputs --------------------------------------------------------
dataB_train_path = _dataB('edges_train_B.csv')
dataB_init_path = _dataB('input_B_initial.csv')
dataB_inter_path = _dataB('input_B_intermediate.csv')
dataB_final_path = _dataB('input_B.csv')

# --- Similarity feature outputs (dataset A) ----------------------------------
featureA_edge_init_path = _featA('featureA_edge_initial.csv')
featureA_edge_inter_path = _featA('featureA_edge_intermediate.csv')
featureA_edge_final_path = _featA('featureA_edge_final.csv')

# --- CHIP feature outputs (file names embed the community count) -------------
featureA_CHIP_init_path = _featA(f'featureA_CHIP_K_{n_communities_A}_initial.csv')
featureA_CHIP_inter_path = _featA(f'featureA_CHIP_K_{n_communities_A}_intermediate.csv')
featureA_CHIP_final_path = _featA(f'featureA_CHIP_K_{n_communities_A}_final.csv')
featureB_CHIP_init_path = _featB(f'featureB_CHIP_K_{n_communities_B}_initial.csv')
featureB_CHIP_inter_path = _featB(f'featureB_CHIP_K_{n_communities_B}_intermediate.csv')
featureB_CHIP_final_path = _featB(f'featureB_CHIP_K_{n_communities_B}_final.csv')

# Dataset A

## Construct node and edge type similarity features (fast)
These similarity-based features use only the provided node features, edge type features, and the frequency of training edges between a node pair. They took about 5 min to compute.

In [None]:
from edge_features import load_dataset, load_node_features, load_edge_type_features, compute_edge_frequencies, \
    create_node_similarities, create_edge_type_similarities, create_node_sim_features, create_edge_type_sim_features

# Build the similarity lookups once from the node features, the edge type
# features, and the training edges; they are reused for every split below.
node_features = load_node_features(dataA_node_features_path)
print('Node Features Done')
edge_features = load_edge_type_features(dataA_edge_type_features_path)
print('Edge Features Done')
edge_dist = compute_edge_frequencies(dataA_train_path)
print('Edge Dist Done')

node_similarities = create_node_similarities(node_features)
print('Node similarities Done')
edge_similarities = create_edge_type_similarities(edge_features)
print('Edge type similarities Done')

# --- Initial set (always computed) ---
dataA_init = load_dataset(dataA_init_path)
print('Load initial A Done')

node_sim_feat_initA = create_node_sim_features(node_similarities, dataA_init)
print('Node features initial Done')
edge_type_sim_feat_initA = create_edge_type_sim_features(edge_similarities, edge_dist, dataA_init)
print('Edge type features initial Done')
# Stack the two features along a new last axis and write them as CSV.
# NOTE(review): presumably each feature is a 1-D array with one entry per row
# of the input, giving a two-column file -- confirm against edge_features.py.
np.savetxt(featureA_edge_init_path, np.stack((node_sim_feat_initA, edge_type_sim_feat_initA), axis=-1), delimiter=',')

# --- Intermediate set (optional) ---
if intermediate:
    dataA_inter = load_dataset(dataA_inter_path)
    print('Load intermediate A done')
    node_sim_feat_interA = create_node_sim_features(node_similarities, dataA_inter)
    # Progress messages here previously said "final", which mislabelled the
    # intermediate split in the log output.
    print('Node features intermediate Done')
    edge_type_sim_feat_interA = create_edge_type_sim_features(edge_similarities, edge_dist, dataA_inter)
    print('Edge type features intermediate Done')
    np.savetxt(featureA_edge_inter_path, np.stack((node_sim_feat_interA, edge_type_sim_feat_interA), axis=-1),
               delimiter=',')

# --- Final set (optional) ---
if final:
    dataA_final = load_dataset(dataA_final_path)
    print('Load final A done')
    node_sim_feat_finalA = create_node_sim_features(node_similarities, dataA_final)
    print('Node features final Done')
    edge_type_sim_feat_finalA = create_edge_type_sim_features(edge_similarities, edge_dist, dataA_final)
    print('Edge type features final Done')
    np.savetxt(featureA_edge_final_path, np.stack((node_sim_feat_finalA, edge_type_sim_feat_finalA), axis=-1),
               delimiter=',')

## Construct CHIP feature (medium)
This feature involves fitting the CHIP generative model to all training edges while ignoring the edge types. It took about 30 min to compute.

In [None]:
from chip_features import create_dictionary_chip_A, fit_chip_A, create_feature_chip_A

# Create and save the dictionary of events for CHIP. This step must run before
# the fit: `dataA_train_tuple` is not assigned anywhere else in this notebook,
# so leaving it commented out raises NameError on a fresh kernel.
dataA_train_tuple = create_dictionary_chip_A(dataA_train_path, savepath=os.path.join(featureA_path, 'datasetA_train_chip.p'))
print('Finished creating event dictionary')
# Fit the CHIP model and save its parameters.
fitA_params = fit_chip_A(dataA_train_tuple, K=n_communities_A,
                         savepath=os.path.join(featureA_path, f'fitA_chip_K_{n_communities_A}.p'))
print('Finished fitting CHIP')

# Compute CHIP features on the initial dataset.
# `dtype=int` replaces `np.int`, an alias of the builtin `int` that was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
dataA_init_np = np.loadtxt(dataA_init_path, dtype=int, delimiter=',', usecols=(0, 1, 2, 3, 4))
featureA_CHIP_init = create_feature_chip_A(dataA_train_tuple, dataA_init_np, fitA_params, savepath=featureA_CHIP_init_path)
print('CHIP features initial done')

if intermediate:
    dataA_inter_np = np.loadtxt(dataA_inter_path, dtype=int, delimiter=',', usecols=(0, 1, 2, 3, 4))
    featureA_CHIP_inter = create_feature_chip_A(dataA_train_tuple, dataA_inter_np, fitA_params,
                                                savepath=featureA_CHIP_inter_path)
    print('CHIP features intermediate done')

if final:
    dataA_final_np = np.loadtxt(dataA_final_path, dtype=int, delimiter=',', usecols=(0, 1, 2, 3, 4))
    featureA_CHIP_final = create_feature_chip_A(dataA_train_tuple, dataA_final_np, fitA_params,
                                                savepath=featureA_CHIP_final_path)
    print('CHIP features final done')

# Dataset B

## Construct CHIP feature (slow)
This feature involves fitting the CHIP generative model to all training edges while ignoring the edge types. It took about 2.5 hours to compute.

In [None]:
from chip_features import create_dictionary_chip_B, fit_chip_B, create_feature_chip_B

# Create and save the dictionary of events for CHIP.
dataB_train_tuple = create_dictionary_chip_B(dataB_train_path, savepath=os.path.join(featureB_path, 'datasetB_train_chip.p'))
print('Finished creating event dictionary')
# Fit the CHIP model and save its parameters.
fitB_params = fit_chip_B(dataB_train_tuple, K=n_communities_B,
                         savepath=os.path.join(featureB_path, f'fitB_chip_K_{n_communities_B}.p'))
print('Finished fitting CHIP')

# Compute CHIP features on the initial dataset.
# `dtype=int` replaces `np.int`, an alias of the builtin `int` that was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
dataB_init = np.loadtxt(dataB_init_path, dtype=int, delimiter=',', usecols=(0, 1, 2, 3, 4))
featureB_CHIP_init = create_feature_chip_B(dataB_train_tuple, dataB_init, fitB_params, savepath=featureB_CHIP_init_path)
print('CHIP features initial done')

if intermediate:
    dataB_inter = np.loadtxt(dataB_inter_path, dtype=int, delimiter=',', usecols=(0, 1, 2, 3, 4))
    featureB_CHIP_inter = create_feature_chip_B(dataB_train_tuple, dataB_inter, fitB_params,
                                                savepath=featureB_CHIP_inter_path)
    print('CHIP features intermediate done')

if final:
    dataB_final = np.loadtxt(dataB_final_path, dtype=int, delimiter=',', usecols=(0, 1, 2, 3, 4))
    featureB_CHIP_final = create_feature_chip_B(dataB_train_tuple, dataB_final, fitB_params,
                                                savepath=featureB_CHIP_final_path)
    print('CHIP features final done')
