# Edge Prediction
Run this notebook to train one logistic regression model per dataset (A and B) and write predicted edge probabilities for the intermediate and/or final test sets. You must first run the `construct_features.ipynb` notebook to create the edge features.

In [1]:
import os
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Switches: which test sets to generate predictions for
intermediate = True
final = True

# CHIP community counts; these are embedded in the CHIP feature file names below
n_communities_A = 3
n_communities_B = 110

# Base directories (all default to the notebook's working directory)
datasetA_path = '.'
datasetB_path = '.'
featureA_path = '.'
featureB_path = '.'
predictionA_path = '.'
predictionB_path = '.'

# Initial (labelled) input data
dataA_init_path = os.path.join(datasetA_path, 'input_A_initial.csv')
dataB_init_path = os.path.join(datasetB_path, 'input_B_initial.csv')

# CHIP feature files share a per-dataset prefix that encodes the community count
chipA_stem = f'featureA_CHIP_K_{n_communities_A}'
chipB_stem = f'featureB_CHIP_K_{n_communities_B}'

# Dataset A features: CHIP feature plus edge features, one file per split
featureA_CHIP_init_path = os.path.join(featureA_path, chipA_stem + '_initial.csv')
featureA_CHIP_inter_path = os.path.join(featureA_path, chipA_stem + '_intermediate.csv')
featureA_CHIP_final_path = os.path.join(featureA_path, chipA_stem + '_final.csv')
featureA_edge_init_path = os.path.join(featureA_path, 'featureA_edge_initial.csv')
featureA_edge_inter_path = os.path.join(featureA_path, 'featureA_edge_intermediate.csv')
featureA_edge_final_path = os.path.join(featureA_path, 'featureA_edge_final.csv')

# Dataset B features: CHIP feature only, one file per split
featureB_CHIP_init_path = os.path.join(featureB_path, chipB_stem + '_initial.csv')
featureB_CHIP_inter_path = os.path.join(featureB_path, chipB_stem + '_intermediate.csv')
featureB_CHIP_final_path = os.path.join(featureB_path, chipB_stem + '_final.csv')

# Prediction output files
predictionA_inter_path = os.path.join(predictionA_path, 'output_A_intermediate.csv')
predictionA_final_path = os.path.join(predictionA_path, 'output_A.csv')
predictionB_inter_path = os.path.join(predictionB_path, 'output_B_intermediate.csv')
predictionB_final_path = os.path.join(predictionB_path, 'output_B.csv')

# Print arrays with 4 decimals and without scientific notation
np.set_printoptions(precision=4, suppress=True)

# Dataset A

## Load and preprocess data

In [2]:
# Initial data for dataset A; its last column is used as the training labels
dataA_init = np.loadtxt(dataA_init_path, delimiter=',')
labelsA_init = dataA_init[:, -1]

# Initial feature matrix: the edge feature columns followed by the CHIP
# feature, which is given an explicit column axis before being appended.
# NOTE(review): assumes the edge file loads as 2-D and the CHIP file as 1-D.
featureA_edge_init = np.loadtxt(featureA_edge_init_path, delimiter=',')
featureA_CHIP_init = np.loadtxt(featureA_CHIP_init_path, delimiter=',')
featureA_init = np.concatenate(
    (featureA_edge_init, np.expand_dims(featureA_CHIP_init, axis=1)), axis=1)

# Standardize: the scaler is fitted on the initial features only and the
# same fitted scaler is reused for the intermediate/final features below
scalerA = StandardScaler().fit(featureA_init)
featureA_init = scalerA.transform(featureA_init)

if intermediate:
    # Intermediate test set: same column layout, scaled with scalerA
    featureA_edge_inter = np.loadtxt(featureA_edge_inter_path, delimiter=',')
    featureA_CHIP_inter = np.loadtxt(featureA_CHIP_inter_path, delimiter=',')
    featureA_inter = scalerA.transform(np.concatenate(
        (featureA_edge_inter, np.expand_dims(featureA_CHIP_inter, axis=1)), axis=1))

if final:
    # Final test set: same column layout, scaled with scalerA
    featureA_edge_final = np.loadtxt(featureA_edge_final_path, delimiter=',')
    featureA_CHIP_final = np.loadtxt(featureA_CHIP_final_path, delimiter=',')
    featureA_final = scalerA.transform(np.concatenate(
        (featureA_edge_final, np.expand_dims(featureA_CHIP_final, axis=1)), axis=1))

## Train logistic regression and make predictions

In [3]:
# Fit an L2-regularized logistic regression on the standardized initial features
lr_args = dict(penalty='l2', C=0.01, solver='lbfgs')
lrA = LogisticRegression(**lr_args).fit(featureA_init, labelsA_init)

# For each enabled test set, keep the last column of predict_proba (the
# probability of the last class in lrA.classes_) and write one value per line.
# NOTE(review): presumably the "edge present" class - confirm the label encoding.
if intermediate:
    predictionA_inter = lrA.predict_proba(featureA_inter)[:, -1]
    np.savetxt(fname=predictionA_inter_path, X=predictionA_inter, fmt='%.16g')

if final:
    predictionA_final = lrA.predict_proba(featureA_final)[:, -1]
    np.savetxt(fname=predictionA_final_path, X=predictionA_final, fmt='%.16g')

# Dataset B

## Load and preprocess data

In [4]:
# Initial data for dataset B; its last column is used as the training labels
dataB_init = np.loadtxt(dataB_init_path, delimiter=',')
labelsB_init = dataB_init[:, -1]

# Dataset B uses the CHIP feature alone, so the loaded vector is given an
# explicit column axis to form a 2-D feature matrix
featureB_init = np.expand_dims(
    np.loadtxt(featureB_CHIP_init_path, delimiter=','), axis=1)

# Standardize: the scaler is fitted on the initial features only and the
# same fitted scaler is reused for the intermediate/final features below
scalerB = StandardScaler().fit(featureB_init)
featureB_init = scalerB.transform(featureB_init)

if intermediate:
    # Intermediate test set: single CHIP column, scaled with scalerB
    featureB_inter = np.expand_dims(
        np.loadtxt(featureB_CHIP_inter_path, delimiter=','), axis=1)
    featureB_inter = scalerB.transform(featureB_inter)

if final:
    # Final test set: single CHIP column, scaled with scalerB
    featureB_final = np.expand_dims(
        np.loadtxt(featureB_CHIP_final_path, delimiter=','), axis=1)
    featureB_final = scalerB.transform(featureB_final)

## Train logistic regression and make predictions

In [5]:
# Fit an L2-regularized logistic regression on the standardized initial feature
# (note the smaller C than the one used for dataset A)
lr_args = dict(penalty='l2', C=0.001, solver='lbfgs')
lrB = LogisticRegression(**lr_args).fit(featureB_init, labelsB_init)

# For each enabled test set, keep the last column of predict_proba (the
# probability of the last class in lrB.classes_) and write one value per line.
# NOTE(review): presumably the "edge present" class - confirm the label encoding.
if intermediate:
    predictionB_inter = lrB.predict_proba(featureB_inter)[:, -1]
    np.savetxt(fname=predictionB_inter_path, X=predictionB_inter, fmt='%.16g')

if final:
    predictionB_final = lrB.predict_proba(featureB_final)[:, -1]
    np.savetxt(fname=predictionB_final_path, X=predictionB_final, fmt='%.16g')