In [None]:
"""
MSGLasso.ipynb

Created on Wed Nov 29 2023

@author: Lukas

This file runs multivariate sparse group lasso on graph dataset properties
to predict model accuracy, following "A Metadata-Driven Approach to Understand Graph Neural Networks"
"""

In [1]:
# install asgl

!pip install asgl

Collecting asgl
  Downloading asgl-1.0.5.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: asgl
  Building wheel for asgl (setup.py) ... [?25l[?25hdone
  Created wheel for asgl: filename=asgl-1.0.5-py3-none-any.whl size=16002 sha256=fe104bce724a63851f14bfb4517f08f265cc9c8057737e54c57904552577550b
  Stored in directory: /root/.cache/pip/wheels/4b/ac/16/8caac90091e10a732feb3c240d6dcf472c4b0c7f28d2b96479
Successfully built asgl
Installing collected packages: asgl
Successfully installed asgl-1.0.5


In [3]:
# Import required packages
import asgl
import pandas as pd
import numpy as np

In [8]:
# Define parameters grid
lambda1 = (10.0 ** np.arange(-3, 1.51, 0.2)) # 23 possible values for lambda
alpha = np.arange(0, 1, 0.05) # 20 possible values for alpha

# Define model parameters
model = 'lm'  # linear model
penalization = 'sgl'  # sparse group lasso penalization
parallel = True  # Code executed in parallel
error_type = 'MSE'  # Error measuremente considered. MSE stands for Mean Squared Error.

In [9]:
# Define a cross validation object
cv_class = asgl.CV(model=model, penalization=penalization, lambda1=lambda1, alpha=alpha,
                   nfolds=5, error_type=error_type, parallel=parallel, random_state=99)

# Compute error using k-fold cross validation
error = cv_class.cross_validation(x=x, y=y, group_index=group_index)

# Obtain the mean error across different folds
error = np.mean(error, axis=1)

# Select the minimum error
minimum_error_idx = np.argmin(error)

# Select the parameters associated to mininum error values
optimal_parameters = cv_class.retrieve_parameters_value(minimum_error_idx)
optimal_lambda = optimal_parameters.get('lambda1')
optimal_alpha = optimal_parameters.get('alpha')



In [10]:
# Define asgl class using optimal values
asgl_model = asgl.ASGL(model=model, penalization=penalization, lambda1=optimal_lambda, alpha=optimal_alpha)

# Split data into train / test
train_idx, test_idx = asgl.train_test_split(nrows=x.shape[0], train_pct=0.7, random_state=1)

# Solve the model
asgl_model.fit(x=x[train_idx, :], y=y[train_idx], group_index=group_index)

# Obtain betas
final_beta_solution = asgl_model.coef_[0]

# Obtain predictions
final_prediction = asgl_model.predict(x_new=x[test_idx, :])

# Obtain final errors
final_error = asgl.error_calculator(y_true=y[test_idx], prediction_list=final_prediction, error_type=error_type)

**Record Graph Properties**

In [None]:
# load in the data

mutag = list(TUDataset(root="data", name="MUTAG"))
enzymes = list(TUDataset(root="data", name="ENZYMES"))
proteins = list(TUDataset(root="data", name="PROTEINS"))
imdb = list(TUDataset(root="data", name="IMDB-BINARY"))

In [None]:
# for each dataset, create a dataframe with the graph index and the properties, 
# i.e. edge density, average degree, degree assortativity, pseudo diameter, average clustering coefficient,
# transitivity, algebraic connectivity, curvature gap, and relative size of the largest clique

# MUTAG
mutag_df = pd.DataFrame(columns=['graph_index', 'edge_density', 'avg_degree', 'degree_assortativity', 'pseudo_diameter', 'avg_clustering_coeff', 'transitivity', 'algebraic_connectivity', 'curvature_gap', 'rel_size_largest_clique'])
for i in range(len(mutag)):
    mutag_df.loc[i] = # insert code here

# ENZYMES
enzymes_df = pd.DataFrame(columns=['graph_index', 'edge_density', 'avg_degree', 'degree_assortativity', 'pseudo_diameter', 'avg_clustering_coeff', 'transitivity', 'algebraic_connectivity', 'curvature_gap', 'rel_size_largest_clique'])
for i in range(len(enzymes)):
    enzymes_df.loc[i] = # insert code here

# PROTEINS
proteins_df = pd.DataFrame(columns=['graph_index', 'edge_density', 'avg_degree', 'degree_assortativity', 'pseudo_diameter', 'avg_clustering_coeff', 'transitivity', 'algebraic_connectivity', 'curvature_gap', 'rel_size_largest_clique'])
for i in range(len(proteins)):
    proteins_df.loc[i] = # insert code here

# IMDB
imdb_df = pd.DataFrame(columns=['graph_index', 'edge_density', 'avg_degree', 'degree_assortativity', 'pseudo_diameter', 'avg_clustering_coeff', 'transitivity', 'algebraic_connectivity', 'curvature_gap', 'rel_size_largest_clique'])
for i in range(len(imdb)):
    imdb_df.loc[i] = # insert code here

In [None]:
# save all dataframes to csv files
mutag_df.to_csv('mutag.csv')
enzymes_df.to_csv('enzymes.csv')
proteins_df.to_csv('proteins.csv')
imdb_df.to_csv('imdb.csv')

In [None]:
# load in the accuracy dictionaryies and add them to the dataframes

**Regression**

In [36]:
# set hyperparameters

alpha_opt = 0.5
lambda_opt = 0.002
group_idx = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3])

In [48]:
# fit the model

model = asgl.ASGL(model='lm', penalization='sgl', lambda1=lambda_opt, alpha=alpha_opt)
asgl_model.fit(x=scaled_x, y=scaled_y[:, 0], group_index=group_idx)

In [50]:
# get the coefficients

asgl_model.coef_[0]

array([ 2.64046511e+02,  0.00000000e+00,  1.24063100e+00,  4.09944460e+00,
       -1.23699372e+01,  1.18208572e+02,  2.70692476e+01,  3.44738482e+01,
       -1.86345734e-01, -4.87576259e+01,  2.13417824e+02,  0.00000000e+00,
       -1.42752151e+02,  9.38657064e+01,  1.02056033e+02, -2.46636576e+02])