In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 

## Synthetic Dataset
First process the synthetic dataset results

In [2]:
RESULTS_DIR = 'results_sym'

synth_scat = pd.read_csv(os.path.join(RESULTS_DIR, 'synthetic_results_pca_1.0.csv'))
print(synth_scat.columns)

synth_gnn = pd.read_csv(os.path.join(RESULTS_DIR, 'synthetic_GNN.csv'))
print(synth_gnn.columns)

Index(['scattering_type', 'sub_dataset', 'model', 'score', 'stdev', 'ncomp',
       'pca_var'],
      dtype='object')
Index(['sub_dataset', 'model', 'acc', 'stdev', 'hidden_dim', 'learning_rate'], dtype='object')


In [3]:
synth_scat_gaussian = synth_scat[synth_scat['sub_dataset'].str.startswith("gaussian_pm")]
synth_scat_camel = synth_scat[synth_scat['sub_dataset'].str.startswith("camel_pm")]
print('W2 wavelet')
for name, df in zip(['gaussian', 'camel'],[synth_scat_gaussian, synth_scat_camel]):
    grouped = df.groupby(['scattering_type', 'model'])['score']
    mean_scores = grouped.mean().reset_index()
    std_deviation = grouped.std().reset_index()

    # Convert to percentages (without the percent sign) and format
    formatted_results = mean_scores.copy()
    formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                                " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
    print(name)
    print(formatted_results)

W2 wavelet
gaussian
  scattering_type model           score
0            blis   MLP  $99.5 \pm 0.3$
1         modulus   MLP  $81.3 \pm 7.3$
camel
  scattering_type model           score
0            blis   MLP  $98.6 \pm 0.4$
1         modulus   MLP  $96.4 \pm 1.3$


In [4]:
synth_scat_W1 = pd.read_csv(os.path.join(RESULTS_DIR, 'synthetic_results_pca_1.0_W1.csv'))
synth_scat_gaussian = synth_scat_W1[synth_scat_W1['sub_dataset'].str.startswith("gaussian_pm")]
synth_scat_camel = synth_scat_W1[synth_scat_W1['sub_dataset'].str.startswith("camel_pm")]
print('W1 Wavelet')
for name, df in zip(['gaussian', 'camel'],[synth_scat_gaussian, synth_scat_camel]):
    grouped = df.groupby(['scattering_type', 'model'])['score']
    mean_scores = grouped.mean().reset_index()
    std_deviation = grouped.std().reset_index()

    # Convert to percentages (without the percent sign) and format
    formatted_results = mean_scores.copy()
    formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                                " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
    print(name)
    print(formatted_results)

W1 Wavelet
gaussian
  scattering_type model            score
0            blis   MLP  $100.0 \pm 0.0$
1         modulus   MLP   $92.8 \pm 3.4$
camel
  scattering_type model           score
0            blis   MLP  $97.7 \pm 0.5$
1         modulus   MLP  $97.1 \pm 1.3$


In [5]:
synth_gnn_gaussian = synth_gnn[synth_gnn['sub_dataset'].str.startswith("gaussian_pm")]
synth_gnn_camel = synth_gnn[synth_gnn['sub_dataset'].str.startswith("camel_pm")]

for name, df in zip(['gaussian', 'camel'], [synth_gnn_gaussian, synth_gnn_camel]):
    grouped = df.groupby(['model'])['acc']
    mean_scores = grouped.mean().reset_index()
    std_deviation = grouped.std().reset_index()

    # Convert to percentages (without the percent sign) and format
    formatted_results = mean_scores.copy()
    formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                                " \pm " + (std_deviation['acc'] ).map("{:.1f}".format) + "$"
    print(name)
    print(formatted_results)

gaussian
  model             acc
0   GAT  $99.2 \pm 0.5$
1   GCN  $99.0 \pm 0.4$
2   GIN  $99.5 \pm 0.2$
camel
  model             acc
0   GAT  $91.6 \pm 2.0$
1   GCN  $91.7 \pm 2.0$
2   GIN  $91.3 \pm 1.4$


In [6]:
GPS_pd_synthetic = pd.read_csv(os.path.join(RESULTS_DIR, 'GPS_synthetic_GNN.csv'))
synth_gnn_gaussian = GPS_pd_synthetic[GPS_pd_synthetic['sub_dataset'].str.startswith("gaussian_pm")]
synth_gnn_camel = GPS_pd_synthetic[GPS_pd_synthetic['sub_dataset'].str.startswith("camel_pm")]

for name, df in zip(['gaussian', 'camel'], [synth_gnn_gaussian, synth_gnn_camel]):
    grouped = df.groupby(['model'])['acc']
    mean_scores = grouped.mean().reset_index()
    std_deviation = grouped.std().reset_index()

    # Convert to percentages (without the percent sign) and format
    formatted_results = mean_scores.copy()
    formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                                " \pm " + (std_deviation['acc'] ).map("{:.1f}".format) + "$"
    print(name)
    print(formatted_results)

gaussian
  model             acc
0   GPS  $95.4 \pm 5.9$
camel
  model             acc
0   GPS  $97.7 \pm 0.9$


In [36]:
# process synthetic with skips
print("scattering and blis with layers 1,2,3")
synthetic_W1_skips = pd.read_csv( os.path.join(RESULTS_DIR, 'synthetic_results_pca_1.0_W1_skips.csv' ) )
synthetic_W2_skips = pd.read_csv( os.path.join(RESULTS_DIR, 'synthetic_results_pca_1.0_W2_skips.csv' ) )

synth_skips = pd.concat([synthetic_W1_skips, synthetic_W2_skips], axis = 0).reset_index(drop=True)

synth_skips_gaussian = synth_skips[synth_skips['sub_dataset'].str.startswith("gaussian_pm")]
synth_skips_camel = synth_skips[synth_skips['sub_dataset'].str.startswith("camel_pm")]
for name, df in zip(['gaussian', 'camel'],[synth_skips_gaussian, synth_skips_camel]):
    grouped = df.groupby(['scattering_type', 'wavelet_type'])['score']
    mean_scores = grouped.mean().reset_index()
    std_deviation = grouped.std().reset_index()

    # Convert to percentages (without the percent sign) and format
    formatted_results = mean_scores.copy()
    formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                                " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
    print(name)
    print(formatted_results)


scattering and blis with layers 1,2,3
gaussian
  scattering_type wavelet_type            score
0            blis           W1  $100.0 \pm 0.0$
1            blis           W2   $99.3 \pm 0.2$
2         modulus           W1   $99.9 \pm 0.2$
3         modulus           W2   $89.7 \pm 3.6$
camel
  scattering_type wavelet_type           score
0            blis           W1  $97.6 \pm 0.7$
1            blis           W2  $98.4 \pm 0.8$
2         modulus           W1  $96.7 \pm 1.1$
3         modulus           W2  $96.9 \pm 1.0$


In [37]:
# process synthetic with skips
print("scattering and blis with layers 1,2")
synthetic_W1_skips = pd.read_csv( os.path.join(RESULTS_DIR, 'synthetic_results_pca_1.0_W1_skips2.csv' ) )
synthetic_W2_skips = pd.read_csv( os.path.join(RESULTS_DIR, 'synthetic_results_pca_1.0_W2_skips2.csv' ) )

synth_skips = pd.concat([synthetic_W1_skips, synthetic_W2_skips], axis = 0).reset_index(drop=True)

synth_skips_gaussian = synth_skips[synth_skips['sub_dataset'].str.startswith("gaussian_pm")]
synth_skips_camel = synth_skips[synth_skips['sub_dataset'].str.startswith("camel_pm")]
for name, df in zip(['gaussian', 'camel'],[synth_skips_gaussian, synth_skips_camel]):
    grouped = df.groupby(['scattering_type', 'wavelet_type'])['score']
    mean_scores = grouped.mean().reset_index()
    std_deviation = grouped.std().reset_index()

    # Convert to percentages (without the percent sign) and format
    formatted_results = mean_scores.copy()
    formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                                " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
    print(name)
    print(formatted_results)

scattering and blis with layers 1,2
gaussian
  scattering_type wavelet_type            score
0            blis           W1  $100.0 \pm 0.0$
1            blis           W2   $99.7 \pm 0.2$
2         modulus           W1   $97.7 \pm 1.0$
3         modulus           W2   $88.3 \pm 4.3$
camel
  scattering_type wavelet_type           score
0            blis           W1  $96.7 \pm 0.8$
1            blis           W2  $97.4 \pm 0.9$
2         modulus           W1  $96.5 \pm 1.2$
3         modulus           W2  $96.8 \pm 1.0$


## Partly cloudy dataset
Next process the partly cloudy dataset

In [7]:
pc_scat = pd.read_csv(os.path.join(RESULTS_DIR, 'partly_cloudy_results_scattering_1.0.csv'))
print(pc_scat.columns)

pc_gnn = pd.read_csv(os.path.join(RESULTS_DIR, 'partly_cloudy_GNN.csv'))
partly_cloudy_gnn = pd.read_csv(os.path.join(RESULTS_DIR,'partly_cloudy_GPS.csv'))
pc_gnn = pd.concat([pc_gnn, partly_cloudy_gnn], ignore_index=True)
print(pc_gnn.columns)

Index(['scattering_type', 'sub_dataset', 'model', 'score', 'stdev', 'ncomp',
       'pca_var'],
      dtype='object')
Index(['sub_dataset', 'model', 'acc', 'stdev', 'hidden_dim', 'learning_rate'], dtype='object')


In [8]:
print('partly cloudy scattering W2')
grouped = pc_scat.groupby(['scattering_type', 'model'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

partly cloudy scattering W2
  scattering_type model           score
0            blis   MLP  $41.5 \pm 5.5$
1         modulus   MLP  $39.0 \pm 5.8$


In [9]:
grouped = pc_gnn.groupby(['model'])['acc']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['acc'] ).map("{:.1f}".format) + "$"
print('partly cloudy GNNs')
print(formatted_results)

partly cloudy GNNs
  model             acc
0   GAT  $37.3 \pm 4.8$
1   GCN  $37.5 \pm 4.9$
2   GIN  $37.1 \pm 4.5$
3   GPS  $42.0 \pm 4.3$


In [10]:

partly_cloudy_gnn.columns

Index(['sub_dataset', 'model', 'acc', 'stdev', 'hidden_dim', 'learning_rate'], dtype='object')

Now I'll still process the partly cloudy data, but with the new sets of labels


In [10]:
labels = ["GUS", "MENTAL", "PAIN", "PECK"]
wavelet_types = ['W1', "W2"]
scattering_base_name = "partly_cloudy_results_scattering_1.0_"
gnn_base_name = "partly_cloudy_"

gnn_dfs = []
# load the gnn dfs first
for label in labels:
    gnn_df = pd.read_csv(os.path.join(RESULTS_DIR, gnn_base_name + label + '.csv'))
    # create a new column in the dataframe called label and set all of its values to 
    gnn_df['label'] = label
    gnn_dfs.append(gnn_df)
gnn_df_full = pd.concat(gnn_dfs, axis = 0).reset_index(drop=True)
gnn_df_full
#pc_scat_new_labels = pd.read_csv(os.path.join(RESULTS_DIR, 'partly_cloudy_results_scattering_1.0.csv'))

grouped = gnn_df_full.groupby(['label','model'])['acc']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['acc'] ).map("{:.1f}".format) + "$"
print('partly cloudy GNNs')
print(formatted_results)

partly cloudy GNNs
     label model             acc
0      GUS   GAT  $62.0 \pm 3.0$
1      GUS   GCN  $61.9 \pm 2.9$
2      GUS   GIN  $59.4 \pm 3.7$
3      GUS   GPS  $57.7 \pm 4.4$
4   MENTAL   GAT  $85.5 \pm 0.0$
5   MENTAL   GCN  $85.5 \pm 0.0$
6   MENTAL   GIN  $85.4 \pm 0.4$
7   MENTAL   GPS  $80.1 \pm 3.0$
8     PAIN   GAT  $84.3 \pm 0.0$
9     PAIN   GCN  $84.3 \pm 0.0$
10    PAIN   GIN  $84.2 \pm 0.3$
11    PAIN   GPS  $78.4 \pm 3.3$
12    PECK   GAT  $57.8 \pm 3.8$
13    PECK   GCN  $57.6 \pm 3.8$
14    PECK   GIN  $56.8 \pm 4.5$
15    PECK   GPS  $58.5 \pm 4.7$


In [11]:
df = pd.read_csv(os.path.join(RESULTS_DIR, scattering_base_name+ 'W1_GUS.csv'))
df.columns

Index(['scattering_type', 'sub_dataset', 'model', 'score', 'stdev', 'ncomp',
       'task', 'pca_var'],
      dtype='object')

In [12]:
# now do the same but for the wavelet classifiers 
pc_scat_dfs = [] 
for label in labels:
    for wavelet_type in wavelet_types:
        df = pd.read_csv(os.path.join(RESULTS_DIR, scattering_base_name + wavelet_type + '_'+label+'.csv'))
        df['wavelet'] = wavelet_type 
        df['label'] = label 
        pc_scat_dfs.append(df)

pc_scat_dfs_full = pd.concat(pc_scat_dfs, axis = 0).reset_index(drop=True) 
grouped = pc_scat_dfs_full.groupby(["label",'scattering_type', "wavelet",'model'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

     label scattering_type wavelet model           score
0      GUS            blis      W1   MLP  $56.6 \pm 5.4$
1      GUS            blis      W2   MLP  $57.2 \pm 4.8$
2      GUS         modulus      W1   MLP  $54.6 \pm 4.7$
3      GUS         modulus      W2   MLP  $55.5 \pm 5.1$
4   MENTAL            blis      W1   MLP  $79.3 \pm 3.0$
5   MENTAL            blis      W2   MLP  $79.7 \pm 3.0$
6   MENTAL         modulus      W1   MLP  $79.7 \pm 2.8$
7   MENTAL         modulus      W2   MLP  $80.1 \pm 2.6$
8     PAIN            blis      W1   MLP  $77.4 \pm 3.2$
9     PAIN            blis      W2   MLP  $77.9 \pm 3.1$
10    PAIN         modulus      W1   MLP  $78.3 \pm 2.6$
11    PAIN         modulus      W2   MLP  $78.7 \pm 2.9$
12    PECK            blis      W1   MLP  $56.9 \pm 5.2$
13    PECK            blis      W2   MLP  $56.8 \pm 5.5$
14    PECK         modulus      W1   MLP  $54.2 \pm 5.1$
15    PECK         modulus      W2   MLP  $54.6 \pm 5.9$


Process the partly cloudy data with smoothing

In [45]:
# read in GNN data 
print("partly cloudy EMOTION 3 with gaussian smoothing")
models = ['GAT', 'GCN', 'GIN', 'GPS'] # ,'GPS' 
cloudy_gnns = []
for model in models:
    file_name = f'partly_cloudy_EMOTION3_{model}_smoothed.csv'
    df = pd.read_csv(os.path.join(RESULTS_DIR, file_name))
    cloudy_gnns.append(df) 

pc_gnn_smooth = pd.concat(cloudy_gnns, axis = 0).reset_index(drop = True)
grouped = pc_gnn_smooth.groupby(['model'])['acc']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['acc'] ).map("{:.1f}".format) + "$"
print('partly cloudy GNNs')
print(formatted_results)


partly cloudy EMOTION 3 with gaussian smoothing
partly cloudy GNNs
  model             acc
0   GAT  $39.3 \pm 6.0$
1   GCN  $39.3 \pm 5.9$
2   GIN  $42.1 \pm 6.0$
3   GPS  $56.4 \pm 4.3$


In [44]:
# read in scattering data 
print("partly cloudy EMOTION 3 scattering classification w gaussian smoothing")
scattering_types = ['blis', 'modulus'] 
wavelets = ['W1', 'W2']

pc_smooth_scattering = []
for scattering_type in scattering_types:
    for wavelet in wavelets: 
        if scattering_type == 'blis':
            file_name = f'partly_cloudy_smoothed_{scattering_type}_{wavelet}_EMOTION3.csv'
        else:
            file_name = f'partly_cloudy_smoothed_{scattering_type}_{wavelet}_EMOTION3_skip2.csv'
        df = pd.read_csv(os.path.join(RESULTS_DIR, file_name))
        df['wavelet'] = wavelet
        pc_smooth_scattering.append(df)


pc_scat_dfs_full = pd.concat(pc_smooth_scattering, axis = 0).reset_index(drop=True) 
grouped = pc_scat_dfs_full.groupby(['scattering_type', "wavelet",'model'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

partly cloudy EMOTION 3 scattering classification w gaussian smoothing
  scattering_type wavelet model           score
0            blis      W1   MLP  $67.1 \pm 4.3$
1            blis      W2   MLP  $68.3 \pm 3.6$
2         modulus      W1   MLP  $60.6 \pm 4.9$
3         modulus      W2   MLP  $62.3 \pm 5.1$


Process the partly cloudy scattering results

In [13]:
pc_scat_w1 = pd.read_csv(os.path.join(RESULTS_DIR, 'partly_cloudy_results_scattering_1.0_W1.csv'))

print('partly cloudy scattering W1')
grouped = pc_scat_w1.groupby(['scattering_type', 'model'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

partly cloudy scattering W1
  scattering_type model           score
0            blis   MLP  $41.1 \pm 5.0$
1         modulus   MLP  $37.3 \pm 5.7$


In [33]:
# process the partly cloudy results with skip connections 
print("partly cloudy results on EMOTION3, layers 0,1,2,3")
scattering_types = ['blis', 'modulus']
wavelet_types = ['W1', 'W2']
dfs = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        file_name = f'partly_cloudy_{scattering_type}_{wavelet_type}_EMOTION3_skip3.csv'
        df = pd.read_csv(os.path.join(RESULTS_DIR, file_name))
        df['wavelet_type'] = wavelet_type 
        dfs.append(df)

pc_skip3 = pd.concat(dfs, axis = 0).reset_index(drop = True)
grouped = pc_skip3.groupby(['scattering_type', 'wavelet_type'])['score'] 
mean_scores = grouped.mean().reset_index() 
std_deviation = grouped.std().reset_index() 
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

    

partly cloudy results on EMOTION3, layers 0,1,2,3
  scattering_type wavelet_type           score
0            blis           W1  $41.0 \pm 5.4$
1            blis           W2  $41.1 \pm 5.7$
2         modulus           W1  $38.9 \pm 5.0$
3         modulus           W2  $39.5 \pm 5.6$


In [38]:
# process the partly cloudy results with skip connections 
print("partly cloudy results on EMOTION3, layers 0,1,2")
scattering_types = ['blis', 'modulus']
wavelet_types = ['W1', 'W2']
dfs = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        file_name = f'partly_cloudy_{scattering_type}_{wavelet_type}_EMOTION3_skip.csv'
        df = pd.read_csv(os.path.join(RESULTS_DIR, file_name))
        df['wavelet_type'] = wavelet_type 
        dfs.append(df)

pc_skip3 = pd.concat(dfs, axis = 0).reset_index(drop = True)
grouped = pc_skip3.groupby(['scattering_type', 'wavelet_type'])['score'] 
mean_scores = grouped.mean().reset_index() 
std_deviation = grouped.std().reset_index() 
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

    

partly cloudy results on EMOTION3, layers 0,1,2
  scattering_type wavelet_type           score
0            blis           W1  $40.1 \pm 5.5$
1            blis           W2  $40.3 \pm 5.6$
2         modulus           W1  $40.3 \pm 5.3$
3         modulus           W2  $40.7 \pm 5.8$


## Traffic dataset
Now process the traffic dataset

In [14]:
traffic_gnn_03 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GNN_PEMS03.csv'))
traffic_gnn_03_GPS = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GPS_PEMS03.csv'))
traffic_gnn_03 = pd.concat([traffic_gnn_03, traffic_gnn_03_GPS], ignore_index=True)

traffic_gnn_04 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GNN_PEMS04.csv'))
traffic_gnn_04_GPS = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GPS_PEMS04.csv'))
traffic_gnn_04 = pd.concat([traffic_gnn_04, traffic_gnn_04_GPS], ignore_index=True)

traffic_gnn_07 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GNN_PEMS07.csv'))
traffic_gnn_07_GPS = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GPS_PEMS07.csv'))
traffic_gnn_07 = pd.concat([traffic_gnn_07, traffic_gnn_07_GPS], ignore_index=True)

traffic_gnn_08 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GNN_PEMS08.csv'))
traffic_gnn_08_GPS = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_GPS_PEMS08.csv'))
traffic_gnn_08 = pd.concat([traffic_gnn_08, traffic_gnn_08_GPS], ignore_index=True)

traffic_gnn = pd.concat([traffic_gnn_03, traffic_gnn_04, traffic_gnn_07, traffic_gnn_08], ignore_index=True)

print(traffic_gnn.columns)


Index(['sub_dataset', 'model', 'acc', 'stdev', 'hidden_dim', 'learning_rate',
       'task'],
      dtype='object')


In [15]:
grouped = traffic_gnn[traffic_gnn['model']=='GPS'].groupby(['sub_dataset', 'task','model'
                               ])['acc']
mean_scores = grouped.mean().reset_index()
grouped_stdev = traffic_gnn.groupby(['task','sub_dataset', 'model'
                               ])['stdev']
std_deviation = grouped_stdev.mean().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['stdev'] ).map("{:.1f}".format) + "$"
print('traffic GNNs')
print(formatted_results)

traffic GNNs
   sub_dataset  task model              acc
0       PEMS03   DAY   GPS   $49.6 \pm 0.3$
1       PEMS03  HOUR   GPS   $57.4 \pm 0.1$
2       PEMS03  WEEK   GPS   $31.9 \pm 0.4$
3       PEMS04   DAY   GPS   $67.0 \pm 5.3$
4       PEMS04  HOUR   GPS   $66.5 \pm 0.3$
5       PEMS04  WEEK   GPS   $31.7 \pm 0.2$
6       PEMS07   DAY   GPS   $27.7 \pm 1.9$
7       PEMS07  HOUR   GPS   $39.9 \pm 2.7$
8       PEMS07  WEEK   GPS   $30.4 \pm 0.6$
9       PEMS08   DAY   GPS   $67.9 \pm 0.6$
10      PEMS08  HOUR   GPS   $67.7 \pm 0.8$
11      PEMS08  WEEK   GPS  $62.3 \pm 16.7$


In [16]:
# process the traffic scattering results
traffic_day_w2 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_results_DAY_pca_1.0_layers_3_moments_1_W2.csv'))
traffic_hour_w2 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_results_HOUR_pca_1.0_layers_3_moments_1_W2.csv'))
traffic_week_w2 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_results_WEEK_pca_1.0_layers_3_moments_1_W2.csv'))

traffic_day_w1 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_results_DAY_pca_1.0_layers_3_moments_1_W1.csv')) 
traffic_hour_w1 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_results_HOUR_pca_1.0_layers_3_moments_1_W1.csv'))
traffic_week_w1 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_results_WEEK_pca_1.0_layers_3_moments_1_W1.csv'))

In [17]:
# Step 1: Process the new data frames
dataframes_and_tasks = [(traffic_day_w2, 'DAY', 'W2'), 
                        (traffic_hour_w2, 'HOUR', 'W2'), 
                        (traffic_week_w2, 'WEEK', 'W2'),
                        (traffic_day_w1, 'DAY', 'W1'),
                        (traffic_hour_w1, 'HOUR', 'W1'), 
                        (traffic_week_w1, 'WEEK', 'W1')]

for df, task,wavelet_type in dataframes_and_tasks:
    # Add 'task' column
    df['task'] = task
    # Modify 'model' column
    df['model'] = wavelet_type + df['scattering_type'] + df['model']
    # Rename 'score' to 'acc'
    df.rename(columns={'score': 'acc'}, inplace=True)
    df['acc'] = df['acc'] * 100 # rescale to be in percent
    df['stdev'] = df['stdev'] * 100 # rescale to be in percent

# Step 2: Merge the processed data frames with traffic_gnn
# First, retain only the columns you specified for each data frame
cols_to_retain = ['sub_dataset', 'model', 'acc', 'stdev', 'task']
traffic_gnn = traffic_gnn[cols_to_retain]
traffic_day_w2 = traffic_day_w2[cols_to_retain]
traffic_hour_w2 = traffic_hour_w2[cols_to_retain]
traffic_week_w2 = traffic_week_w2[cols_to_retain]
traffic_day_w1 = traffic_day_w1[cols_to_retain]
traffic_hour_w1 = traffic_hour_w1[cols_to_retain]
traffic_week_w1 = traffic_week_w1[cols_to_retain]

# Now, concatenate all data frames to get traffic_full
traffic_full = pd.concat([traffic_gnn, traffic_day_w2, traffic_hour_w2, traffic_week_w2, traffic_day_w1, traffic_hour_w1, traffic_week_w1], ignore_index=True)


In [18]:
# Group by desired columns and compute both the mean and standard deviation
grouped_mean = traffic_full.groupby(['sub_dataset', 'task', 'model'])['acc'].mean().reset_index()
grouped_std = traffic_full.groupby(['sub_dataset', 'task', 'model'])['stdev'].mean().reset_index()

# Format the results
formatted_results = grouped_mean.copy()
formatted_results['acc'] = "$" + (grouped_mean['acc']).map("{:.1f}".format) + \
                            " \pm " + (grouped_std['stdev']).map("{:.1f}".format) + "$"

print('traffic GNNs')
print(formatted_results)


traffic GNNs
   sub_dataset  task         model              acc
0       PEMS03   DAY           GAT   $14.1 \pm 0.3$
1       PEMS03   DAY           GCN   $14.1 \pm 0.1$
2       PEMS03   DAY           GIN   $14.3 \pm 0.4$
3       PEMS03   DAY           GPS   $49.6 \pm 5.3$
4       PEMS03   DAY     W1blisMLP   $53.1 \pm 1.3$
..         ...   ...           ...              ...
91      PEMS08  WEEK           GPS  $62.3 \pm 26.0$
92      PEMS08  WEEK     W1blisMLP   $94.6 \pm 1.3$
93      PEMS08  WEEK  W1modulusMLP   $94.3 \pm 0.5$
94      PEMS08  WEEK     W2blisMLP   $95.4 \pm 1.4$
95      PEMS08  WEEK  W2modulusMLP   $95.4 \pm 1.9$

[96 rows x 4 columns]


In [19]:
print(formatted_results[formatted_results['model'].str.startswith('W1')])


   sub_dataset  task         model             acc
4       PEMS03   DAY     W1blisMLP  $53.1 \pm 1.3$
5       PEMS03   DAY  W1modulusMLP  $51.7 \pm 1.4$
12      PEMS03  HOUR     W1blisMLP  $63.1 \pm 2.2$
13      PEMS03  HOUR  W1modulusMLP  $63.7 \pm 1.1$
20      PEMS03  WEEK     W1blisMLP  $54.8 \pm 1.8$
21      PEMS03  WEEK  W1modulusMLP  $58.0 \pm 1.3$
28      PEMS04   DAY     W1blisMLP  $88.4 \pm 1.6$
29      PEMS04   DAY  W1modulusMLP  $87.6 \pm 0.7$
36      PEMS04  HOUR     W1blisMLP  $83.3 \pm 0.8$
37      PEMS04  HOUR  W1modulusMLP  $83.0 \pm 1.1$
44      PEMS04  WEEK     W1blisMLP  $91.3 \pm 1.4$
45      PEMS04  WEEK  W1modulusMLP  $91.1 \pm 0.9$
52      PEMS07   DAY     W1blisMLP  $72.9 \pm 1.5$
53      PEMS07   DAY  W1modulusMLP  $62.0 \pm 1.5$
60      PEMS07  HOUR     W1blisMLP  $63.5 \pm 1.1$
61      PEMS07  HOUR  W1modulusMLP  $58.6 \pm 0.6$
68      PEMS07  WEEK     W1blisMLP  $76.8 \pm 2.0$
69      PEMS07  WEEK  W1modulusMLP  $66.3 \pm 1.6$
76      PEMS08   DAY     W1blis

In [27]:
# process the traffic data with skip connections 
traffic_day_w2 = pd.read_csv(os.path.join(RESULTS_DIR, 'traffic_results_DAY_pca_1.0_layers_3_moments_1_W2.csv'))

datasets = ['PEMS03', 'PEMS04', 'PEMS07', 'PEMS08']
scattering_types = ['blis', 'modulus']
wavelets = ['W1', 'W2']
df_traffic = []

# HOUR, DAY, WEEK

for dataset in datasets:
    for scattering_type in scattering_types:
        for wavelet in wavelets:
            filename = f'traffic_{dataset}_skip_{scattering_type}_{wavelet}.csv'
            df = pd.read_csv(os.path.join(RESULTS_DIR, filename))
            df['wavelet'] = wavelet 
            df['label'] = ['HOUR', 'DAY', 'WEEK']
            df['score'] = df['score'] * 100 
            df['stdev'] = df['stdev'] * 100
            df_traffic.append(df)

df_traffic = pd.concat(df_traffic, axis = 0).reset_index(drop=True)

grouped_score = df_traffic.groupby(['sub_dataset', 'label', 'scattering_type','wavelet','layer_list'])['score']
grouped_stdev = df_traffic.groupby(['sub_dataset', 'label', 'scattering_type', 'wavelet','layer_list'])['stdev']

mean_scores = grouped_score.mean().reset_index()
mean_stdevs = grouped_stdev.mean().reset_index()

formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score']).map("{:.1f}".format) + \
                             " \pm " + (mean_stdevs['stdev']).map("{:.1f}".format) + "$"

print(formatted_results)



   sub_dataset label scattering_type wavelet layer_list           score
0       PEMS03   DAY            blis      W1      0,1,2  $46.5 \pm 1.0$
1       PEMS03   DAY            blis      W2      0,1,2  $47.9 \pm 1.3$
2       PEMS03   DAY         modulus      W1      0,1,2  $45.6 \pm 0.7$
3       PEMS03   DAY         modulus      W2      0,1,2  $49.5 \pm 0.9$
4       PEMS03  HOUR            blis      W1      0,1,2  $59.5 \pm 0.8$
5       PEMS03  HOUR            blis      W2      0,1,2  $60.8 \pm 2.2$
6       PEMS03  HOUR         modulus      W1      0,1,2  $58.2 \pm 0.8$
7       PEMS03  HOUR         modulus      W2      0,1,2  $60.4 \pm 0.5$
8       PEMS03  WEEK            blis      W1      0,1,2  $46.6 \pm 0.8$
9       PEMS03  WEEK            blis      W2      0,1,2  $51.5 \pm 1.2$
10      PEMS03  WEEK         modulus      W1      0,1,2  $46.4 \pm 1.3$
11      PEMS03  WEEK         modulus      W2      0,1,2  $51.4 \pm 1.0$
12      PEMS04   DAY            blis      W1      0,1,2  $85.1 \

In [30]:
df_blis = df_traffic[df_traffic['scattering_type'] == 'blis'].copy() 
df_modulus = df_traffic[df_traffic['scattering_type'] == 'modulus'].copy()

# Merge on shared columns and calculate difference
merged = pd.merge(df_blis, df_modulus, on=['sub_dataset', 'label', 'wavelet', 'layer_list'], suffixes=('_blis', '_modulus'))
merged['difference'] = merged['score_modulus'] - merged['score_blis']

print(merged[['sub_dataset', 'label', 'wavelet', 'layer_list', 'difference']])



   sub_dataset label wavelet layer_list  difference
0       PEMS03  HOUR      W1      0,1,2   -1.307223
1       PEMS03   DAY      W1      0,1,2   -0.895219
2       PEMS03  WEEK      W1      0,1,2   -0.157681
3       PEMS03  HOUR      W2      0,1,2   -0.386572
4       PEMS03   DAY      W2      0,1,2    1.668362
5       PEMS03  WEEK      W2      0,1,2   -0.050865
6       PEMS04  HOUR      W1      0,1,2   -1.169086
7       PEMS04   DAY      W1      0,1,2   -2.220479
8       PEMS04  WEEK      W1      0,1,2   -3.248333
9       PEMS04  HOUR      W2      0,1,2   -0.972931
10      PEMS04   DAY      W2      0,1,2   -1.890938
11      PEMS04  WEEK      W2      0,1,2   -1.435857
12      PEMS07  HOUR      W1      0,1,2   -5.059046
13      PEMS07   DAY      W1      0,1,2   -7.633444
14      PEMS07  WEEK      W1      0,1,2   -7.184695
15      PEMS07  HOUR      W2      0,1,2   -4.034010
16      PEMS07   DAY      W2      0,1,2   -4.369391
17      PEMS07  WEEK      W2      0,1,2   -5.578649
18      PEMS

In [34]:
# now do the same thing as above but for the layers 0,1,2,3 on traffic 
print('results for traffic, skip 3')

datasets = ['PEMS03', 'PEMS04', 'PEMS07', 'PEMS08']
scattering_types = ['blis', 'modulus']
wavelets = ['W1', 'W2']
df_traffic = []

# HOUR, DAY, WEEK

for dataset in datasets:
    for scattering_type in scattering_types:
        for wavelet in wavelets:
            filename = f'traffic_{dataset}_skip3_{scattering_type}_{wavelet}.csv'
            df = pd.read_csv(os.path.join(RESULTS_DIR, filename))
            df['wavelet'] = wavelet 
            df['label'] = ['HOUR', 'DAY', 'WEEK']
            df['score'] = df['score'] * 100 
            df['stdev'] = df['stdev'] * 100
            df_traffic.append(df)

df_traffic = pd.concat(df_traffic, axis = 0).reset_index(drop=True)

grouped_score = df_traffic.groupby(['sub_dataset', 'label', 'scattering_type','wavelet','layer_list'])['score']
grouped_stdev = df_traffic.groupby(['sub_dataset', 'label', 'scattering_type', 'wavelet','layer_list'])['stdev']

mean_scores = grouped_score.mean().reset_index()
mean_stdevs = grouped_stdev.mean().reset_index()

formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score']).map("{:.1f}".format) + \
                             " \pm " + (mean_stdevs['stdev']).map("{:.1f}".format) + "$"

print(formatted_results)


results for traffic, skip 3
   sub_dataset label scattering_type wavelet layer_list           score
0       PEMS03   DAY            blis      W1    0,1,2,3  $52.2 \pm 2.0$
1       PEMS03   DAY            blis      W2    0,1,2,3  $55.9 \pm 2.6$
2       PEMS03   DAY         modulus      W1    0,1,2,3  $55.7 \pm 0.6$
3       PEMS03   DAY         modulus      W2    0,1,2,3  $62.1 \pm 0.9$
4       PEMS03  HOUR            blis      W1    0,1,2,3  $64.3 \pm 1.3$
5       PEMS03  HOUR            blis      W2    0,1,2,3  $66.7 \pm 2.7$
6       PEMS03  HOUR         modulus      W1    0,1,2,3  $65.0 \pm 2.2$
7       PEMS03  HOUR         modulus      W2    0,1,2,3  $69.4 \pm 1.2$
8       PEMS03  WEEK            blis      W1    0,1,2,3  $54.6 \pm 0.9$
9       PEMS03  WEEK            blis      W2    0,1,2,3  $61.9 \pm 3.1$
10      PEMS03  WEEK         modulus      W1    0,1,2,3  $60.3 \pm 0.5$
11      PEMS03  WEEK         modulus      W2    0,1,2,3  $65.5 \pm 2.5$
12      PEMS04   DAY            blis

In [35]:
print('modulus - blis for skip 3')
df_blis = df_traffic[df_traffic['scattering_type'] == 'blis'].copy() 
df_modulus = df_traffic[df_traffic['scattering_type'] == 'modulus'].copy()

# Merge on shared columns and calculate difference
merged = pd.merge(df_blis, df_modulus, on=['sub_dataset', 'label', 'wavelet', 'layer_list'], suffixes=('_blis', '_modulus'))
merged['difference'] = merged['score_modulus'] - merged['score_blis']

print(merged[['sub_dataset', 'label', 'wavelet', 'layer_list', 'difference']])


modulus - blis for skip 3
   sub_dataset label wavelet layer_list  difference
0       PEMS03  HOUR      W1    0,1,2,3    0.686673
1       PEMS03   DAY      W1    0,1,2,3    3.540183
2       PEMS03  WEEK      W1    0,1,2,3    5.752798
3       PEMS03  HOUR      W2    0,1,2,3    2.690743
4       PEMS03   DAY      W2    0,1,2,3    6.215666
5       PEMS03  WEEK      W2    0,1,2,3    3.631740
6       PEMS04  HOUR      W1    0,1,2,3    0.447234
7       PEMS04   DAY      W1    0,1,2,3    1.318164
8       PEMS04  WEEK      W1    0,1,2,3    1.812475
9       PEMS04  HOUR      W2    0,1,2,3    2.385249
10      PEMS04   DAY      W2    0,1,2,3    2.487250
11      PEMS04  WEEK      W2    0,1,2,3    0.298156
12      PEMS07  HOUR      W1    0,1,2,3   -1.676901
13      PEMS07   DAY      W1    0,1,2,3   -6.013226
14      PEMS07  WEEK      W1    0,1,2,3   -5.687293
15      PEMS07  HOUR      W2    0,1,2,3    1.048654
16      PEMS07   DAY      W2    0,1,2,3    3.585262
17      PEMS07  WEEK      W2    0,1,2,

# Shallow Classifiers Full

In [2]:
RESULTS_DIR = 'results_shallow'

# read in and process the partly_cloudy datasets 
scattering_types = ['blis', 'modulus']
wavelet_types = ['W1', 'W2']

partly_cloudy_shallow = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        df = pd.read_csv(os.path.join(RESULTS_DIR, f'partly_cloudy_smoothed_{scattering_type}_{wavelet_type}_EMOTION3_shallow.csv'))
        df["wavelet_type"] = wavelet_type
        partly_cloudy_shallow.append(df) 

partly_cloudy_shallow = pd.concat(partly_cloudy_shallow, axis = 0).reset_index(drop=True)
partly_cloudy_shallow

Unnamed: 0,scattering_type,sub_dataset,model,score,stdev,ncomp,task,pca_var,moment_list,layer_list,wavelet_type
0,blis,0,XGB,0.576923,0.114095,-1.0,EMOTION3,1.0,1,3,W1
1,blis,0,RF,0.623077,0.167473,-1.0,EMOTION3,1.0,1,3,W1
2,blis,0,SVC,0.600000,0.127794,-1.0,EMOTION3,1.0,1,3,W1
3,blis,0,LR,0.715385,0.082849,-1.0,EMOTION3,1.0,1,3,W1
4,blis,1,XGB,0.646154,0.122595,-1.0,EMOTION3,1.0,1,3,W1
...,...,...,...,...,...,...,...,...,...,...,...
2475,modulus,153,LR,0.507692,0.056527,-1.0,EMOTION3,1.0,1,012,W2
2476,modulus,154,XGB,0.638462,0.086346,-1.0,EMOTION3,1.0,1,012,W2
2477,modulus,154,RF,0.615385,0.130995,-1.0,EMOTION3,1.0,1,012,W2
2478,modulus,154,SVC,0.523077,0.018842,-1.0,EMOTION3,1.0,1,012,W2


In [3]:
print('partly cloudy scattering shallow')
grouped = partly_cloudy_shallow.groupby(['scattering_type', 'model', 'wavelet_type'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

partly cloudy scattering shallow
   scattering_type model wavelet_type           score
0             blis    LR           W1  $62.4 \pm 5.4$
1             blis    LR           W2  $65.9 \pm 5.2$
2             blis    RF           W1  $61.5 \pm 5.2$
3             blis    RF           W2  $63.0 \pm 4.5$
4             blis   SVC           W1  $56.2 \pm 5.3$
5             blis   SVC           W2  $59.0 \pm 5.0$
6             blis   XGB           W1  $61.1 \pm 5.7$
7             blis   XGB           W2  $62.8 \pm 5.1$
8          modulus    LR           W1  $51.2 \pm 5.8$
9          modulus    LR           W2  $53.1 \pm 5.9$
10         modulus    RF           W1  $56.1 \pm 6.0$
11         modulus    RF           W2  $58.8 \pm 5.5$
12         modulus   SVC           W1  $51.5 \pm 5.9$
13         modulus   SVC           W2  $54.2 \pm 6.1$
14         modulus   XGB           W1  $56.2 \pm 6.0$
15         modulus   XGB           W2  $58.3 \pm 5.8$


In [4]:
print('synthetic scattering shallow')
# read in and process the partly_cloudy datasets 
scattering_types = ['blis', 'modulus']
wavelet_types = ['W1', 'W2']

synthetic_shallow = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        df = pd.read_csv(os.path.join(RESULTS_DIR, f'synthetic_results_{scattering_type}_{wavelet_type}_shallow.csv'))
        df["wavelet_type"] = wavelet_type
        # convert sub_dataset to a task label
        df['task'] = np.where(df['sub_dataset'].str.startswith('camel_pm'), 'same mu',
                            np.where(df['sub_dataset'].str.startswith('gaussian_pm'), 'different mu', np.nan))
        synthetic_shallow.append(df) 

synthetic_shallow = pd.concat(synthetic_shallow, axis = 0).reset_index(drop=True)
synthetic_shallow


synthetic scattering shallow


Unnamed: 0,scattering_type,sub_dataset,model,score,stdev,ncomp,pca_var,layer_list,wavelet_type,task
0,blis,camel_pm_0,XGB,0.986667,0.006667,-1.0,1.0,3,W1,same mu
1,blis,camel_pm_0,RF,0.986667,0.012472,-1.0,1.0,3,W1,same mu
2,blis,camel_pm_0,SVC,0.983333,0.018257,-1.0,1.0,3,W1,same mu
3,blis,camel_pm_0,LR,0.966667,0.014907,-1.0,1.0,3,W1,same mu
4,blis,camel_pm_1,XGB,0.983333,0.018257,-1.0,1.0,3,W1,same mu
...,...,...,...,...,...,...,...,...,...,...
155,modulus,gaussian_pm_3,LR,0.846667,0.043970,-1.0,1.0,12,W2,different mu
156,modulus,gaussian_pm_4,XGB,0.786667,0.035590,-1.0,1.0,12,W2,different mu
157,modulus,gaussian_pm_4,RF,0.703333,0.038586,-1.0,1.0,12,W2,different mu
158,modulus,gaussian_pm_4,SVC,0.853333,0.053125,-1.0,1.0,12,W2,different mu


In [5]:
print('synthetic scattering shallow')
grouped = synthetic_shallow.groupby(['task','scattering_type', 'model', 'wavelet_type'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)

synthetic scattering shallow
            task scattering_type model wavelet_type            score
0   different mu            blis    LR           W1  $100.0 \pm 0.0$
1   different mu            blis    LR           W2  $100.0 \pm 0.0$
2   different mu            blis    RF           W1   $99.2 \pm 0.4$
3   different mu            blis    RF           W2   $99.4 \pm 0.1$
4   different mu            blis   SVC           W1   $99.4 \pm 0.1$
5   different mu            blis   SVC           W2  $100.0 \pm 0.0$
6   different mu            blis   XGB           W1   $99.5 \pm 0.3$
7   different mu            blis   XGB           W2   $99.3 \pm 0.0$
8   different mu         modulus    LR           W1   $97.7 \pm 0.7$
9   different mu         modulus    LR           W2   $86.9 \pm 4.9$
10  different mu         modulus    RF           W1   $95.9 \pm 1.6$
11  different mu         modulus    RF           W2   $73.4 \pm 8.0$
12  different mu         modulus   SVC           W1   $98.1 \pm 0.9$
13  d

In [6]:
# do the same for the traffic dataset
DATASET = 'PEMS03'

traffic_shallow = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        df = pd.read_csv(os.path.join(RESULTS_DIR, f'traffic_{DATASET}_{scattering_type}_{wavelet_type}_shallow.csv'))
        df["wavelet_type"] = wavelet_type 
        traffic_shallow.append(df)

traffic_shallow = pd.concat(traffic_shallow, axis = 0).reset_index(drop=True)

print(f'traffic scattering {DATASET} shallow')
grouped = traffic_shallow.groupby(['task','scattering_type', 'model', 'wavelet_type'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)


traffic scattering PEMS03 shallow
    task scattering_type model wavelet_type           score
0    DAY            blis    LR           W1  $37.0 \pm nan$
1    DAY            blis    LR           W2  $42.2 \pm nan$
2    DAY            blis    RF           W1  $52.3 \pm nan$
3    DAY            blis    RF           W2  $53.4 \pm nan$
4    DAY            blis   SVC           W1  $35.1 \pm nan$
5    DAY            blis   SVC           W2  $35.9 \pm nan$
6    DAY            blis   XGB           W1  $54.0 \pm nan$
7    DAY            blis   XGB           W2  $56.3 \pm nan$
8    DAY         modulus    LR           W1  $33.1 \pm nan$
9    DAY         modulus    LR           W2  $33.2 \pm nan$
10   DAY         modulus    RF           W1  $44.8 \pm nan$
11   DAY         modulus    RF           W2  $46.6 \pm nan$
12   DAY         modulus   SVC           W1  $28.3 \pm nan$
13   DAY         modulus   SVC           W2  $30.5 \pm nan$
14   DAY         modulus   XGB           W1  $42.8 \pm nan$
15   D

In [7]:
# do the same for the traffic dataset
DATASET = 'PEMS07'

traffic_shallow = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        df = pd.read_csv(os.path.join(RESULTS_DIR, f'traffic_{DATASET}_{scattering_type}_{wavelet_type}_shallow.csv'))
        df["wavelet_type"] = wavelet_type 
        traffic_shallow.append(df)

traffic_shallow = pd.concat(traffic_shallow, axis = 0).reset_index(drop=True)

print(f'traffic scattering {DATASET} shallow')
grouped = traffic_shallow.groupby(['task','scattering_type', 'model', 'wavelet_type'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)


traffic scattering PEMS07 shallow
    task scattering_type model wavelet_type           score
0    DAY            blis    LR           W1  $46.2 \pm nan$
1    DAY            blis    LR           W2  $41.0 \pm nan$
2    DAY            blis    RF           W1  $67.7 \pm nan$
3    DAY            blis    RF           W2  $62.7 \pm nan$
4    DAY            blis   SVC           W1  $53.7 \pm nan$
5    DAY            blis   SVC           W2  $41.8 \pm nan$
6    DAY            blis   XGB           W1  $74.7 \pm nan$
7    DAY            blis   XGB           W2  $66.5 \pm nan$
8    DAY         modulus    LR           W1  $33.3 \pm nan$
9    DAY         modulus    LR           W2  $29.9 \pm nan$
10   DAY         modulus    RF           W1  $51.9 \pm nan$
11   DAY         modulus    RF           W2  $53.4 \pm nan$
12   DAY         modulus   SVC           W1  $35.5 \pm nan$
13   DAY         modulus   SVC           W2  $34.7 \pm nan$
14   DAY         modulus   XGB           W1  $50.2 \pm nan$
15   D

In [10]:
# do the same for the traffic dataset
DATASET = 'PEMS04'

# note that the MLP results needed to be re-run, so I will include them here too

traffic_shallow = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        df = pd.read_csv(os.path.join(RESULTS_DIR, f'traffic_{DATASET}_{scattering_type}_{wavelet_type}_shallow.csv'))
        df["wavelet_type"] = wavelet_type 
        traffic_shallow.append(df)

        df_MLP = pd.read_csv(os.path.join(RESULTS_DIR, f'traffic_{DATASET}_{scattering_type}_{wavelet_type}_MLP.csv'))
        df_MLP["wavelet_type"] = wavelet_type 
        traffic_shallow.append(df_MLP)



traffic_shallow = pd.concat(traffic_shallow, axis = 0).reset_index(drop=True)

print(f'traffic scattering {DATASET} shallow')
grouped = traffic_shallow.groupby(['task','scattering_type', 'model', 'wavelet_type'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)


traffic scattering PEMS04 shallow
    task scattering_type model wavelet_type           score
0    DAY            blis    LR           W1  $71.4 \pm nan$
1    DAY            blis    LR           W2  $69.4 \pm nan$
2    DAY            blis   MLP           W1  $87.8 \pm nan$
3    DAY            blis   MLP           W2  $91.9 \pm nan$
4    DAY            blis    RF           W1  $89.8 \pm nan$
5    DAY            blis    RF           W2  $88.5 \pm nan$
6    DAY            blis   SVC           W1  $75.5 \pm nan$
7    DAY            blis   SVC           W2  $73.5 \pm nan$
8    DAY            blis   XGB           W1  $93.9 \pm nan$
9    DAY            blis   XGB           W2  $92.8 \pm nan$
10   DAY         modulus    LR           W1  $47.7 \pm nan$
11   DAY         modulus    LR           W2  $49.8 \pm nan$
12   DAY         modulus   MLP           W1  $83.2 \pm nan$
13   DAY         modulus   MLP           W2  $85.9 \pm nan$
14   DAY         modulus    RF           W1  $79.0 \pm nan$
15   D

In [11]:
# do the same for the traffic dataset
DATASET = 'PEMS08'

traffic_shallow = []
for scattering_type in scattering_types:
    for wavelet_type in wavelet_types:
        df = pd.read_csv(os.path.join(RESULTS_DIR, f'traffic_{DATASET}_{scattering_type}_{wavelet_type}_shallow.csv'))
        df["wavelet_type"] = wavelet_type 
        traffic_shallow.append(df)

        df_MLP = pd.read_csv(os.path.join(RESULTS_DIR, f'traffic_{DATASET}_{scattering_type}_{wavelet_type}_MLP.csv'))
        df_MLP["wavelet_type"] = wavelet_type 
        traffic_shallow.append(df_MLP)

traffic_shallow = pd.concat(traffic_shallow, axis = 0).reset_index(drop=True)

print(f'traffic scattering {DATASET} shallow')
grouped = traffic_shallow.groupby(['task','scattering_type', 'model', 'wavelet_type'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)


traffic scattering PEMS08 shallow
    task scattering_type model wavelet_type           score
0    DAY            blis    LR           W1  $76.5 \pm nan$
1    DAY            blis    LR           W2  $80.2 \pm nan$
2    DAY            blis   MLP           W1  $92.9 \pm nan$
3    DAY            blis   MLP           W2  $94.9 \pm nan$
4    DAY            blis    RF           W1  $92.7 \pm nan$
5    DAY            blis    RF           W2  $93.5 \pm nan$
6    DAY            blis   SVC           W1  $85.2 \pm nan$
7    DAY            blis   SVC           W2  $87.4 \pm nan$
8    DAY            blis   XGB           W1  $95.1 \pm nan$
9    DAY            blis   XGB           W2  $96.0 \pm nan$
10   DAY         modulus    LR           W1  $56.1 \pm nan$
11   DAY         modulus    LR           W2  $60.1 \pm nan$
12   DAY         modulus   MLP           W1  $89.9 \pm nan$
13   DAY         modulus   MLP           W2  $92.0 \pm nan$
14   DAY         modulus    RF           W1  $86.0 \pm nan$
15   D

# Rebuttal experiments

In [6]:
import glob 

# first process all of the GAT data

results_dir = 'run_results'
gat_files = glob.glob(os.path.join(results_dir, '*GAT*.csv'))  # Find all files containing 'GAT'

dataframes = []
for file in gat_files:
    df = pd.read_csv(file)  # Assuming the files are in CSV format
    dataframes.append(df)

df = pd.concat(dataframes, ignore_index=True)
print("Combined DataFrame:")
print(df.head())



Combined DataFrame:
  model  hidden_dim  epochs  learning_rate task_type        dataset  \
0   GAT          16     100          0.001  EMOTION3  partly_cloudy   
1   GAT          16     100          0.001  EMOTION3  partly_cloudy   
2   GAT          16     100          0.001  EMOTION3  partly_cloudy   
3   GAT          16     100          0.001  EMOTION3  partly_cloudy   
4   GAT          16     100          0.001  EMOTION3  partly_cloudy   

  sub_dataset      score     stdev  
0           0  28.627451  4.401950  
1           1  38.823529  5.736760  
2           2  48.235294  5.628510  
3           3  40.392157  6.746922  
4           4  29.019608  2.286648  


In [26]:
GAT_partly_cloudy_mean = df[df['dataset'] == 'partly_cloudy' ]['score'].mean()
GAT_partly_cloudy_stdev = df[df['dataset'] == 'partly_cloudy' ]['score'].std()
GAT_camel_mean = df[df['sub_dataset'].str.startswith('camel', na = False)]['score'].mean()
GAT_camel_stdev = df[df['sub_dataset'].str.startswith('camel', na = False)]['score'].std()
GAT_gaussian_mean = df[df['sub_dataset'].str.startswith('gaussian', na = False)]['score'].mean()
GAT_gaussian_stdev = df[df['sub_dataset'].str.startswith('gaussian', na = False)]['score'].std()
print(f'GAT Partly Cloudy Result: ${GAT_partly_cloudy_mean:.1f} \pm {GAT_partly_cloudy_stdev:.1f} $')
print(f'GAT Synthetic camel Result: ${GAT_camel_mean:.1f} \pm {GAT_camel_stdev:.1f} $')
print(f'GAT synthetic gaussian result: ${GAT_gaussian_mean:.1f} \pm {GAT_gaussian_stdev:.1f} $')


GAT Partly Cloudy Result: $40.6 \pm 6.1 $
GAT Synthetic camel Result: $96.4 \pm 0.6 $
GAT synthetic gaussian result: $98.6 \pm 0.8 $


In [28]:
# GAT traffic result
df[df['sub_dataset'] == 'PEMS07']

Unnamed: 0,model,hidden_dim,epochs,learning_rate,task_type,dataset,sub_dataset,score,stdev
171,GAT,16,100,0.001,DAY,traffic,PEMS07,22.187057,1.159908
172,GAT,16,100,0.001,HOUR,traffic,PEMS07,33.160132,1.252239
173,GAT,16,100,0.001,WEEK,traffic,PEMS07,36.506849,0.857303


In [34]:
# now look at how well the new models perform on the datasets:

new_model_files = glob.glob(os.path.join(results_dir, '*multi-model*'))

df_list = []
for f in new_model_files:
    df = pd.read_csv(f)
    df_list.append(df)

df = pd.concat(df_list, ignore_index = True)
df.head()

grouped = df.groupby(['model', 'dataset', 'sub_dataset', 'task_type'])['score']
mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score']).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score']).map("{:.1f}".format) + "$"
print(formatted_results)

      model    dataset    sub_dataset  task_type           score
0   ChebNet  synthetic     camel_pm_0  PLUSMINUS  $98.3 \pm nan$
1   ChebNet  synthetic     camel_pm_1  PLUSMINUS  $97.2 \pm nan$
2   ChebNet  synthetic     camel_pm_2  PLUSMINUS  $96.8 \pm nan$
3   ChebNet  synthetic     camel_pm_3  PLUSMINUS  $97.5 \pm nan$
4   ChebNet  synthetic     camel_pm_4  PLUSMINUS  $96.7 \pm nan$
5   ChebNet  synthetic  gaussian_pm_0  PLUSMINUS  $99.5 \pm nan$
6   ChebNet  synthetic  gaussian_pm_1  PLUSMINUS  $98.8 \pm nan$
7   ChebNet  synthetic  gaussian_pm_2  PLUSMINUS  $99.2 \pm nan$
8   ChebNet  synthetic  gaussian_pm_3  PLUSMINUS  $98.8 \pm nan$
9   ChebNet  synthetic  gaussian_pm_4  PLUSMINUS  $99.0 \pm nan$
10  ChebNet    traffic         PEMS03        DAY  $24.6 \pm nan$
11  ChebNet    traffic         PEMS03       HOUR  $50.9 \pm nan$
12  ChebNet    traffic         PEMS03       WEEK  $56.1 \pm nan$
13  ChebNet    traffic         PEMS04        DAY  $50.5 \pm nan$
14  ChebNet    traffic   

In [42]:
# check the shallow classifier results

# partly cloudy and synthetic first
MLP_results_files = glob.glob(os.path.join(results_dir, '*W12*'))

df_list = []
for f in MLP_results_files:
    df = pd.read_csv(f)
    df_list.append(df)

df = pd.concat(df_list, ignore_index=True)
print(df.head())


df_pc = df[df['dataset'] == 'partly_cloudy']
df_pc_grouped = df_pc.groupby(['model','scattering_type', 'largest_scale', 'layer_list', 'moment_list', 'wavelet_type'])['score']
mean_scores = df_pc_grouped.mean().reset_index()
std_deviation = df_pc_grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                            " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"
print(formatted_results)


  scattering_type sub_dataset model     score     stdev  ncomp      task  \
0            blis           0   MLP  0.715385  0.115641   -1.0  EMOTION3   
1            blis           1   MLP  0.661538  0.117670   -1.0  EMOTION3   
2            blis           2   MLP  0.723077  0.074580   -1.0  EMOTION3   
3            blis           3   MLP  0.684615  0.044853   -1.0  EMOTION3   
4            blis           4   MLP  0.723077  0.085658   -1.0  EMOTION3   

   pca_var  moment_list layer_list wavelet_type        dataset  largest_scale  
0        1            1      0,1,2           W1  partly_cloudy              4  
1        1            1      0,1,2           W1  partly_cloudy              4  
2        1            1      0,1,2           W1  partly_cloudy              4  
3        1            1      0,1,2           W1  partly_cloudy              4  
4        1            1      0,1,2           W1  partly_cloudy              4  
  model scattering_type  largest_scale layer_list  moment_list 

# Old stuff below

In [25]:
grouped = gaussian_dataset.groupby(['scattering_type', 'model'])['score']

mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                             " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"

print(formatted_results)


  scattering_type model            score
0            blis   KNN   $97.5 \pm 1.0$
1            blis   MLP   $99.5 \pm 0.4$
2            blis    RF   $98.1 \pm 0.7$
3            blis   SVC  $100.0 \pm 0.0$
4            blis   XGB   $97.7 \pm 0.7$
5         modulus   KNN   $45.7 \pm 5.4$
6         modulus   MLP   $55.2 \pm 6.7$
7         modulus    RF   $40.2 \pm 8.3$
8         modulus   SVC   $56.2 \pm 3.4$
9         modulus   XGB   $45.9 \pm 7.3$


In [28]:
# process the camel dataset

camel_dataset = df_s1[df_s1['sub_dataset'].str.startswith("camel_pm")]
grouped = camel_dataset.groupby(['scattering_type', 'model'])['score']

mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['score'] = "$" + (formatted_results['score'] * 100).map("{:.1f}".format) + \
                             " \pm " + (std_deviation['score'] * 100).map("{:.1f}".format) + "$"

print(formatted_results)

  scattering_type model           score
0            blis   KNN  $93.4 \pm 1.6$
1            blis   MLP  $98.9 \pm 0.5$
2            blis    RF  $97.5 \pm 0.6$
3            blis   SVC  $97.7 \pm 0.1$
4            blis   XGB  $97.3 \pm 0.1$
5         modulus   KNN  $93.4 \pm 1.6$
6         modulus   MLP  $97.4 \pm 0.1$
7         modulus    RF  $96.3 \pm 0.8$
8         modulus   SVC  $97.4 \pm 0.1$
9         modulus   XGB  $95.4 \pm 0.9$


In [29]:
df_sgnn = pd.read_csv('synthetic_GNN.csv')
df_sgnn.columns

Index(['sub_dataset', 'model', 'acc', 'stdev', 'hidden_dim', 'learning_rate'], dtype='object')

In [33]:
gaussian_dataset = df_sgnn[df_sgnn['sub_dataset'].str.startswith("gaussian_pm")]
grouped = gaussian_dataset.groupby(['model'])['acc']

mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                             " \pm " + (std_deviation['acc'] ).map("{:.1f}".format) + "$"

print(formatted_results)

  model              acc
0   GAT  $100.0 \pm 0.0$
1   GCN  $100.0 \pm 0.0$
2   GIN  $100.0 \pm 0.1$


In [34]:
camel_dataset = df_sgnn[df_sgnn['sub_dataset'].str.startswith("camel_pm")]
grouped = camel_dataset.groupby(['model'])['acc']

mean_scores = grouped.mean().reset_index()
std_deviation = grouped.std().reset_index()

# Convert to percentages (without the percent sign) and format
formatted_results = mean_scores.copy()
formatted_results['acc'] = "$" + (formatted_results['acc'] ).map("{:.1f}".format) + \
                             " \pm " + (std_deviation['acc'] ).map("{:.1f}".format) + "$"

print(formatted_results)

  model             acc
0   GAT  $94.6 \pm 0.4$
1   GCN  $94.4 \pm 0.3$
2   GIN  $88.9 \pm 1.5$


In [None]:
model = 'MLP'
# Filter the DataFrame for only rows where the model is 'MLP'
model_df = df[df['model'] == model]

# Compute the average score for 'blis' scattering type
blis_avg_score = model_df[model_df['scattering_type'] == 'blis']['score'].mean()
print(f"Average score for 'blis' scattering type with {model} model: {blis_avg_score:.4f}")

blis_avg_stdev = model_df[model_df['scattering_type'] == 'blis']['stdev'].mean()
print(f"Average standard deviation for 'blis' scattering type with {model} model: {blis_avg_stdev:.4f}")

# Compute the average score for 'modulus' scattering type
modulus_avg_score = model_df[model_df['scattering_type'] == 'modulus']['score'].mean()
print(f"Average score for 'modulus' scattering type with {model} model: {modulus_avg_score:.4f}")

modulus_avg_stdev = model_df[model_df['scattering_type'] == 'modulus']['stdev'].mean()
print(f"Average standard deviation for 'modulus' scattering type with {model} model: {modulus_avg_stdev:.4f}")

print(f'Difference is {blis_avg_score - modulus_avg_score}')
print(f'Fractional improvement is {(blis_avg_score - modulus_avg_score)/modulus_avg_score}')

direct_improvements = []
for sub_dataset in range(155):
    blis_score = model_df[(model_df['scattering_type'] == 'blis') & (model_df['sub_dataset'] == sub_dataset)]['score']
    modulus_score = model_df[(model_df['scattering_type'] == 'modulus') & (model_df['sub_dataset'] == sub_dataset)]['score']
    diff = blis_score.item() - modulus_score.item()
    direct_improvements.append(diff)

direct_improvements = np.array(direct_improvements)