In [None]:
import pandas as pd
import re
from glob import glob
import os

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

from sklearn.base import clone
from copy import deepcopy

import numpy as np

In [None]:
bin_size = 10 # size of bins to consider

# expressions to identify column types
# Raw strings are used so that ``\d`` is handed to the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
num_expr = re.compile(r"infections_(\d+)_to_(\d+)")
out_degree_expr = re.compile(r"^out_degree_(\d+)_to_(\d+)")
bdry_expr = re.compile(r"boundary_out_degree_(\d+)_to_(\d+)")
# NOTE(review): ``_*`` means "zero or more underscores", not a glob wildcard,
# so with ``.match`` these accept any name starting with "cnt" / "avg".
# Patterns are left unchanged -- confirm that this is the intended behaviour.
path_ct_expr = re.compile(r"cnt_*")
avg_ct_expr = re.compile(r"avg_*")

In [None]:
# Identifies which columns correspond to which bucket sizes
# We only use one bucket size - experiments found little difference with increased granularity
def is_bucket(str_val, expr, size):
    """
    Tell whether a column name denotes a bucket of the requested width.

    str_val - the column name to test
    expr - a compiled regex whose groups 1 and 2 capture the bucket's
           lower and upper bound as digits
    size - the required difference between upper and lower bound

    returns - False when expr does not match str_val (anchored at the start,
              as with re.match); otherwise whether upper - lower equals size
    """
    found = expr.match(str_val)
    if found is None:
        return False

    lower = int(found.group(1))
    upper = int(found.group(2))
    width = upper - lower

    return width == size

In [None]:
def train_eval_model(data, bin_size, estimator, path_expr, col_set=None, n_iter=10, random_state=None):

    """
    Run n_iter stratified train/test splits (75% / 25%) on the selected
    feature columns, fitting and scoring the estimator on each split.

    data - the aggregate data where each row is a cascade; must contain a
           "label_scenario" column, which is used as the target
    bin_size - specifies the subset of binned infection columns to use (e.g. 5, 10)
    estimator - an instantiated SkLearn classifier (it is re-fit in place on
                every iteration; a deep copy of each fitted state is returned)
    path_expr - a regex indicating whether to use avg or count for labeled paths
    col_set - a tuple indicating which sets of columns to use; recognised names
              are "inf_cols", "labeled_path_1", "labeled_path_2", "out_degree"
              and "path_len". Unrecognised names are ignored. When None/empty,
              every feature column is used.
    n_iter - number of independent train/test splits to run
    random_state - optional int or numpy RandomState used to make the sequence
                   of splits reproducible. It only seeds the splitting; any
                   randomness inside the estimator is not affected. The
                   default (None) keeps the splits unseeded.

    returns - a list of n_iter (accuracy, trained_estimator) tuples
    """

    # Raw string: the backslash-d must reach the regex engine untouched.
    out_degree_expr = re.compile(r"^out_degree_(\d+)_to_(\d+)")

    def _bucket_cols(expr, size):
        # Columns whose name matches expr with a bucket width of exactly size.
        return data.columns[data.columns.map(lambda x: is_bucket(x, expr, size))]

    inf_cols = _bucket_cols(num_expr, bin_size)
    path_cols = data.filter(regex=path_expr).columns

    # get all other columns
    path_len_cols = data.filter(regex="^path_len_*").columns
    out_degree_cols = _bucket_cols(out_degree_expr, 2)  # out-degree buckets are fixed at width 2

    feature_data = data[inf_cols.union(path_cols).union(path_len_cols).union(out_degree_cols)].copy()

    # Row-normalise the infection / out-degree / path-length columns by their
    # joint row sum. Rows whose sum is zero are left untouched.
    norm_cols = inf_cols.union(out_degree_cols).union(path_len_cols)
    norm_sums = data[norm_cols].sum(axis=1).values
    nonzero = norm_sums != 0

    # Cast up front: writing fractional values into integer-typed columns via
    # .loc is deprecated in recent pandas releases.
    feature_data = feature_data.astype({col: float for col in norm_cols})
    # Divide only the rows being updated, which also avoids 0/0 computations.
    feature_data.loc[nonzero, norm_cols] = (
        feature_data.loc[nonzero, norm_cols].divide(norm_sums[nonzero], axis=0)
    )

    if col_set:
        cols = pd.Index([])
        if "inf_cols" in col_set:
            cols = cols.union(inf_cols)
        # The two labeled-path groups are told apart by whether the column
        # name contains a "-".
        if "labeled_path_1" in col_set:
            cols = cols.union(path_cols[~path_cols.str.contains("-")])
        if "labeled_path_2" in col_set:
            cols = cols.union(path_cols[path_cols.str.contains("-")])
        if "out_degree" in col_set:
            cols = cols.union(out_degree_cols)
        if "path_len" in col_set:
            cols = cols.union(path_len_cols)
        feature_data = feature_data[cols]

    # A single generator shared by all iterations gives a different split each
    # time while keeping the whole sequence reproducible for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    labels = data["label_scenario"]
    runs = []

    for _ in range(n_iter):
        X_train, X_test, y_train, y_test = train_test_split(feature_data,
                                                        labels, shuffle=True,
                                                        stratify=labels, test_size=0.25,
                                                        random_state=rng)
        estimator.fit(X_train, y_train)
        acc = estimator.score(X_test, y_test)

        # Snapshot the fitted state; the next iteration re-fits `estimator`.
        est_arch = deepcopy(estimator)

        runs.append((acc, est_arch))

    return runs

In [None]:
# Feature-group selections passed as ``col_set`` to train_eval_model.
col_combos = [
    # Epicurve + structure
    ("inf_cols", "path_len", "out_degree", "labeled_path_1", "labeled_path_2"),
    # Epicurve only
    ("inf_cols",),
]

# whether to use count or average, determined via exploratory testing
# (one labeled-path column pattern per model family)
lr_path_expr = "cnt_en_*"
rf_path_expr = "avg_en_*"
svm_path_expr = "avg_s_en_*"


# Logistic regression: 5-fold grid search over the regularisation strength C.
lr = make_pipeline(LogisticRegression(max_iter=10000))
gr_lr = GridSearchCV(
    lr,
    param_grid={"logisticregression__C": np.arange(0.1, 3.0, 0.05)},
    n_jobs=4,
    cv=5,
)

# Random forest: 5-fold grid search over max_features
# ("sqrt", "log2" and the integers 1..39).
cv_rf = GridSearchCV(
    RandomForestClassifier(),
    param_grid={"max_features": ["sqrt", "log2", *range(1, 40)]},
    n_jobs=4,
    cv=5,
)

# Linear SVM on standardised features: 5-fold grid search over C.
cv_svm = GridSearchCV(
    make_pipeline(StandardScaler(), SVC(kernel="linear", probability=True)),
    param_grid={"svc__C": [1e-2, 1e-1, 1.0, 1e1]},
    n_jobs=4,
    cv=5,
)

In [None]:
# reading in example data
# The path is relative, so it is resolved against the kernel's working directory.
# The resulting frame is what the train_eval_model calls below receive as `data`;
# that function reads its target from the "label_scenario" column.

data_file = "../ml_table/exp5_T70_features.csv"
data = pd.read_csv(data_file)

In [None]:
# Logistic regression: all feature groups vs. the epicurve-only baseline,
# 4 train/test splits each, bin size 10.
full_lr = train_eval_model(
    data,
    bin_size=10,
    estimator=gr_lr,
    path_expr=re.compile(lr_path_expr),
    col_set=col_combos[0],
    n_iter=4,
)
epi_lr = train_eval_model(
    data,
    bin_size=10,
    estimator=gr_lr,
    path_expr=re.compile(lr_path_expr),
    col_set=col_combos[1],
    n_iter=4,
)

In [None]:
# Random forest: all feature groups (col_combos[0]) vs. the epicurve-only
# baseline (col_combos[1]), 4 train/test splits each, bin size 10.
full_rf = train_eval_model(data, 10, cv_rf, re.compile(rf_path_expr), col_set=col_combos[0], n_iter=4)
# The epicurve-only run must use col_combos[1]; with col_combos[0] it would
# simply repeat the full-feature experiment.
epi_rf = train_eval_model(data, 10, cv_rf, re.compile(rf_path_expr), col_set=col_combos[1], n_iter=4)

In [None]:
# Linear SVM: all feature groups vs. the epicurve-only baseline,
# 4 train/test splits each, bin size 10.
full_svm = train_eval_model(
    data,
    bin_size=10,
    estimator=cv_svm,
    path_expr=re.compile(svm_path_expr),
    col_set=col_combos[0],
    n_iter=4,
)
epi_svm = train_eval_model(
    data,
    bin_size=10,
    estimator=cv_svm,
    path_expr=re.compile(svm_path_expr),
    col_set=col_combos[1],
    n_iter=4,
)