# Experiments with k-fold cross validation

I shall run the experiments with K-fold CV.

I can group the dataset by subject and then use sklearn's `KFold` to partition the subjects, but then the folds may vary in size, and there may be an imbalance.

In [1]:
# import statements
import json
import os
import itertools
import numpy as np
import random
from tqdm import tqdm
from sklearn.model_selection import KFold
from models import LR, svc_sigmoid, svc_rbf, svc_linear, NN, get_tf_idf_features, get_BERT_features

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

# List models, and representations

In [2]:
# Model functions to evaluate, keyed by the name used in the result labels.
models = {
    "LR": LR,
    "svc_sigmoid": svc_sigmoid,
    "svc_rbf": svc_rbf,
    "svc_linear": svc_linear,
    "NN": NN,
}

In [3]:
# Feature-extraction functions (representations), keyed by the name used in the result labels.
features = {
    "TF_IDF": get_tf_idf_features,
    "BERT": get_BERT_features,
}

# Load data

In [4]:
# name of the data directory; it is joined with "cleaned_data" when the data is loaded
filename = "data_1"

In [5]:
# Load the cleaned dataset (JSON).
# The encoding is passed explicitly: the posts contain non-ASCII characters and
# the platform default encoding is not UTF-8 everywhere.
with open(os.path.join(filename, "cleaned_data"), "r", encoding="utf-8") as f:
    data = json.load(f)

In [6]:
# display the loaded data
# NOTE(review): this renders the entire dataset in the cell output; consider showing only a small sample
data

{'non_violent': {'1114': [{'0': 'my sole desire in my heart is to join al-shabaab in establishing the khilafa . it is so engraved upon my heart that i began lamenting the time it is going to take to raise the money or means and then travel to somalia . i don ’ t know how to get there , so that is an issue . another one is based on things i look up and post on the internet . i am worried that i will be arrested when i try to leave the country . does anyone know how to help me ?'},
   {'1': 'the protest is going to be all muslim . do not invite the kafir [ nonbeliever ] friends that you should not have .'},
   {'2': 'there are four basic mediums through which one can transmit information effectively : video , audio , still-images , and writing . in addition to this there are two fields through which one can spread this information : the real world and the virtual world . every single method needs to be used and used well . signed xxxxxxxx [ 1114 ] , how to propagate & call to jihad , feb

# Compute all possible combinations

In [7]:
def get_APC(data, num):
    """Build the "all possible combinations" (APC) examples for every subject.

    For each subject (a sequence of post strings), every combination of `num`
    posts is generated with itertools.combinations and the posts of each
    combination are joined with " <SEP> ". All combinations are kept; no
    random sub-sampling is done here.

    Returns a list with one list of joined examples per subject, in the same
    order as `data`.
    """
    def join_combinations(posts):
        # one joined string per combination, in the order itertools yields them
        return [" <SEP> ".join(combo) for combo in itertools.combinations(posts, num)]

    return [join_combinations(posts) for posts in data]

# Compute merged examples

In [8]:
def get_merged(data, num):
    """Build the "merged" examples: disjoint random groups of `num` posts per subject.

    For each subject (a list of post strings) a shuffled copy of the posts is
    cut into consecutive groups of `num` posts, and each group is joined with
    " <SEP> ". Every post is used at most once; posts left over when the
    number of posts is not a multiple of `num` are dropped.

    The input lists are not modified.

    Returns a list with one list of joined examples per subject, in the same
    order as `data`.
    """
    new_data = list()
    for subject in data:
        # shuffle a copy so that the caller's list keeps its order
        posts = list(subject)
        random.shuffle(posts)
        # group the SHUFFLED posts (the groups used to be read from the
        # unshuffled `subject`, which made the shuffle a no-op)
        usable = len(posts) - len(posts) % num  # stop before a trailing partial group
        new_data.append([" <SEP> ".join(posts[start:start + num])
                         for start in range(0, usable, num)])
    return new_data

In [9]:
def create_targets(X, label):
    """Return targets shaped like `X`: for every subject, one `label` per example."""
    targets = []
    for subject in X:
        subject_targets = []
        for _ in subject:
            subject_targets.append(label)
        targets.append(subject_targets)
    return targets

# Dataset setup

There are two types of examples for each subject: Merged, and APC (all possible combinations)
    Makes examples of 1, 2, or 3 posts from that subject

Experiment 1 will use merged for both training and testing
Experiment 2 will use APC for training, and merged for testing
Experiment 3 will use APC for both training and testing.

For each subject: 
-Compute APC, Merged examples
-Get bert, and tfidf embeddings for each example, and save to dictionary

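{"violent": {"merged": {"X": np.array, "Y": np.array}, "APC": {"X": np.array, "Y": np.array}}, "non_violent": {"merged": {"X": np.array, "Y": np.array}, "APC": {"X": np.array, "Y": np.array}}}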


In [10]:
# example counts for this run; stored under the "metadata" key of the saved results
experiment_data = dict()

In [11]:
# Build the inputs: a list of subjects, where each subject is a list of posts.
def _posts_by_subject(subjects):
    """For every subject in the mapping, collect the first value of each of its post dicts."""
    return [[list(post.values())[0] for post in posts] for posts in subjects.values()]


X_violent = _posts_by_subject(data["violent"])
X_non_violent = _posts_by_subject(data["non_violent"])

In [12]:
# Shuffle the order of the subjects of both classes, in place.
for _subjects in (X_violent, X_non_violent):
    random.shuffle(_subjects)

In [13]:
# Report the total number of posts per class.
n_posts_violent = sum(len(subject) for subject in X_violent)
print("violent len: {}".format(n_posts_violent))
n_posts_non_violent = sum(len(subject) for subject in X_non_violent)
print("non_violent len: {}".format(n_posts_non_violent))

violent len: 575
non_violent len: 703


# Experiment

In [None]:
# experiment to run (train / test example types): 1 = merged / merged, 2 = APC / merged, 3 = APC / APC
experiment = 3

In [27]:
# number of posts joined into each example
num_posts_in_ex = 2

In [18]:
# version tag; it becomes part of the name of the results file
version = 4

In [28]:
def _as_object_array(items):
    """Return a 1-D object array whose i-th element is items[i].

    The per-subject lists have different lengths. np.array() on such ragged
    input without dtype=object is an error in recent NumPy versions, and with
    equal lengths it would silently build a 2-D array instead. Filling a
    pre-allocated object array always gives one element per subject.
    """
    arr = np.empty(len(items), dtype=object)
    for i, item in enumerate(items):
        arr[i] = item
    return arr


# Build the merged and APC examples (X) and their targets (Y) for both classes.
# Targets: 1 = violent, 0 = non_violent.
exp_data = dict()
APC_X = get_APC(X_violent, num_posts_in_ex)
merged_X = get_merged(X_violent, num_posts_in_ex)
exp_data["violent"] = {"merged": {"X": _as_object_array(merged_X),
                                  "Y": _as_object_array(create_targets(merged_X, 1))},
                       "APC": {"X": _as_object_array(APC_X),
                               "Y": _as_object_array(create_targets(APC_X, 1))}}

APC_X = get_APC(X_non_violent, num_posts_in_ex)
merged_X = get_merged(X_non_violent, num_posts_in_ex)
exp_data["non_violent"] = {"merged": {"X": _as_object_array(merged_X),
                                      "Y": _as_object_array(create_targets(merged_X, 0))},
                           "APC": {"X": _as_object_array(APC_X),
                                   "Y": _as_object_array(create_targets(APC_X, 0))}}

In [29]:
def _num_examples(subjects):
    """Total number of examples over all subjects."""
    return sum(len(subject) for subject in subjects)


# Record how many merged / APC examples each class has.
experiment_data["merged_violent_len"] = _num_examples(exp_data["violent"]["merged"]["X"])
experiment_data["merged_non_violent_len"] = _num_examples(exp_data["non_violent"]["merged"]["X"])
experiment_data["APC_violent_len"] = _num_examples(exp_data["violent"]["APC"]["X"])
experiment_data["APC_non_violent_len"] = _num_examples(exp_data["non_violent"]["APC"]["X"])

In [30]:
# print number of merged examples in each dataset
print("merged violent len: {}".format(experiment_data["merged_violent_len"]))
print("merged non_violent len: {}".format(experiment_data["merged_non_violent_len"]))
# print number of APC examples in each dataset
print("APC violent len: {}".format(experiment_data["APC_violent_len"]))
print("APC non_violent len: {}".format(experiment_data["APC_non_violent_len"]))

merged violent len: 281
merged non_violent len: 346
APC violent len: 12197
APC non_violent len: 76568


# Define Functions

In [22]:
def get_datasets(train_index_v, test_index_v, train_index_nv, test_index_nv):
    """Build flat train/test example and target lists for one fold.

    Reads the module-level `experiment` to decide which example type is used
    for training and for testing (1: merged / merged, 2: APC / merged,
    3: APC / APC) and takes the examples and targets from the module-level
    `exp_data`.

    Args:
        train_index_v, test_index_v: indices of the violent subjects used for
            training / testing.
        train_index_nv, test_index_nv: indices of the non-violent subjects
            used for training / testing.

    Returns:
        X_train, X_test, Y_train, Y_test as flat lists. Within each list the
        items of the violent subjects come first, followed by those of the
        non-violent subjects.

    Raises:
        ValueError: if `experiment` is not 1, 2 or 3.

    Also prints the number of training and test examples.
    """
    example_kinds = {1: ("merged", "merged"), 2: ("APC", "merged"), 3: ("APC", "APC")}
    if experiment not in example_kinds:
        # previously this fell through and failed later with an UnboundLocalError
        raise ValueError("unknown experiment {!r}; expected 1, 2 or 3".format(experiment))
    train, test = example_kinds[experiment]

    def gather(kind, field, index_v, index_nv):
        # select the fold's subjects of both classes, violent first ...
        per_subject = np.concatenate((exp_data["violent"][kind][field][index_v],
                                      exp_data["non_violent"][kind][field][index_nv]), axis=0)
        # ... then flatten the per-subject lists into one flat list
        return [item for sublist in per_subject for item in sublist]

    X_train = gather(train, "X", train_index_v, train_index_nv)
    X_test = gather(test, "X", test_index_v, test_index_nv)
    Y_train = gather(train, "Y", train_index_v, train_index_nv)
    Y_test = gather(test, "Y", test_index_v, test_index_nv)

    print ("{}, {}".format(len(X_train), len(X_test)))
    return X_train, X_test, Y_train, Y_test
    

In [23]:
def evaluate(y_test, y_pred_log, results, name):
    """Score one set of predictions and record the scores in `results`.

    Computes accuracy, precision, recall and F1 of `y_pred_log` against
    `y_test` and appends each score to the matching list in `results[name]`.
    The entry for `name` is created on first use. Nothing is returned;
    `results` is updated in place.
    """
    metric_functions = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("F1", f1_score),
    )
    if name not in results:
        results[name] = {key: list() for key, _ in metric_functions}
    for key, metric in metric_functions:
        results[name][key].append(metric(y_test, list(y_pred_log)))

In [24]:
def shuffle_train_dataset(a, b):
    """Shuffle two parallel lists in place, applying the same permutation to both.

    Element i of `a` stays paired with element i of `b`. If the lists differ
    in length, zip() pairs only up to the shorter one, so both lists end up
    with the shorter length. Empty input is left empty instead of raising.

    Returns None; `a` and `b` are modified in place.
    """
    combined = list(zip(a, b))
    if not combined:
        # zip(*[]) yields nothing to unpack, which used to raise ValueError
        a[:], b[:] = [], []
        return
    random.shuffle(combined)
    a[:], b[:] = zip(*combined)

# Run Models

For each fold, run every model on every representation and save the metrics to a dictionary.

In [None]:
# path of the JSON results file (despite its name, output_dir is a file path, not a directory)
output_dir = os.path.join("results", "exp{}_len{}_ver{}.json".format(experiment, num_posts_in_ex, version))

In [None]:
# Init k-folds
kf = KFold(n_splits=5)

# make sure the folder of the results file exists before the first save
results_folder = os.path.dirname(output_dir)
if results_folder:
    os.makedirs(results_folder, exist_ok=True)

# initialize results, info on train/test sets
all_results = {"metadata": experiment_data}
dataset_info = {"train_len": [], "val_len" : [], "v_train_pct":[], "v_val_pct":[]}
final_results = {}
for (train_index_v, test_index_v), (train_index_nv, test_index_nv) in zip(kf.split(X_violent), kf.split(X_non_violent)):
    # print the subject indices used for training/testing
    print("TRAIN v:", train_index_v, "TEST v:", test_index_v, "TRAIN nv:", train_index_nv, "TEST nv:", test_index_nv)

    # create training/testing datasets
    X_train, X_test, Y_train, Y_test = get_datasets(train_index_v, test_index_v, train_index_nv, test_index_nv)

    # shuffle train dataset
    shuffle_train_dataset(X_train, Y_train)

    # save length of dataset sizes
    dataset_info["train_len"].append(len(X_train))
    dataset_info["val_len"].append(len(X_test))
    # save percentage of violent examples in each
    dataset_info["v_train_pct"].append(sum(Y_train)/len(X_train))
    dataset_info["v_val_pct"].append(sum(Y_test)/len(X_test))

    # for each input representation
    for rep, function in features.items():
        print(rep)
        # get features
        X_train_features, X_test_features = function(X_train, X_test)

        # for each model
        for name, model in models.items():
            label = "{}_{}".format(rep, name)
            print(label)
            # get prediction
            pred_test = model(X_train_features, Y_train, X_test_features)
            # evaluate, record results
            evaluate(Y_test, pred_test, all_results, label)
            # attach the per-fold dataset info to the results
            all_results["data"] = dataset_info

            # Average every metric over the folds seen so far.
            # The "data" entry is skipped. The old check (`key is not "data"`)
            # compared the metric name by identity, so it never skipped anything.
            # Loop names differ from `model` so the model callable is not shadowed.
            final_results = {
                result_label: {metric: np.mean(scores) for metric, scores in metrics.items()}
                for result_label, metrics in all_results.items()
                if result_label != "data"
            }

            # save results after every model, so partial results survive an interruption
            with open(output_dir, "w") as f:
                json.dump({"all": all_results, "final": final_results}, f, indent=4)
    
    

In [None]:
# display the averages computed at the last save
final_results

In [None]:
# NOTE(review): looks like a debugging leftover; shows the training targets of the last fold
Y_train

In [None]:
# NOTE(review): looks like a debugging leftover; renders every post of the violent class in the output
X_violent