In [1]:
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
# No mode argument is given here. NOTE(review): per the IPython docs a bare
# %autoreload performs a single reload now rather than enabling automatic
# reloading for later cells — confirm that is what is wanted.
%autoreload 
# NOTE(review): reloading the extension right after loading it looks redundant — confirm.
%reload_ext autoreload

In [32]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from features import topological_features, aggregate_features, get_vars, extract_features
import pickle
import rolx
import numpy as np
import utils
import random

def _print_split_scores(pred, true):
    """Print accuracy, then precision, recall and F1 at class index 1, for one split."""
    # Coerce to arrays so `==` is element-wise even when plain lists are passed;
    # comparing two lists would yield a single bool and a meaningless mean.
    pred = np.asarray(pred)
    true = np.asarray(true)
    print(np.mean(pred == true))

    # Each returned array holds one value per class; index 1 selects the
    # second class, exactly as the original code did.
    precision, recall, f1, _ = precision_recall_fscore_support(true, pred)
    print(precision[1])
    print(recall[1])
    print(f1[1])


def get_scores(train_pred, train_true, val_pred, val_true, test_pred, test_true):
    """Print evaluation metrics for the train, validation and test splits.

    For each split, in that order, four lines are printed: accuracy, then the
    precision, recall and F1 score found at index 1 of the per-class arrays
    returned by ``precision_recall_fscore_support``.

    Args:
        train_pred, val_pred, test_pred: predicted labels for each split.
        train_true, val_true, test_true: ground-truth labels for each split.

    Returns:
        None. Results are only printed.
    """
    splits = (
        (train_pred, train_true),
        (val_pred, val_true),
        (test_pred, test_true),
    )
    for pred, true in splits:
        _print_split_scores(pred, true)


In [3]:
def get_rolx(fname, fname_extended, roles=3):
    """Load a graph and collect its adjacency, RolX role matrix and node metadata.

    Args:
        fname: first input path, passed to both ``rolx.load_graph_igraph`` and
            ``get_vars``.
        fname_extended: second input path, passed to the same two loaders.
        roles: number of roles requested from ``rolx.extract_rolx_roles``.

    Returns:
        Tuple ``(adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields)``.
        ``adj_mat`` is ``G.get_adjacency()``, ``H`` is the first value returned
        by ``rolx.extract_rolx_roles``, and the remaining four values come from
        ``get_vars``.
    """
    # Only the graph object is needed from this loader; the id mapping that is
    # returned to the caller is the one produced by get_vars below.
    G, _, _ = rolx.load_graph_igraph(fname, fname_extended)
    H, R = rolx.extract_rolx_roles(G, roles)
    print(H.shape, R.shape)

    adj_mat = G.get_adjacency()
    _, video_dict_list, graph_to_dict, neighbors, fields = get_vars(fname, fname_extended)

    return adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields

def get_features(adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields, agg_flag=False):
    """Build one feature vector per ordered node pair, split by edge presence.

    For every ``(row, col)`` cell of ``adj_mat`` the feature vector is the
    element-wise sum of the two nodes' rows of ``H``. When ``agg_flag`` is
    true, the output of ``extract_features`` for the pair is appended, and
    pairs for which it returns a falsy value are skipped.

    Args:
        adj_mat: 2-D adjacency structure exposing ``.shape`` and
            ``adj_mat[row][col]`` indexing.
        H: per-node feature rows (2-D ndarray or ``np.matrix``).
        video_dict_list, graph_to_dict, neighbors, fields: forwarded unchanged
            to ``extract_features``; only used when ``agg_flag`` is true.
        agg_flag: append the extra pair features when true.

    Returns:
        ``(pos_data, neg_data)``: two lists of ``(features, adjacency_value)``
        tuples. Pairs whose adjacency value is > 0 go to ``pos_data``, all
        others to ``neg_data``.
    """
    n_rows, n_cols = adj_mat.shape[0], adj_mat.shape[1]
    if n_rows == 0:
        return [], []

    # Flatten every node's row once instead of once per pair. Both endpoints
    # are read as H[i] (the original used H[col][0] for the column node, which
    # picks a single scalar when H is a plain 2-D ndarray).
    col_vecs = [np.asarray(H[col]).flatten() for col in range(n_cols)]

    pos_data = []
    neg_data = []
    for row in range(n_rows):
        row_vec = np.asarray(H[row]).flatten()
        adj_row = adj_mat[row]
        for col in range(n_cols):
            features = col_vecs[col] + row_vec

            if agg_flag:
                local_features = extract_features(
                    video_dict_list, graph_to_dict, neighbors, fields, row, col)
                # No extra features available for this pair: drop it.
                if not local_features:
                    continue
                features = np.concatenate([features, local_features])

            weight = adj_row[col]
            target = pos_data if weight > 0 else neg_data
            target.append((features, weight))

    return pos_data, neg_data

In [4]:
# Input files for the first graph; its samples feed the train/validation split below.
fname = './dataset/0222/0.txt'
fname_extended = './dataset/0222/1.txt'

# Compute adjacency + role features, then build per-pair samples split into
# linked (pos) and unlinked (neg) pairs. agg_flag is left at its default (False),
# so only the role-based features are used.
adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields = get_rolx(fname, fname_extended)
pos_data, neg_data = get_features(adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields)

Creating Vertex Features matrix


  x_star, residuals, rank, s = lstsq(A, w)


V is a 3356 by 485 matrix.
Node-role matrix is of dimensions 3356 by 3
[[0.00126998 0.23255645 0.0053801 ]
 [0.         0.23278286 0.00700886]
 [0.         0.23278286 0.00700886]
 ...
 [0.         0.01115651 0.02619573]
 [0.         0.01115651 0.02619573]
 [0.         0.01244527 0.02430484]]
[[4.52246136e+04 1.42941075e-03 2.10000000e+01 ... 3.28100190e-04
  3.28100190e-04 2.10000000e+01]
 [1.63742690e-01 1.42797251e-03 2.00000000e+01 ... 3.07114183e-04
  3.07114183e-04 2.00000000e+01]
 [1.63742690e-01 1.42797251e-03 2.00000000e+01 ... 3.07114183e-04
  3.07114183e-04 2.00000000e+01]
 ...
 [0.00000000e+00 2.99759763e-04 4.00000000e+00 ... 1.75086647e-04
  1.75086647e-04 4.00000000e+00]
 [0.00000000e+00 2.99759763e-04 4.00000000e+00 ... 1.75086647e-04
  1.75086647e-04 4.00000000e+00]
 [0.00000000e+00 2.99759790e-04 5.00000000e+00 ... 2.10005872e-04
  2.10005872e-04 5.00000000e+00]]
[[0.00407925 0.         0.01161068 0.         0.         0.
  0.         0.01161172]
 [0.00833944 0.0513434

In [5]:
# Input files for the second graph; its samples are used as the test set below.
fname_test = './dataset/080327/0.txt'
fname_test_extended = './dataset/080327/1.txt'

# Same pipeline as for the first graph, kept in separate *_test variables.
adj_mat_test, H_test, video_dict_list_test, graph_to_dict_test, neighbors_test, fields_test = get_rolx(fname_test, fname_test_extended)
pos_data_test, neg_data_test = get_features(adj_mat_test, H_test, video_dict_list_test, graph_to_dict_test, neighbors_test, fields_test)

Creating Vertex Features matrix
V is a 4330 by 408 matrix.
Node-role matrix is of dimensions 4330 by 3
[[2.18967024 0.32158417 0.0033941 ]
 [2.43212908 0.25124375 0.01179091]
 [1.45787963 0.45599134 0.01184172]
 ...
 [0.21869531 0.17029455 0.1291226 ]
 [0.54564185 0.15036211 0.15531668]
 [0.27499615 0.19094136 0.13176154]]
[[1.27113060e+02 2.38829560e-04 3.30000000e+01 ... 3.89347401e-04
  3.89347401e-04 3.30000000e+01]
 [1.60207092e+02 2.38829612e-04 3.70000000e+01 ... 4.32542313e-04
  4.32542313e-04 3.70000000e+01]
 [1.80684843e+02 2.38829428e-04 2.30000000e+01 ... 2.86518924e-04
  2.86518924e-04 2.30000000e+01]
 ...
 [1.86289381e+03 4.83376818e-04 1.10000000e+01 ... 2.02291225e-04
  2.02291225e-04 1.10000000e+01]
 [1.26525601e+03 4.83377303e-04 2.00000000e+01 ... 2.92377077e-04
  2.92377077e-04 2.00000000e+01]
 [1.51425951e+02 4.83314162e-04 1.20000000e+01 ... 1.93842442e-04
  1.93842442e-04 1.20000000e+01]]
[[0.         0.         0.02432228 0.         0.         0.
  0.         0.

In [6]:
def split_data_balanced(pos_data, neg_data, n_features=3):
    """Build a class-balanced dataset and split it 80/20 into train and test.

    All positive samples are kept and an equally sized random subset of the
    negatives is drawn (using the global ``random`` state). If there are fewer
    negatives than positives, every negative is used instead of raising.

    Args:
        pos_data: sequence of ``(features, label)`` positive samples.
        neg_data: sequence of ``(features, label)`` negative samples.
        n_features: number of leading feature columns to keep (default 3,
            the value previously hard-coded). ``None`` keeps every column.

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    X = [sample[0] for sample in pos_data]
    Y = [sample[1] for sample in pos_data]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the draw at the number of negatives available.
    n_neg = min(len(X), len(neg_data))
    random_indices = sorted(random.sample(range(len(neg_data)), n_neg))
    X.extend(neg_data[i][0] for i in random_indices)
    Y.extend(neg_data[i][1] for i in random_indices)

    X_array = np.array(X)
    Y_array = np.array(Y)
    print('{} {}'.format(X_array.shape, Y_array.shape))

    # Keep only the leading n_features columns.
    X_array = X_array[:, :n_features]
    print(X_array.shape)

    return train_test_split(X_array, Y_array, test_size=0.2, random_state=42)

In [43]:
def split_data(pos_data, neg_data, n_features=3, test_size=0.01):
    """Split all samples into train/test, then balance only the training part.

    The hold-out part keeps the natural class distribution. The training part
    is down-sampled so that label 0 appears as often as label 1 (using the
    global ``random`` state). If there are fewer 0-labelled than 1-labelled
    training samples, all of them are kept instead of raising.

    Args:
        pos_data: sequence of ``(features, label)`` positive samples.
        neg_data: sequence of ``(features, label)`` negative samples.
        n_features: number of leading feature columns to keep (default 3,
            the value previously hard-coded). ``None`` keeps every column.
        test_size: hold-out fraction passed to ``train_test_split``
            (default 0.01, the value previously hard-coded).

    Returns:
        ``X_train, X_test, y_train, y_test`` where the train arrays are the
        balanced subset and the test arrays are the untouched hold-out part.
    """
    X = [sample[0] for sample in pos_data]
    Y = [sample[1] for sample in pos_data]
    X.extend(sample[0] for sample in neg_data)
    Y.extend(sample[1] for sample in neg_data)

    X_array = np.array(X)[:, :n_features]
    Y_array = np.array(Y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_array, Y_array, test_size=test_size, random_state=42)

    print('test zero vals {} test one vals {}'.format(
        np.count_nonzero(y_test == 0), np.count_nonzero(y_test == 1)))

    # NOTE(review): only labels exactly equal to 0 or 1 are retained for
    # training; any other label value is dropped here — confirm labels are binary.
    zero_idx = np.flatnonzero(y_train == 0)
    one_idx = np.flatnonzero(y_train == 1)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the draw at the number of 0-labelled samples available.
    n_zero = min(len(one_idx), len(zero_idx))
    sampled_zero_idx = zero_idx[sorted(random.sample(range(len(zero_idx)), n_zero))]
    keep = np.concatenate([sampled_zero_idx, one_idx])

    X_train = X_train[keep]
    y_train = y_train[keep]
    print('{} {}'.format(X_train.shape, y_train.shape))

    print('train zero vals {} train one vals {}'.format(
        np.count_nonzero(y_train == 0), np.count_nonzero(y_train == 1)))
    return X_train, X_test, y_train, y_test

In [None]:
# First graph: balanced training set, plus the hold-out slice that split_data
# returns untouched, used here as the validation set.
X_train, X_val, y_train, y_val = split_data(pos_data, neg_data)
# Second graph: only the hold-out slice is kept as the test set; the training
# portion returned for this graph is discarded.
_, X_test, _, y_test = split_data(pos_data_test, neg_data_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so reruns build the same forest.
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                              random_state=0)
clf.fit(X_train, y_train)

print('random forest')
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)
# Persist the test-set predictions; note the logistic-regression cell writes
# to this same path.
np.savetxt('dataset/results.txt', test_predictions)

In [None]:
# random_state is fixed so reruns give the same fit.
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)
print('logistic regression')

# Predict on all three splits and report the metrics.
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)
# Persist the test-set predictions; note the random-forest cell writes to
# this same path.
np.savetxt('dataset/results.txt', test_predictions)

In [None]:
from sklearn import svm

# The estimator must be bound to `clf`; previously it was constructed and
# discarded, so this cell silently refit and scored the model left over from
# whichever cell ran before it.
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

print('svm_rbf')
# Predict on all three splits and report the metrics.
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)

In [None]:
from sklearn import svm

# The estimator must be bound to `clf`; previously it was constructed and
# discarded, so this cell silently refit and scored the model left over from
# whichever cell ran before it.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

print('svm linear')
# Predict on all three splits and report the metrics.
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

print('knn')
# Predict on all three splits and report the metrics. The thresholded
# train_preds/test_preds lists were never used and have been dropped.
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)

In [None]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()
clf.fit(X_train, y_train)

print('naive bayes')
# Predict on all three splits and report the metrics. The thresholded
# train_preds/test_preds lists were never used and have been dropped.
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)