In [3]:
###############################################################################
##                  author: Mohsen Mesgar
###############################################################################
import numpy as np
import multiprocessing as mp
import networkx as nx
from gensim.models import Word2Vec
from itertools import chain, combinations
from collections import defaultdict
import sys, copy, time, math, pickle
import cPickle as cpickle
import itertools
import scipy.io
import pynauty
from scipy.spatial.distance import pdist, squareform
import glob
import os
import re
import sys, getopt
import scipy as sp
import matplotlib.pyplot as plt
from subgraph_struc import read, write,get_canonical_map,draw, graph_to_adj_matrix as graph2am, recover_graph, draw
from graph_set import read_graph_set as read_gs

import torch
torch.cuda.set_device(0)

In [4]:
import sys
def drawProgressBar(shell_out,
                    begin, k, out_of, end, barLen=25):
    """Build a one-line text progress bar and optionally write it to stdout.

    Args:
        shell_out: the bar is written to stdout only when this equals True.
        begin: text placed before the ``k/out_of`` counter.
        k: number of finished items.
        out_of: total number of items (must be non-zero).
        end: text placed after the percentage.
        barLen: number of characters inside the square brackets.

    Returns:
        The formatted progress line.
    """
    fraction = k / float(out_of)
    # The carriage return is emitted unconditionally, even when shell_out is off.
    sys.stdout.write("\r")
    head = int(barLen * fraction)

    def cell(position):
        # '=' for completed cells, a single '>' at the head, '_' for the rest.
        if position < head:
            return "="
        if position == head:
            return ">"
        return "_"

    bar = "".join(cell(position) for position in range(barLen))
    text = "%s%d/%d[%s](%.2f%%)%s" % (begin, k, out_of, bar, fraction * 100, end)
    if shell_out == True:
        sys.stdout.write(text)
        sys.stdout.flush()
    return text

In [5]:
###############################################################################
##                   get pattern canonical map without node order
###############################################################################
def get_canonical_map(g):
    """Return the pynauty certificate of graph ``g``, or '' for an empty graph.

    Every vertex is placed in its own colour class before the certificate is
    computed.  According to the original author's note, the uncoloured call
    produced the same certificate for patterns that differ only in which node
    carries the edge (e.g. ``0-->1 2``, ``0 1-->2``, ``0-->2 1``); the
    per-vertex colouring is what makes node order count.

    Args:
        g: a networkx graph (built as ``nx.DiGraph`` by the callers here).

    Returns:
        The certificate returned by ``pynauty.certificate``, or the empty
        string when ``g`` has no nodes.
    """
    num_nodes = len(g.nodes())
    if num_nodes == 0:
        return ''

    window = np.array(nx.adjacency_matrix(g).todense())
    # Neighbours of each vertex taken from its adjacency row, self-loops dropped.
    adj_mat = {idx: [i for i in list(np.where(edge)[0]) if i != idx]
               for idx, edge in enumerate(window)}

    # One singleton colour class per vertex.  The original sized this list via
    # g.nodes(0), i.e. an accidental positional `data` flag; the node count is
    # all that is needed.
    tmp = pynauty.Graph(number_of_vertices=num_nodes, directed=True,
                        adjacency_dict=adj_mat,
                        vertex_coloring=[set([t]) for t in range(num_nodes)])
    return pynauty.certificate(tmp)

In [6]:
###############################################################################
##                               read graph maps
###############################################################################
def get_maps(can_map_file, count_file):
    """Load the canonical map and the normalised, child-keyed weight map.

    Args:
        can_map_file: path handed to ``read``; expected to hold
            ``{canonical string id: {"graph", "idx", "n"}}``.
        count_file: path handed to ``read``; expected to hold
            ``{parent id: {child id: weight, ...}}``.

    Returns:
        ``(canonical_map, weight_map)`` where ``weight_map`` is inverted to
        ``{child id: {parent id: normalised weight}}``.  Each parent's weights
        are divided by the sum of that parent's child weights.

    Raises:
        ZeroDivisionError: if a parent has children whose weights sum to zero.
    """
    # canonical_map -> {canonical string id: {"graph", "idx", "n"}}
    canonical_map = read(can_map_file)

    # raw weights -> {parent id: {child1: weight1, ...}}
    raw_weights = read(count_file)

    child_map = {}
    for parent, children in raw_weights.items():
        # Computed once per parent instead of once per child.
        total = float(sum(children.values()))
        for child, weight in children.items():
            child_map.setdefault(child, {})[parent] = weight / total
    return canonical_map, child_map

In [7]:
###############################################################################
##                  compute the base probability
###############################################################################
def pb(graph_id, weight_map):
    """Base probability of pattern ``graph_id`` from its parents' probabilities.

    NOTE: a later cell of this notebook defines ``pb(wm, parent_kn, pattern_id)``
    with a different signature; once that cell runs it replaces this function.

    Args:
        graph_id: pattern id; id 0 is the root and has probability 1 (the same
            base case the later ``pb`` uses).
        weight_map: ``{child id: {parent id: weight}}``.

    Returns:
        Sum over parents of ``weight * pb(parent)``.

    Raises:
        KeyError: if a non-root id has no entry in ``weight_map``.
    """
    if graph_id == 0:
        return 1
    total = 0
    for parent_id, weight in weight_map[graph_id].items():
        # Accumulate: the original assigned here, keeping only the last parent.
        total += weight * pb(parent_id, weight_map)
    return total

In [8]:
import random
random.seed(1)
###############################################################################
##                compute the count of each pattern in each graph
###############################################################################
def pattern_counter_in_graph(inputs):
    """Count the patterns found in randomly sampled node subsets of one graph.

    Args:
        inputs: sequence ``(gidx, graph, min_pattern_size, max_pattern_size,
            samplesize, canonical_map)``.  ``canonical_map`` maps a certificate
            (as returned by ``get_canonical_map``) to a dict with an ``"idx"``
            entry.

    Returns:
        ``(gidx, count_map)`` where ``count_map`` maps pattern id to the number
        of sampled subsets that induced that pattern.  For a pattern size larger
        than the graph, the fallback id for that size gets ``samplesize``.
    """
    # Local import: the notebook never imports scipy.misc explicitly, and
    # scipy.misc.comb no longer exists in current SciPy releases.
    from scipy.special import comb

    gidx, graph, min_pattern_size, max_pattern_size, samplesize, canonical_map = inputs[:6]

    # In case we don't observe any graphlet in the graph, fall back to the
    # graphlet id that (per the original note) has zero edges at that size.
    fallback_map = {1: 1, 2: 3, 3: 11, 4: 75, 5: 1099, 6: 13901}

    # Kept from the original: re-seeds NumPy's global RNG on every call.  The
    # subsets below are drawn with the stdlib `random` module, not NumPy.
    np.random.seed(1)

    am = graph2am(graph)
    graph_size = len(am)

    # count_map = {pattern id: absolute count, ...}
    count_map = {}

    for pattern_size in range(min_pattern_size, max_pattern_size + 1):
        if graph_size < pattern_size:
            # Graph too small for this pattern size: fall back.
            count_map[fallback_map[pattern_size]] = samplesize
            continue

        # Number of distinct node subsets of this size; caps the sample count.
        ub = comb(graph_size, pattern_size)
        seen = set()
        sample_set = []
        # Strict '<': collect exactly `samplesize` distinct subsets (the
        # original '<=' collected one extra, inconsistent with the fallback).
        while len(sample_set) < samplesize and len(sample_set) < ub:
            nodes = sorted(random.sample(range(graph_size), pattern_size))
            key = tuple(nodes)
            # Set lookup instead of scanning the whole sample list each draw.
            if key not in seen:
                seen.add(key)
                sample_set.append(nodes)

        for s in sample_set:
            # Adjacency sub-matrix induced by the sampled nodes, turned into a
            # graph so its canonical form can be looked up.
            window = am[np.ix_(s, s)]
            pattern = nx.DiGraph(window)
            g_type = canonical_map[get_canonical_map(pattern)]["idx"]

            # increment the count of the seen graphlet
            count_map[g_type] = count_map.get(g_type, 0) + 1.0

    return (gidx, count_map)

In [9]:
###############################################################################
##           compute the count of subgraphs for each graph in graph set
## graph_set: a dictionary of graphs and their id. {idx1:graph1, idx2:graph2,...}
###############################################################################
from joblib import Parallel, delayed
def count_subgraphs(graph_set_file, min_pattern_size, max_pattern_size, sample_size, can_map, output_file):
    """Count sampled patterns for every graph of a graph set and save the result.

    Args:
        graph_set_file: path passed to ``read_graph_set``; the loaded set is
            iterated as ``{graph id: {'graph': ..., ...}}``.
        min_pattern_size, max_pattern_size: inclusive range of pattern sizes.
        sample_size: number of samples requested per graph and pattern size.
        can_map: canonical map forwarded to ``pattern_counter_in_graph``.
        output_file: path handed to ``write`` for the resulting map.

    Returns:
        ``{graph id: {pattern id: count}}``.
    """
    print("loading the graph_set_file ...: %s" % graph_set_file)
    graph_set = read_graph_set(graph_set_file)
    print("# graphs in graph_set: %d" % len(graph_set))

    print("start counting patterns ...")
    jobs = [(gidx, value['graph'], min_pattern_size, max_pattern_size, sample_size, can_map)
            for gidx, value in graph_set.items()]
    total = len(jobs)

    # Graphs are processed one after another; a joblib-based parallel variant
    # was tried in the original cell and left disabled.
    graph_map = {}
    for done, job in enumerate(jobs, 1):
        gidx, counts = pattern_counter_in_graph(job)
        # which patterns occur how often in this graph
        graph_map[gidx] = counts
        drawProgressBar(shell_out=True, begin="", k=done, out_of=total, end="")

    write(graph_map, output_file)
    print("\ngraph_map is saved here : %s" % output_file)
    return graph_map
 

In [10]:
   
###############################################################################
##                               read graph set
###############################################################################
def read_graph_set(graph_set_file):
    """Load a graph set from ``graph_set_file``.

    Thin alias for ``graph_set.read_graph_set`` (imported as ``read_gs``); the
    loaded object is returned unchanged.  Callers in this notebook iterate it
    with ``.items()`` and read ``value['graph']`` from each entry.
    """
    return read_gs(graph_set_file)

In [11]:
###############################################################################
##                      find the ids of all patterns with ps nodes
## ps: pattern size
###############################################################################
def k_node_graphs(can_map, ps):
    """Ids of all weakly connected patterns that have exactly ``ps`` nodes.

    Args:
        can_map: ``{canonical id: {'graph': ..., 'idx': ..., 'n': ...}}``.
        ps: pattern size (number of nodes).

    Returns:
        A set of pattern ids (the ``'idx'`` values).
    """
    selected = set()
    for entry in can_map.values():
        if entry['n'] == ps:
            # Connectivity is only checked for patterns of the requested size.
            if nx.is_weakly_connected(get_subgraph(entry['idx'], can_map)):
                selected.add(entry['idx'])
    return selected

In [12]:
###############################################################################
##         compute the sum over all count of k-node subgraphs
## can_map = {graph_canonical_map:{'graph':..., 'idx':,..., 'n':....}}
## k: k-node subgraphs, it shows the depth of the tree himap
###############################################################################
def z(all_knode_patterns, pat_cnt):
    """Normaliser: 0.1 plus the summed counts of the given k-node patterns.

    Args:
        all_knode_patterns: container of pattern ids to include.
        pat_cnt: ``{pattern id: count}`` for one graph.

    Returns:
        ``float(0.1 + sum of the selected counts)``.  The 0.1 offset keeps the
        value positive when none of the patterns was counted.
    """
    observed = 0
    for pattern_id, cnt in pat_cnt.items():
        if pattern_id in all_knode_patterns:
            observed = observed + cnt
    return float(0.1 + observed)

In [13]:
###############################################################################
##                 n_c: number of patterns with exactly count e
###############################################################################
def n(e, pat_cnt):
    """Number of patterns whose count equals ``e`` exactly, as a float.

    Args:
        e: the count value to look for.
        pat_cnt: ``{pattern id: count}``.
    """
    # list(...) so this also works where dict.values() is a view object that
    # has no .count method (Python 3); on Python 2 it is just a list copy.
    return float(list(pat_cnt.values()).count(e))

In [14]:
###############################################################################
##                       compute discount value d
## n_c:number of patterns with exactly count c
###############################################################################
def disc(c, pat_cnt):
    """Discount value for a pattern that was observed ``c`` times.

    With ``n_k`` the number of patterns counted exactly ``k`` times and
    ``Y = n1 / (n1 + 2*n2)``:

        c == 0 -> 0
        c == 1 -> Y
        c == 2 -> 2 - 3*Y*(n3/n2)
        c >= 3 -> 2 - 3*Y*(n4/n3)

    NOTE(review): only the precedence bug in Y was fixed here.  The c == 1 and
    c >= 3 formulas are kept as written; modified Kneser-Ney is usually stated
    as D1 = 1 - 2*Y*(n2/n1) and D3+ = 3 - 4*Y*(n4/n3) -- confirm which is meant.

    Args:
        c: observed count of the pattern.
        pat_cnt: ``{pattern id: count}``.

    Raises:
        ZeroDivisionError: for c > 0 when a required count-of-counts
            denominator is zero.
    """
    if c == 0:
        # Nothing to discount; return before any division can fail.
        return 0

    counts = list(pat_cnt.values())
    n1 = float(counts.count(1))
    n2 = float(counts.count(2))
    n3 = float(counts.count(3))
    n4 = float(counts.count(4))

    # The original `float(n1) / n1+2*n2` parsed as (n1/n1) + 2*n2.
    y = n1 / (n1 + 2 * n2)
    if c == 1:
        return y
    if c == 2:
        return 2 - 3 * y * (n3 / n2)
    return 2 - 3 * y * (n4 / n3)
 

In [15]:
###############################################################################
##               compute probability based on the frequency
###############################################################################
def pf(pattern_idx,pattern_count, d, z_value):
    """Discounted relative frequency of one pattern.

    Args:
        pattern_idx: pattern id.
        pattern_count: ``{pattern id: count}``; a missing id counts as 0.
        d: discount subtracted from the count (floored at 0).
        z_value: normaliser.

    Returns:
        ``max(count - d, 0) / z_value`` as a float.
    """
    observed = pattern_count.get(pattern_idx, 0)
    discounted = max(observed - d, 0)
    return float(discounted) / float(z_value)

In [16]:
###############################################################################
##                 Normalization factor for base probability
###############################################################################
def norm_fact(all_knode_patterns, pattern_count, d):
    """Summarise the counts of the given patterns relative to discount ``d``.

    Args:
        all_knode_patterns: container of pattern ids to consider.
        pattern_count: ``{pattern id: count}``.
        d: discount value.

    Returns:
        ``(num_nn, b)``: how many considered patterns have ``count >= d``, and
        the sum of the counts that are ``< d``.
    """
    num_nn = 0
    b = 0
    for pattern_id, cnt in pattern_count.items():
        if pattern_id not in all_knode_patterns:
            continue
        if cnt >= d:
            num_nn += 1
        if cnt < d:
            b = b + cnt
    return num_nn, b
   

In [17]:
 
###############################################################################
##                               Mass value
###############################################################################
def mass(d, z_value, norm_fact, bounes):
    """Mass multiplied onto the base probability in ``pkn``.

    Returns ``(d / z_value) * norm_fact + bounes / z_value``; ``pkn`` passes the
    two values returned by ``norm_fact()`` as ``norm_fact`` and ``bounes``.
    """
    discounted_share = (d / z_value) * norm_fact
    leftover_share = bounes / z_value
    return discounted_share + leftover_share

In [18]:
###############################################################################
##                       compute base probability of a pattern
## pb('')=pb(1)=1 because those occur in every possible graph
###############################################################################
def pb(wm, parent_kn,  pattern_id):
    """Base probability of a pattern, computed recursively from its parents.

    Args:
        wm: ``{pattern id: {parent id: weight}}``.
        parent_kn: passed through the recursion but never read here.
        pattern_id: pattern whose base probability is wanted; id 0 is the
            base case and has probability 1.

    Returns:
        Sum over parents of ``pb(parent) * weight`` (0 if there are no parents).
    """
    if pattern_id == 0:
        return 1
    return sum(pb(wm, parent_kn, parent_id) * weight
               for parent_id, weight in wm[pattern_id].items())

In [19]:
###############################################################################
##           KN probability of the given pattern in the given graph
## pattern count == pc[graph_id]
## ps is pattern_size= number of nodes
###############################################################################
def pkn(can_map, pattern_count, w_map,parent_kn, pattern_idx, pattern_size, d, z_value, all_knode_patterns):
    """Smoothed probability of one pattern in one graph.

    Starts from the discounted frequency ``pf(...)``.  When ``d`` is non-zero,
    the base probability ``pb(...)`` weighted by ``mass(...)`` is added.

    Args:
        can_map, pattern_size: accepted but not used in the body.
        pattern_count: ``{pattern id: count}`` of the graph.
        w_map: weight map forwarded to ``pb``.
        parent_kn: dict that receives the result under ``pattern_idx``.
        pattern_idx: pattern id.
        d: discount value.
        z_value: normaliser for this graph.
        all_knode_patterns: pattern ids considered by ``norm_fact``.

    Returns:
        The probability, which is also stored in ``parent_kn[pattern_idx]``.
    """
    prob = pf(pattern_idx, pattern_count, d, z_value)
    if not (d==0):
        base_prob = pb(w_map, parent_kn, pattern_idx)
        mass_factor, bonus = norm_fact(all_knode_patterns, pattern_count, d)
        prob = prob + (mass(d, z_value, mass_factor, bonus) * base_prob)

    parent_kn[pattern_idx] = prob
    return prob
    

In [20]:
###############################################################################
##                          compute graph vector
## pc : pattern count in each graph of graph_set
###############################################################################
def get_graph_vector(pc, can_map, wei_map,parent_kn, number_nodes, d):
    """Compute, for every graph, the smoothed probability of each k-node pattern.

    Args:
        pc: ``{graph id: {pattern id: count}}``.
        can_map: canonical map used to enumerate the k-node patterns.
        wei_map: weight map forwarded to ``pkn``.
        parent_kn: dict forwarded to ``pkn``, which writes each result into it.
        number_nodes: pattern size k.
        d: discount value.

    Returns:
        ``{graph id: {pattern id: probability}}``.
    """
    graph_vectores = {}
    # Enumerated once: the pattern set does not depend on the graph, and the
    # original rebuilt it inside the loop for every graph.
    all_knode_patterns = k_node_graphs(can_map, number_nodes)
    for graph_id, patt_cnt in pc.items():
        z_value = z(all_knode_patterns, patt_cnt)
        graph_vectores[graph_id] = {
            pid: pkn(can_map, patt_cnt, wei_map, parent_kn, pid, number_nodes,
                     d, z_value, all_knode_patterns)
            for pid in all_knode_patterns}
    return graph_vectores

In [21]:
###############################################################################
##                      find a graph in the can_map
###############################################################################
def get_subgraph(gidx, can_map):
    """Rebuild the pattern graph whose id is ``gidx``.

    Args:
        gidx: pattern id (compared after ``int(gidx)``).
        can_map: ``{canonical id: {'graph': ..., 'idx': ..., 'n': ...}}``.

    Returns:
        Whatever ``recover_graph(graph, n, gidx)`` returns for the first entry
        with a matching ``'idx'``.

    Raises:
        IndexError: if no entry has that id.
    """
    candidates = []
    for entry in can_map.values():
        if entry['idx'] == int(gidx):
            candidates.append(entry)
    first = candidates[0]
    return recover_graph(first['graph'], first['n'], gidx)

In [22]:
###############################################################################
##                 data points for classification
###############################################################################    
def data_points(hs, m):
    """Build pairwise classification instances from scores and feature rows.

    For every pair ``i < j`` (1-based keys of ``hs``) whose scores differ by
    more than 0.5, one instance is emitted: row ``i-1`` of ``m``, then row
    ``j-1`` of ``m``, then the label (+1 if ``hs[i] > hs[j]``, otherwise -1).

    Args:
        hs: scores indexable by 1..len(hs).
        m: 2-D matrix whose row slices give a nested list from ``.tolist()``
            (presumably a ``numpy.matrix`` -- confirm against callers).

    Returns:
        List of instances, each a flat list ending with its label.
    """
    instances = []
    for i, j in combinations(range(1, len(hs) + 1), 2):
        d = hs[i] - hs[j]
        if not (math.fabs(d) > 0.5):
            continue
        label = +1 if d > 0 else -1
        instances.append(m[i - 1, :].tolist()[0] + m[j - 1, :].tolist()[0] + [label])
    return instances

In [23]:
def get_count_of_connected_patterns_of_a_graph(pc_graph, can_map):
    """List the ``(pattern id, count)`` pairs of one graph's connected patterns.

    Args:
        pc_graph: ``{pattern id: count}`` for a single graph.
        can_map: canonical map used to rebuild each pattern graph.

    Returns:
        Pairs for those ids whose pattern graph is weakly connected, in the
        iteration order of ``pc_graph``.
    """
    return [(idx, pc_graph[idx])
            for idx in pc_graph.keys()
            if nx.is_weakly_connected(get_subgraph(idx, can_map))]

In [24]:
class count_matrix(object):
    """A graphs-by-patterns count array together with its row and column ids."""

    def __init__(self, name, pattern_ids, graph_ids,count_matrix):
        # name: label of the matrix (callers pass the pattern size)
        # pattern_ids: pattern id of each column
        # graph_ids: graph ids covered by the rows
        # count_matrix: the count array itself
        self.pattern_ids = pattern_ids
        self.graph_ids = graph_ids
        self.count_matrix = count_matrix
        self.name = name

    def display_patterns(self, can_map):
        """Print id, nodes and edges of every column pattern."""
        # Single-argument print(...) calls: same output on Python 2, and not a
        # SyntaxError on Python 3 the way the print statement is.
        for idx in self.pattern_ids:
            g = get_subgraph(idx,can_map)
            print("idx: %d" % idx)
            print("nodes : %s" % g.nodes())
            print("edges : %s" % g.edges())
            print("------")

In [25]:
# Load both graph sets and keep one adjacency matrix per graph.
# Labels: 1 for graphs from the first file (orig), 0 for the second (trans).
# max_len ends up as the largest number of rows seen in any matrix; later
# cells use it as the padding size.
x_orig = []
y_orig = []
x_tran = []
y_tran = []
max_len = 0
for gs_id,graph_set_file in enumerate(["./orig_graph_set.g","./trans_graph_set.g"]):

    ## read graph_set
    print "loading the graph_set_file ...: %s"%graph_set_file
    graph_set = read_graph_set(graph_set_file)
    print "# graphs in graph_set: %d"%len(graph_set)
    
    
    
    for gidx, value in graph_set.items():
        graph = value['graph']
        # adjacency matrix of this graph as produced by networkx
        D = nx.to_numpy_matrix(graph)
        max_len = np.max([max_len,D.shape[0]])
        if gs_id ==0:
            x_orig.append(D)
            y_orig.append(1)
        else:
            x_tran.append(D)
            y_tran.append(0)


loading the graph_set_file ...: ./orig_graph_set.g
# graphs in graph_set: 5000
loading the graph_set_file ...: ./trans_graph_set.g
# graphs in graph_set: 5000


In [26]:
# Zero-pad every adjacency matrix to (max_len, max_len).
# NOTE: this allocates one dense max_len x max_len float array per graph.
x_orig_padded = []
x_tran_padded = []
# Each source list is paired with its destination list, so no loop index is
# needed (the original index was named `id`, which shadowed the builtin for
# the rest of the notebook).
for x_list, padded_list in [(x_orig, x_orig_padded), (x_tran, x_tran_padded)]:
    for x in x_list:
        x_padded = np.zeros((max_len,max_len))
        x_padded[:x.shape[0],:x.shape[1]] = x
        padded_list.append(x_padded)


In [27]:
# Sanity check: a padded matrix should have shape (max_len, max_len).
print x_orig_padded[0].shape

(703, 703)


In [67]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Single-channel feature extractor: 3x3 convolution, ReLU, 2x2 max-pooling."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3)
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        """Map (batch, 1, L, L) to (batch, 1, (L-2)//2, (L-2)//2).

        The un-padded 3x3 convolution shrinks each side from L to L-2; the
        2x2 pooling then halves it, rounding down.
        """
        activated = F.relu(self.conv1(x))   # (batch, 1, L-2, L-2)
        return self.pool(activated)         # (batch, 1, (L-2)//2, (L-2)//2)


# net = Net()
# batch_size = 2
# x = torch.from_numpy(np.array(x_orig_padded[:batch_size]))
# x = Variable(x).type(torch.FloatTensor)
# print x.size()
# x = x.view(batch_size,1,max_len,max_len)
# print x.size()
# out_cnn = net(x)
# print out_cnn.size()

In [29]:
class Classifier(nn.Module):
    """Linear two-class head that returns log-probabilities."""

    def __init__(self,input_size):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.linear = nn.Linear(self.input_size,2)

    def forward(self, x):
        """Return log-softmax scores for a batch of flattened feature vectors."""
        # No explicit `dim` is passed, as in the original call.
        # NOTE(review): newer PyTorch versions warn about the implicit dimension.
        return F.log_softmax(self.linear(x))
    

# out_cnn = out_cnn.view(batch_size,-1)
# cls = classifier(350*350)
# out_classifier = cls(out_cnn)

In [30]:
import numpy as np
from sklearn.model_selection import RepeatedKFold

x = x_orig + x_tran
# Use the labels collected alongside the matrices (1 = orig, 0 = trans)
# instead of assuming exactly 5000 graphs per set.
y = y_orig + y_tran

#shuffle the data
tmp = list(zip(x, y))
rng = np.random.RandomState(0)
rng.shuffle(tmp)
x, y = zip(*tmp)

# NOTE(review): the matrices have different shapes, so this relies on NumPy
# building an array of objects; recent NumPy versions may require
# dtype=object for ragged input -- confirm on upgrade.
x= np.array(x)
y= np.array(y)

# do cross_validation
rng = np.random.RandomState(0)
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=rng)
# for train, test in splits:
#      print("%s %s" % (train, test))
# for train, test in splits:
#      print("%s %s" % (train, test))

In [31]:
# Materialise the (train indices, test indices) pairs of the CV splitter.
# NOTE: a later cell runs `from sklearn import svm, datasets`, which rebinds
# the name `datasets` to the sklearn module.
datasets = list(rkf.split(x))

In [52]:
def train_model(X,Y, batch_size,max_len, cnn_model, cls_model, optimizer):
    """Run one training epoch and return the mean batch loss.

    Samples beyond the last full batch are ignored.

    Args:
        X: sequence of 2-D adjacency matrices (sizes may differ per sample).
        Y: sequence of integer class labels aligned with ``X``.
        batch_size: number of samples per optimisation step.
        max_len: side length every matrix is zero-padded to.
        cnn_model: module applied to the (batch, 1, max_len, max_len) input.
        cls_model: module applied to the flattened ``cnn_model`` output.
        optimizer: optimizer holding the parameters of both modules.

    Returns:
        Accumulated ``loss.data`` divided by the number of batches (a tensor).

    Raises:
        ZeroDivisionError: if ``X`` holds fewer than ``batch_size`` samples.
    """
    # Floor division: same result as `/` on Python 2 ints, and still an int
    # (usable in range() and slices) on Python 3.
    num_batches = len(X) // batch_size
    # Queried through `torch` directly; the original relied on a bare `cuda`
    # name that is only imported by a later cell.
    use_cuda = torch.cuda.is_available()
    epoch_loss = 0.0

    for batch_index in range(num_batches):
        start = batch_index * batch_size
        x_batch = X[start:start + batch_size]
        y_batch = Y[start:start + batch_size]

        optimizer.zero_grad()

        # Zero-pad each matrix into the top-left corner of its slot.
        x_padded = np.zeros((batch_size, max_len, max_len))
        for row, sample in enumerate(x_batch):
            x_padded[row, :sample.shape[0], :sample.shape[1]] = sample

        x = Variable(torch.from_numpy(x_padded)).type(torch.FloatTensor)
        x = x.view(batch_size, 1, max_len, max_len)
        y = Variable(torch.from_numpy(np.array(y_batch))).type(torch.LongTensor)
        if use_cuda:
            x = x.cuda()
            y = y.cuda()

        cnn_out = cnn_model(x).view(batch_size, -1)
        out_classifier = cls_model(cnn_out)

        # compute the loss function or criteria
        # NOTE(review): F.cross_entropy applies log-softmax itself; if
        # cls_model already returns log-probabilities, F.nll_loss is the
        # usual pairing -- confirm before changing.
        loss = F.cross_entropy(out_classifier, y)
        epoch_loss += loss.data

        # update parameters
        loss.backward()
        optimizer.step()

    return epoch_loss/num_batches

In [65]:
def evaluate(X,Y,cnn_model,cls_model, batch_size=2, pad_len=None):
    """Classification accuracy of the CNN + classifier pair on ``X``/``Y``.

    Samples beyond the last full batch are ignored.

    Args:
        X: sequence of 2-D adjacency matrices (sizes may differ per sample).
        Y: sequence of 0/1 labels aligned with ``X``.
        cnn_model: module applied to the (batch, 1, pad_len, pad_len) input.
        cls_model: module applied to the flattened ``cnn_model`` output.
        batch_size: number of samples scored per forward pass.
        pad_len: side length every matrix is zero-padded to.  When omitted,
            the notebook-level ``max_len`` is used, as in the original.

    Returns:
        Fraction of evaluated samples whose top-scoring class equals the label.

    Raises:
        ZeroDivisionError: if ``X`` holds fewer than ``batch_size`` samples.
    """
    if pad_len is None:
        pad_len = max_len

    # Floor division keeps this an int on Python 3 as well.
    num_batches = len(X) // batch_size
    # Queried through `torch` directly; the original relied on a bare `cuda`
    # name that is only imported by a later cell.
    use_cuda = torch.cuda.is_available()

    corrects = 0.0
    num_test_samples = 0.0
    for batch_index in range(num_batches):
        start = batch_index * batch_size
        x_batch = X[start:start + batch_size]
        y_batch = Y[start:start + batch_size]

        # Zero-pad each matrix into the top-left corner of its slot.
        x_padded = np.zeros((batch_size, pad_len, pad_len))
        for row, sample in enumerate(x_batch):
            x_padded[row, :sample.shape[0], :sample.shape[1]] = sample

        x = Variable(torch.from_numpy(x_padded)).type(torch.FloatTensor)
        x = x.view(batch_size, 1, pad_len, pad_len)
        if use_cuda:
            x = x.cuda()

        cnn_out = cnn_model(x).view(batch_size, -1)
        out_classifier = cls_model(cnn_out)

        y = Variable(torch.from_numpy(np.array(y_batch))).type(torch.FloatTensor)

        _, predicted_index = torch.topk(out_classifier, 1)
        # view(-1) rather than squeeze(): the prediction stays 1-D even when
        # batch_size is 1, so it always lines up with y.
        predicted_index = predicted_index.type(torch.FloatTensor).view(-1)

        # Tensor-side sum converted to a plain int, instead of iterating the
        # tensor with the builtin sum().
        corrects += int(torch.eq(predicted_index, y).data.sum())
        num_test_samples += batch_size

    acc = corrects / num_test_samples
    return acc

In [66]:
torch.manual_seed(0)
from torch import cuda

# Net applies an un-padded 3x3 convolution followed by 2x2 max-pooling, so a
# pad_len x pad_len input leaves (pad_len - 3 + 1) // 2 values per side
# (350 for the pad_len of 703 that was hard-coded here before).
pad_len = int(max_len)
feature_side = (pad_len - 3 + 1) // 2

# Only the first cross-validation split is used.
for train, test in datasets[:1]:
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]

    # define model
    cnn = Net()
    classifier = Classifier(feature_side * feature_side)
    if cuda.is_available():
        cnn = cnn.cuda()
        classifier = classifier.cuda()

    optimizer = torch.optim.SGD(list(cnn.parameters())+list(classifier.parameters()), lr=0.01)

    #train the models
    num_epochs = 200
    for epoch in range(num_epochs):
        epoch_loss = train_model(X=x_train,Y=y_train,
                                 batch_size=25, max_len=pad_len,
                                 cnn_model=cnn, cls_model=classifier,
                                 optimizer = optimizer)
        if epoch % 10 == 0:
            print("loss: %f" % epoch_loss[0])
            print("acc_test= %f" % evaluate(x_test,y_test,cnn,classifier,1))
    print("acc_train= %f" % evaluate(x_train,y_train,cnn,classifier,1))

loss: 0.732673
acc_test= 0.493000
loss: 0.693210
acc_test= 0.497000
loss: 0.693201
acc_test= 0.497000


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/mesgarmn/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/home/mesgarmn/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
    return f(*args, **kwargs)
  File "/home/mesgarmn/anaconda2/lib/python2.7/site-packages/IPython/core/ultratb.py", line 358, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/home/mesgarmn/anaconda2/lib/python2.7/inspect.py", line 1051, in getinnerframes
    framelist.append((tb.tb_frame,) + getframeinfo(tb, context))
  File "/home/mesgarmn/anaconda2/lib/python2.7/inspect.py", line 1011, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/home/mesgarmn/anaconda2/lib/python2.7/inspect.py", line 453, in getsourcefile
    if hasattr(getmodule(object, filename), '__loader

IndexError: string index out of range

In [62]:
# Re-score the held-out split one sample at a time, using the models left in
# the notebook namespace by the training cell.
print "acc_test= %f"%evaluate(x_test,y_test,cnn,classifier,1)

torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 491401])
torch.Size([1, 4

KeyboardInterrupt: 

In [None]:
###############################################################################
##  Here we go step by step over our experiments on translationese and original
###############################################################################
def hansard_experiment(num_nodes):
    """Count ``num_nodes``-node patterns in the original and translated graph sets.

    For each of ``./orig_graph_set.g`` and ``./trans_graph_set.g`` the patterns
    are counted with ``count_subgraphs`` and arranged into a graphs-by-patterns
    matrix restricted to weakly connected patterns.

    Args:
        num_nodes: pattern size; used as both minimum and maximum size.

    Returns:
        One tuple ``(graph_set_file, pc, can_map, all_count_matrices)`` per
        graph set, where ``all_count_matrices`` maps pattern size to a
        ``count_matrix``.  Row ``r`` of each matrix belongs to
        ``graph_ids[r]``, with graph ids in ascending order.
    """
    min_pattern_size = num_nodes
    max_pattern_size = num_nodes
    sample_size = 2000 # number of samples

    print("min_pattern_size: %d" % min_pattern_size)
    print("max_pattern_size: %d" % max_pattern_size)
    print("sample_size: %d" % sample_size)

    can_map_file = "./canonical_map/can_map_maxk6.p"
    himap_file = "./canonical_map/himap_maxk6.p"

    # NOTE(review): the same count file is used for both graph sets, so the
    # second set's counts overwrite the first's on disk -- confirm intent.
    subgraph_count_file = "./count_orig_graph_set"+"_min:"+ str(min_pattern_size)+"_max:"+str(max_pattern_size)

    print("loading can_map and hi_map: %s %s" % (can_map_file, himap_file))
    # The weight map is loaded together with the canonical map but not used here.
    can_map, weight_map = get_maps(can_map_file, himap_file)

    output = []
    for graph_set_file in ["./orig_graph_set.g","./trans_graph_set.g"]:
        print("processing: %s " % graph_set_file)

        pc = count_subgraphs(graph_set_file,
                             min_pattern_size, max_pattern_size,
                             sample_size,
                             can_map,
                             subgraph_count_file)

        print("pattern counting is done.")

        all_count_matrices = {}
        print("computing the count matrices ...")

        # Rows follow the sorted graph ids.  For ids 0..N-1 this is exactly the
        # original `row = key`; other id schemes no longer index out of range
        # or drift away from `graph_ids`.
        graph_ids = sorted(pc.keys())
        row_of = {gid: row for row, gid in enumerate(graph_ids)}
        # Connected-pattern counts do not depend on the pattern size.
        connected_counts = {gid: get_count_of_connected_patterns_of_a_graph(pc[gid], can_map)
                            for gid in graph_ids}

        for pattern_size in range(min_pattern_size, max_pattern_size+1):
            connected_patterns_idx = list(k_node_graphs(can_map, pattern_size))
            # Column lookup table instead of repeated list scans.
            col_of = {pid: col for col, pid in enumerate(connected_patterns_idx)}
            cnt_matrix = np.zeros((len(graph_ids), len(connected_patterns_idx)))
            for gid in graph_ids:
                for (pattern_id, value) in connected_counts[gid]:
                    col = col_of.get(pattern_id)
                    if col is None:
                        # connected, but not a pattern of this size
                        continue
                    cnt_matrix[row_of[gid], col] = value
            all_count_matrices[pattern_size] = count_matrix(
                pattern_size, connected_patterns_idx, graph_ids, cnt_matrix)
        print("all connected patterns are counted")
        output.append((graph_set_file, pc, can_map, all_count_matrices))

    return output

In [None]:
###############################################################################
##  Here we go step by step over our experiments on translationese and original
###############################################################################
def hansard_baseline(num_nodes):
    """Count ``num_nodes``-node patterns in the LCG baseline graph sets.

    Same procedure as ``hansard_experiment`` but reading
    ``./orig_lcg_graph_set.g`` and ``./trans_lcg_graph_set.g``.

    Args:
        num_nodes: pattern size; used as both minimum and maximum size.

    Returns:
        One tuple ``(graph_set_file, pc, can_map, all_count_matrices)`` per
        graph set, where ``all_count_matrices`` maps pattern size to a
        ``count_matrix``.  Row ``r`` of each matrix belongs to
        ``graph_ids[r]``, with graph ids in ascending order.
    """
    min_pattern_size = num_nodes
    max_pattern_size = num_nodes
    sample_size = 2000 # number of samples

    print("min_pattern_size: %d" % min_pattern_size)
    print("max_pattern_size: %d" % max_pattern_size)
    print("sample_size: %d" % sample_size)

    can_map_file = "./canonical_map/can_map_maxk6.p"
    himap_file = "./canonical_map/himap_maxk6.p"

    # NOTE(review): the same count file is used for both graph sets, so the
    # second set's counts overwrite the first's on disk -- confirm intent.
    subgraph_count_file = "./count_orig_graph_set"+"_min:"+ str(min_pattern_size)+"_max:"+str(max_pattern_size)

    print("loading can_map and hi_map: %s %s" % (can_map_file, himap_file))
    # The weight map is loaded together with the canonical map but not used here.
    can_map, weight_map = get_maps(can_map_file, himap_file)

    output = []
    for graph_set_file in ["./orig_lcg_graph_set.g","./trans_lcg_graph_set.g"]:
        print("processing: %s " % graph_set_file)

        pc = count_subgraphs(graph_set_file,
                             min_pattern_size, max_pattern_size,
                             sample_size,
                             can_map,
                             subgraph_count_file)

        print("pattern counting is done.")

        all_count_matrices = {}
        print("computing the count matrices ...")

        # Rows follow the sorted graph ids.  For ids 0..N-1 this is exactly the
        # original `row = key`; other id schemes no longer index out of range
        # or drift away from `graph_ids`.
        graph_ids = sorted(pc.keys())
        row_of = {gid: row for row, gid in enumerate(graph_ids)}
        # Connected-pattern counts do not depend on the pattern size.
        connected_counts = {gid: get_count_of_connected_patterns_of_a_graph(pc[gid], can_map)
                            for gid in graph_ids}

        for pattern_size in range(min_pattern_size, max_pattern_size+1):
            connected_patterns_idx = list(k_node_graphs(can_map, pattern_size))
            # Column lookup table instead of repeated list scans.
            col_of = {pid: col for col, pid in enumerate(connected_patterns_idx)}
            cnt_matrix = np.zeros((len(graph_ids), len(connected_patterns_idx)))
            for gid in graph_ids:
                for (pattern_id, value) in connected_counts[gid]:
                    col = col_of.get(pattern_id)
                    if col is None:
                        # connected, but not a pattern of this size
                        continue
                    cnt_matrix[row_of[gid], col] = value
            all_count_matrices[pattern_size] = count_matrix(
                pattern_size, connected_patterns_idx, graph_ids, cnt_matrix)
        print("all connected patterns are counted")
        output.append((graph_set_file, pc, can_map, all_count_matrices))

    return output

In [None]:
# Leftover inspection cell: evaluating the bare name only displays the
# function object; nothing is computed.
count_subgraphs

In [None]:
### LCG count 3-node
# Recompute the 3-node pattern counts for the LCG graph sets.
output = hansard_baseline(num_nodes=3)
# NOTE: the dump below is disabled, so this run is not saved; the next cell
# reads whatever pickle an earlier run left on disk.
# with open('./final_pattern_count_3_lcg.pkl','wb') as h:
#     cpickle.dump(output,h)

In [None]:
#load 3-node count
# Opened in binary mode: pickle data is bytes, and text mode only happens to
# work for some protocol/platform combinations.
# NOTE: unpickling runs code embedded in the file; only load pickles that
# this notebook produced.
with open('./final_pattern_count_3_lcg.pkl','rb') as h:
    output = cpickle.load(h)

In [None]:
# Each entry of `output` is (graph_set_file, pc, can_map, all_count_matrices);
# entry 0 is the original set, entry 1 the translated set.
pc_original, can_map_original, all_count_matrices_original = output[0][1:4]

pc_trans, can_map_trans, all_count_matrices_trans = output[1][1:4]

In [None]:
# count_matrix object holding the 3-node pattern counts of the original set.
three_nodes_original = all_count_matrices_original[3]
#three_nodes_original.display_patterns(can_map_original)

In [None]:
# Select the entry stored under key 3 (the 3-node patterns).
three_nodes_trans = all_count_matrices_trans[3]
#three_nodes_trans.display_patterns(can_map_trans)

In [None]:
#x_original = softmax(three_nodes_original.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_original = three_nodes_original.count_matrix

In [None]:
#x_trans = softmax(three_nodes_trans.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_trans = three_nodes_trans.count_matrix

In [None]:
# Stack both feature matrices row-wise: rows of x_original first, then rows of x_trans.
x = np.concatenate((x_original, x_trans), axis=0)
# Labels follow the same row order (1 for x_original rows, 0 for x_trans rows). The
# counts are taken from the matrices instead of a hard-coded 5000 so that y always
# has exactly one label per row of x.
y = [1]*len(x_original) + [0]*len(x_trans)

In [None]:
from sklearn.utils import shuffle
# x and y are shuffled together with a fixed seed. Both names are overwritten, so
# re-running this cell shuffles the already-shuffled data again.
x,  y = shuffle(x,  y, random_state=0)

In [None]:
# Row-normalise: divide every row of x by its own sum.
s = x.sum(1,keepdims=True)*1.0
# A row whose counts are all zero has sum 0; dividing by it would fill that row with
# NaN. Dividing by 1 instead leaves such a row as all zeros.
s[s == 0] = 1.0
x = x/ s

In [None]:
# import matplotlib.pyplot as plt
# from matplotlib import figure
# from sklearn.manifold import TSNE

# X_tsne = TSNE(learning_rate=100).fit_transform(x)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
# plt.show()

In [None]:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
import sklearn
from sklearn import svm, datasets
# Import the estimator explicitly: `import sklearn` alone does not load the
# linear_model submodule, so `sklearn.linear_model.<name>` only resolved as a side
# effect of the other imports.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_predict

# The same RandomState instance is handed to the fold splitter and to the classifier.
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

classifier = LogisticRegression(random_state=random_state)

mean_accs = []
predictions_3node_LR = []
# Five repetitions of 10-fold cross-validated prediction. The accuracy of each
# repetition is printed and the predictions of all repetitions are pooled in one list.
for i in range(5):
    predicted = cross_val_predict(classifier, x, y , cv=cv.split(x, y))
    acc = accuracy_score(y, predicted)
    mean_accs.append(acc)
    # Single-argument print(...) gives the same output under Python 2 and Python 3.
    print("%s %s" % (i, acc))
    predictions_3node_LR+= list(predicted)
print("acc 5-CV = %.2f%%"%(100*np.mean(mean_accs)))

In [None]:
## compute 4-node count
output = hansard_experiment(num_nodes=4)
# Cache the result on disk; the following cell reads this file back.
with open('./final_pattern_count_4_lcg.pkl','wb') as h:
     cpickle.dump(output,h)

In [None]:
# Load the cached 4-node counts. The file is written with 'wb', so it is read back in
# 'rb' mode (text mode can corrupt the stream on platforms that translate newlines).
# NOTE: unpickling executes code from the file; only load pickles produced locally.
with open('./final_pattern_count_4_lcg.pkl','rb') as h:
    output = cpickle.load(h)

In [None]:
# Positions 1..3 of each entry in `output` are unpacked; entry 0 feeds the *_original
# names and entry 1 the *_trans names.
pc_original, can_map_original, all_count_matrices_original = output[0][1:4]

pc_trans, can_map_trans, all_count_matrices_trans = output[1][1:4]

In [None]:
# Select the entry stored under key 4 (the 4-node patterns).
four_nodes_original = all_count_matrices_original[4]
#four_nodes_original.display_patterns(can_map_original)

In [None]:
# Select the entry stored under key 4 (the 4-node patterns).
four_nodes_trans = all_count_matrices_trans[4]
#four_nodes_trans.display_patterns(can_map_trans)

In [None]:
#x_original = softmax(four_nodes_original.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_original = four_nodes_original.count_matrix

In [None]:
#x_trans = softmax(four_nodes_trans.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_trans = four_nodes_trans.count_matrix

In [None]:
# Stack both feature matrices row-wise: rows of x_original first, then rows of x_trans.
x = np.concatenate((x_original, x_trans), axis=0)
# Labels follow the same row order (1 for x_original rows, 0 for x_trans rows). The
# counts are taken from the matrices instead of a hard-coded 5000 so that y always
# has exactly one label per row of x.
y = [1]*len(x_original) + [0]*len(x_trans)

In [None]:
from sklearn.utils import shuffle
# x and y are shuffled together with a fixed seed. Both names are overwritten, so
# re-running this cell shuffles the already-shuffled data again.
x,  y = shuffle(x,  y, random_state=0)

In [None]:
# Row-normalise: divide every row of x by its own sum.
s = x.sum(1,keepdims=True)*1.0
# A row whose counts are all zero has sum 0; dividing by it would fill that row with
# NaN. Dividing by 1 instead leaves such a row as all zeros.
s[s == 0] = 1.0
x = x/ s

In [None]:
# import matplotlib.pyplot as plt
# from matplotlib import figure
# from sklearn.manifold import TSNE

# X_tsne = TSNE(learning_rate=100).fit_transform(x)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
# plt.show()

In [None]:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
import sklearn
from sklearn import svm, datasets
# Import the estimator explicitly: `import sklearn` alone does not load the
# linear_model submodule, so `sklearn.linear_model.<name>` only resolved as a side
# effect of the other imports.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_predict

# The same RandomState instance is handed to the fold splitter and to the classifier.
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

classifier = LogisticRegression(random_state=random_state)

mean_accs = []
predictions_4node_LR = []
# Five repetitions of 10-fold cross-validated prediction. The accuracy of each
# repetition is printed and the predictions of all repetitions are pooled in one list.
for i in range(5):
    predicted = cross_val_predict(classifier, x, y , cv=cv.split(x, y))
    acc = accuracy_score(y, predicted)
    mean_accs.append(acc)
    # Single-argument print(...) gives the same output under Python 2 and Python 3.
    print("%s %s" % (i, acc))
    predictions_4node_LR += list(predicted)
print("acc 5-CV = %.2f%%"%(100*np.mean(mean_accs)))

In [None]:
# NOTE(review): this cell repeats the shuffle / normalise / cross-validation steps on
# whatever `x`, `y` are currently defined and stores the predictions under the name
# `predictions_3node_LR` -- it looks like a merged leftover; confirm it is still needed.
from sklearn.utils import shuffle
x,  y = shuffle(x,  y, random_state=0)

# Row-normalise: divide every row of x by its own sum.
s = x.sum(1,keepdims=True)*1.0
# A row whose counts are all zero has sum 0; dividing by it would fill that row with
# NaN. Dividing by 1 instead leaves such a row as all zeros.
s[s == 0] = 1.0
x = x/ s

# import matplotlib.pyplot as plt
# from matplotlib import figure
# from sklearn.manifold import TSNE

# X_tsne = TSNE(learning_rate=100).fit_transform(x)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
# plt.show()

import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
import sklearn
from sklearn import svm, datasets
# Import the estimator explicitly: `import sklearn` alone does not load the
# linear_model submodule, so `sklearn.linear_model.<name>` only resolved as a side
# effect of the other imports.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_predict

# The same RandomState instance is handed to the fold splitter and to the classifier.
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

classifier = LogisticRegression(random_state=random_state)

mean_accs = []
predictions_3node_LR = []
# Five repetitions of 10-fold cross-validated prediction; predictions are pooled.
for i in range(5):
    predicted = cross_val_predict(classifier, x, y , cv=cv.split(x, y))
    acc = accuracy_score(y, predicted)
    mean_accs.append(acc)
    # Single-argument print(...) gives the same output under Python 2 and Python 3.
    print("%s %s" % (i, acc))
    predictions_3node_LR+= list(predicted)
print("acc 5-CV = %.2f%%"%(100*np.mean(mean_accs)))

In [None]:
   
###############################################################################
##                               Main
###############################################################################
import multiprocessing
from multiprocessing import Process

#output = hansard_experiment(num_nodes=3)
#with open('./final_pattern_count_3.pkl','wb') as h:
#    cpickle.dump(output,h)

In [None]:
# Load the cached 3-node counts. Pickle files are binary data, so the file is opened
# in 'rb' mode (text mode can corrupt the stream on platforms that translate newlines).
# NOTE: unpickling executes code from the file; only load pickles produced locally.
with open('./final_pattern_count_3.pkl','rb') as h:
    output = cpickle.load(h)

In [None]:
# Positions 1..3 of each entry in `output` are unpacked; entry 0 feeds the *_original
# names and entry 1 the *_trans names.
pc_original, can_map_original, all_count_matrices_original = output[0][1:4]

pc_trans, can_map_trans, all_count_matrices_trans = output[1][1:4]


In [None]:
# Select the entry stored under key 3 (the 3-node patterns).
three_nodes_original = all_count_matrices_original[3]
#three_nodes_original.display_patterns(can_map_original)

In [None]:
# Select the entry stored under key 3 (the 3-node patterns).
three_nodes_trans = all_count_matrices_trans[3]
#three_nodes_trans.display_patterns(can_map_trans)

In [None]:
#x_original = softmax(three_nodes_original.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_original = three_nodes_original.count_matrix

In [None]:
#x_trans = softmax(three_nodes_trans.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_trans = three_nodes_trans.count_matrix

In [None]:
# Stack both feature matrices row-wise: rows of x_original first, then rows of x_trans.
x = np.concatenate((x_original, x_trans), axis=0)
# Labels follow the same row order (1 for x_original rows, 0 for x_trans rows). The
# counts are taken from the matrices instead of a hard-coded 5000 so that y always
# has exactly one label per row of x.
y = [1]*len(x_original) + [0]*len(x_trans)

In [None]:
from sklearn.utils import shuffle
# x and y are shuffled together with a fixed seed. Both names are overwritten, so
# re-running this cell shuffles the already-shuffled data again.
x,  y = shuffle(x,  y, random_state=0)

In [None]:
# Row-normalise: divide every row of x by its own sum.
s = x.sum(1,keepdims=True)*1.0
# A row whose counts are all zero has sum 0; dividing by it would fill that row with
# NaN. Dividing by 1 instead leaves such a row as all zeros.
s[s == 0] = 1.0
x = x/ s

In [None]:
# import matplotlib.pyplot as plt
# from matplotlib import figure
# from sklearn.manifold import TSNE

# X_tsne = TSNE(learning_rate=100).fit_transform(x)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
# plt.show()

In [None]:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
import sklearn
from sklearn import svm, datasets
# Import the estimator explicitly: `import sklearn` alone does not load the
# linear_model submodule, so `sklearn.linear_model.<name>` only resolved as a side
# effect of the other imports.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_predict

# The same RandomState instance is handed to the fold splitter and to the classifier.
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

classifier = LogisticRegression(random_state=random_state)

mean_accs = []
predictions_3node_LR = []
# Five repetitions of 10-fold cross-validated prediction. The accuracy of each
# repetition is printed and the predictions of all repetitions are pooled in one list.
for i in range(5):
    predicted = cross_val_predict(classifier, x, y , cv=cv.split(x, y))
    acc = accuracy_score(y, predicted)
    mean_accs.append(acc)
    # Single-argument print(...) gives the same output under Python 2 and Python 3.
    print("%s %s" % (i, acc))
    predictions_3node_LR+= list(predicted)
print("acc 5-CV = %.2f%%"%(100*np.mean(mean_accs)))

In [None]:
# Majority-class baseline: DummyClassifier with strategy='most_frequent'
import matplotlib.pyplot as plt
import numpy as np
from itertools import cycle
from scipy import interp

from sklearn.dummy import DummyClassifier as baseline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

# The same RandomState instance is handed to the fold splitter and to the baseline.
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
classifier = baseline(strategy='most_frequent', random_state=random_state)

mean_accs = []
predictions_majority = []

# Five repetitions of 10-fold cross-validated prediction; predictions are pooled.
for i in range(5):
    predicted = cross_val_predict(classifier, x, y, cv=cv.split(x, y))
    acc = accuracy_score(y, predicted)
    mean_accs.append(acc)
    predictions_majority.extend(predicted)
    print("%s %s" % (i, acc))
print("acc 5-Majority = %.2f%%" % (100*np.mean(mean_accs)))

In [None]:
from scipy.stats import ttest_ind as ttest
import numpy as np
def significant_test(predictions_1, predictions_2):
    """Run an independent two-sample t-test on two prediction vectors.

    Prints a one-line verdict at the 0.01 / 0.05 levels and returns the p-value.

    Parameters
    ----------
    predictions_1, predictions_2 : array-like
        Values to compare; each is converted with ``np.asarray`` and both are
        passed to ``ttest`` (``scipy.stats.ttest_ind``).

    Returns
    -------
    p_value
        The p-value reported by the t-test. It is NaN when the test is
        undefined (for example when both inputs are constant and equal).

    NOTE(review): the test is applied to the predicted values themselves, as
    independent samples; it does not look at the true labels. Confirm that this
    is the intended comparison.
    """
    predictions_1 = np.asarray(predictions_1)
    predictions_2 = np.asarray(predictions_2)
    _, p_value = ttest(predictions_1, predictions_2)
    if np.isnan(p_value):
        # NaN compares False with every threshold; report it explicitly instead of
        # letting it fall through silently.
        print("NOT significant (p_value is NaN: test undefined)")
    elif p_value < 0.01:
        print("significant with p_value < 0.01")
    elif p_value < 0.05:
        print("significant with p_value < 0.05")
    else:
        print("NOT significant")
    return p_value
# Compare the pooled majority-baseline predictions with the pooled 3-node LR predictions.
significant_test(predictions_majority,predictions_3node_LR)

In [None]:
## compute 4-node count
output = hansard_experiment(num_nodes=4)
# Caching is disabled here, so the result above is not written to disk by this cell.
# with open('./final_pattern_count_4_4000.pkl','wb') as h:
#     cpickle.dump(output,h)

In [None]:
# Load the cached 4-node counts. Pickle files are binary data, so the file is opened
# in 'rb' mode (text mode can corrupt the stream on platforms that translate newlines).
# NOTE: unpickling executes code from the file; only load pickles produced locally.
with open('./final_pattern_count_4_4000.pkl','rb') as h:
    output = cpickle.load(h)

In [None]:
# Positions 1..3 of each entry in `output` are unpacked; entry 0 feeds the *_original
# names and entry 1 the *_trans names.
pc_original, can_map_original, all_count_matrices_original = output[0][1:4]

pc_trans, can_map_trans, all_count_matrices_trans = output[1][1:4]

In [None]:
# Select the entry stored under key 4 (the 4-node patterns).
four_nodes_original = all_count_matrices_original[4]
#four_nodes_original.display_patterns(can_map_original)

In [None]:
# Select the entry stored under key 4 (the 4-node patterns).
four_nodes_trans = all_count_matrices_trans[4]
#four_nodes_trans.display_patterns(can_map_trans)

In [None]:
#x_original = softmax(four_nodes_original.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_original = four_nodes_original.count_matrix

In [None]:
#x_trans = softmax(four_nodes_trans.count_matrix)
# The raw count matrix is used as features; the softmax variant above is disabled.
x_trans = four_nodes_trans.count_matrix

In [None]:
# Stack both feature matrices row-wise: rows of x_original first, then rows of x_trans.
x = np.concatenate((x_original, x_trans), axis=0)
# Labels follow the same row order (1 for x_original rows, 0 for x_trans rows). The
# counts are taken from the matrices instead of a hard-coded 5000 so that y always
# has exactly one label per row of x.
y = [1]*len(x_original) + [0]*len(x_trans)

In [None]:
from sklearn.utils import shuffle
# x and y are shuffled together with a fixed seed. Both names are overwritten, so
# re-running this cell shuffles the already-shuffled data again.
x,  y = shuffle(x,  y, random_state=0)

In [None]:
# Row-normalise: divide every row of x by its own sum.
s = x.sum(1,keepdims=True)*1.0
# A row whose counts are all zero has sum 0; dividing by it would fill that row with
# NaN. Dividing by 1 instead leaves such a row as all zeros.
s[s == 0] = 1.0
x = x/ s

In [None]:
# import matplotlib.pyplot as plt
# from matplotlib import figure
# from sklearn.manifold import TSNE

# X_tsne = TSNE(learning_rate=100).fit_transform(x)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
# plt.show()

In [None]:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
import sklearn
from sklearn import svm, datasets
# Import the estimator explicitly: `import sklearn` alone does not load the
# linear_model submodule, so `sklearn.linear_model.<name>` only resolved as a side
# effect of the other imports.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_predict

# The same RandomState instance is handed to the fold splitter and to the classifier.
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

classifier = LogisticRegression(random_state=random_state)

mean_accs = []
predictions_4node_LR = []
# Five repetitions of 10-fold cross-validated prediction. The accuracy of each
# repetition is printed and the predictions of all repetitions are pooled in one list.
for i in range(5):
    predicted = cross_val_predict(classifier, x, y , cv=cv.split(x, y))
    acc = accuracy_score(y, predicted)
    mean_accs.append(acc)
    # Single-argument print(...) gives the same output under Python 2 and Python 3.
    print("%s %s" % (i, acc))
    predictions_4node_LR += list(predicted)
print("acc 5-CV = %.2f%%"%(100*np.mean(mean_accs)))

In [None]:
# Majority-class baseline: DummyClassifier with strategy='most_frequent'
import matplotlib.pyplot as plt
import numpy as np
from itertools import cycle
from scipy import interp

from sklearn.dummy import DummyClassifier as baseline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

# The same RandomState instance is handed to the fold splitter and to the baseline.
random_state = np.random.RandomState(0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
classifier = baseline(strategy='most_frequent', random_state=random_state)

mean_accs = []
predictions_majority = []

# Five repetitions of 10-fold cross-validated prediction; predictions are pooled.
for i in range(5):
    predicted = cross_val_predict(classifier, x, y, cv=cv.split(x, y))
    acc = accuracy_score(y, predicted)
    mean_accs.append(acc)
    predictions_majority.extend(predicted)
    print("%s %s" % (i, acc))
print("acc 5-Majority = %.2f%%" % (100*np.mean(mean_accs)))

In [None]:
# Compare the pooled 3-node and 4-node logistic-regression predictions.
significant_test(predictions_3node_LR,predictions_4node_LR)