In [2]:
import networkx as nx
import csv
import numpy as np
from sklearn.utils import shuffle
import tensorflow as tf


In [3]:
G=nx.read_edgelist('datasets/edgelist.txt', delimiter=',',create_using=nx.Graph(),nodetype=int)
nodes=list(G.nodes())
n=G.number_of_nodes()
m=G.number_of_edges()

print('Number of nodes:', n)

print('Number of edges:', m)


Number of nodes: 138499
Number of edges: 1091955


In [4]:
abstracts=dict()
with open('datasets/abstracts.txt', 'r',encoding="utf-8") as f:
    for line in f:
        node,abstract=line.split('|--|')
        abstracts[int(node)]=abstract

In [5]:

def initialize_x_train(number_of_edges,list_of_features,mypath):
    mypath=mypath
    number_of_features=len(list_of_features)
    print("number_of_edges:",number_of_edges)
    #mul by 2 for the training matrix
    x=np.zeros((2*number_of_edges,number_of_features))
    for idx,feature in enumerate(list_of_features):
        print("loading column {} with feature {}".format(idx,feature))
        x[:,idx]=np.genfromtxt(mypath+feature,delimiter=',')
    return x

In [6]:
def initialize_x_test(number_of_edges,list_of_features,mypath):
    mypath=mypath
    number_of_features=len(list_of_features)
    print("number_of_edges:",number_of_edges)
    #mul by 1 for the training matrix
    x=np.zeros((number_of_edges,number_of_features))
    for idx,feature in enumerate(list_of_features):
        print("loading column {} with feature {}".format(idx,feature))
        x[:,idx]=np.genfromtxt(mypath+feature,delimiter=',')
    return x

In [7]:
from os import listdir
from os.path import isfile,join
def get_feature_files(mypath):
    mypath=mypath
    features_to_include=[feature for feature in listdir(mypath)]
    print("included features are: ",features_to_include)
    return features_to_include
    

In [9]:
mypath='datasets/features_train/'
X_train=initialize_x_train(m,get_feature_files(mypath),mypath)

included features are:  ['adamic_adar_index.csv', 'common_elements_in_abstracts.csv', 'difference_of_abstracts_len.csv', 'jaccard_coef.csv', 'preferential_attachment.csv', 'resource_allocation_index.csv', 'sum_of_abstracts_len.csv']
number_of_edges: 1091955
loading column 0 with feature adamic_adar_index.csv
loading column 1 with feature common_elements_in_abstracts.csv
loading column 2 with feature difference_of_abstracts_len.csv
loading column 3 with feature jaccard_coef.csv
loading column 4 with feature preferential_attachment.csv
loading column 5 with feature resource_allocation_index.csv
loading column 6 with feature sum_of_abstracts_len.csv


In [10]:
y_train=np.genfromtxt('datasets/Y_train.csv',delimiter=',')

In [11]:
X_train_m,y_train_m=shuffle(X_train,y_train)

In [12]:
node_pairs = list()
with open('datasets/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        #the raw split form is ['12223', '345332\n']
        #we need to make them integers
        #use the function int()
        node_pairs.append((int(t[0]), int(t[1])))

In [13]:
mypath='datasets/features_test/'
X_test=initialize_x_test(len(node_pairs),get_feature_files(mypath),mypath)

included features are:  ['adamic_adar_index_test.csv', 'common_elements_in_abstracts_test.csv', 'difference_of_abstracts_test.csv', 'jaccard_coefficient_test.csv', 'preferential_attachment_test.csv', 'resource_allocation_index_test.csv', 'sum_of_abstracts_len_test.csv']
number_of_edges: 106692
loading column 0 with feature adamic_adar_index_test.csv
loading column 1 with feature common_elements_in_abstracts_test.csv
loading column 2 with feature difference_of_abstracts_test.csv
loading column 3 with feature jaccard_coefficient_test.csv
loading column 4 with feature preferential_attachment_test.csv
loading column 5 with feature resource_allocation_index_test.csv
loading column 6 with feature sum_of_abstracts_len_test.csv


In [14]:
from keras import models,layers

model=models.Sequential()
model.add(layers.Dense(32,activation='relu',input_shape=(7,)))
model.add(layers.Dense(32,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))

In [15]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [16]:
history=model.fit(X_train,y_train,epochs=5,batch_size=512)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [51]:
y_pred=model.predict(X_test)
print(y_pred[:20,0])

[9.09372866e-01 1.09039094e-04 8.42505813e-01 4.29038167e-01
 2.31334865e-02 6.76827610e-01 4.02840257e-01 4.35677916e-01
 3.20359230e-01 8.95888805e-01 4.05323595e-01 5.93736649e-01
 7.79803991e-01 2.65132725e-01 4.59791809e-01 4.54013556e-01
 5.31662881e-01 4.68701720e-01 2.13773966e-01 3.41799855e-02]


In [52]:
import csv
predictions=list(zip(range(len(y_pred)),y_pred[:,0]))
with open("submission_keras1noscale.csv", 'w') as outfile:
    csv_out=csv.writer(outfile)
    csv_out.writerow(['id','predicted'])

    for row in predictions: 
        csv_out.writerow(row)

In [1]:
import tensorflow_hub as hub
import tensorflow_text as text
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
text_input=tf.keras.layers.Input(shape=(),dtype=tf.string,name='text')
preprocessed_text=bert_preprocess

In [4]:
text_test=['this is a good movie']
text_preprocessed=bert_preprocess(text_test)
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_type_ids', 'input_mask', 'input_word_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 1037 2204 3185  102    0    0    0    0    0]
Input Mask : [1 1 1 1 1 1 1 0 0 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [13]:
bert_model=tf_hub.KerasLayer(bert_encoder)
bert_results=bert_model(text_preprocessed)
print(f'Loaded BERT: {bert_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: <tensorflow_hub.keras_layer.KerasLayer object at 0x00000101EB7CC670>
Pooled Outputs Shape:(1, 768)
Pooled Outputs Values:[-0.8787369  -0.28986096  0.21888238  0.6509246  -0.05978949 -0.08652615
  0.8549417   0.19872479 -0.0367921  -0.9998593  -0.00700509  0.4182273 ]
Sequence Outputs Shape:(1, 128, 768)
Sequence Outputs Values:[[ 0.05524094  0.09279849  0.18011808 ... -0.20717065  0.25018063
   0.10174131]
 [-0.5319642  -0.415677    0.28602126 ... -1.082092    1.0272561
   0.15845591]
 [ 0.06295532 -0.500933    0.52670294 ... -0.5936099   0.5590759
   0.6334749 ]
 ...
 [ 0.02546454 -0.42473173  0.4675644  ...  0.20326579  0.40612566
  -0.18301186]
 [ 0.07985108 -0.46011198  0.4119412  ...  0.16304101  0.42579618
  -0.37503162]
 [ 0.08701746 -0.43609232  0.4363102  ...  0.20113279  0.4420772
  -0.23319733]]


In [20]:
def build_classifier_model():
    text_input=tf.keras.layers.Input(shape=(),dtype=tf.string,name='text')#our abstract
    preprocessing_layer=tf_hub.KerasLayer(bert_preprocess,name='preprocessing')
    encoder_inputs=preprocessing_layer(text_input)
    encoder=tf_hub.KerasLayer(bert_encoder,trainable=True,name='BERT_encoder')
    outputs=encoder(encoder_inputs)
    net=outputs['pooled_output']
    net=tf.keras.layers.Dropout(0.1)(net)
    net=tf.keras.layers.Dense(1,activation=None,name='classifier')(net)
    return tf.keras.Model(text_input,net)

In [21]:
classifier=build_classifier_model()

ERROR:absl:hub.KerasLayer is trainable but has zero trainable weights.


In [22]:
bert_raw_result=classifier(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

tf.Tensor([[0.73604375]], shape=(1, 1), dtype=float32)


In [24]:
from pytorch_pretrained_bert import BertModel   
from pytorch_pretrained_bert import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

100%|██████████| 231508/231508 [00:00<00:00, 652928.44B/s]


In [None]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))
