1. Tokenize our data using Google's BERT tokenizer

In [2]:
# Classic imports

# !pip install bert-for-tf2
# !pip install sentencepiece

import pandas as pd
import bert
import re
import numpy as np
import random
import math

In [None]:
# Only if using Colab: mount Google Drive so the project files are reachable.
# The import is guarded so that "Restart & Run All" also works outside Colab,
# where the google.colab package does not exist.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: nothing to mount
else:
    drive.mount('/content/drive')

In [2]:
# The absolute path to the directory containing all project files.
# NOTE(review): machine-specific location. Every file name used in this notebook is
# appended to this string with "+", so it must end with a "/" and has to be edited
# before running on another machine.
path = "C:/Users/Marie/Organisation_Marie/X/3A/INF 554 - Machine Learning/Project/BERT/Clean/" 

In [None]:
# Loading our training dataset into memory; columns are: number, author, concatenation of abstracts, h index.
raw_data = pd.read_csv(path+"Abstracts Dataset.csv")
# Displayed as the cell output: True if at least one cell of the dataset is missing (NaN).
# The result is only shown; nothing below acts on it.
raw_data.isnull().values.any()

In [None]:
# Pre-processing the abstracts: removing special characters, etc.
def preprocess_text(sen):
    """Normalise one (concatenation of) abstract(s) before tokenization.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space;
      2. every isolated single letter (whitespace on both sides) is dropped;
      3. runs of whitespace are collapsed into one space.

    Parameters
    ----------
    sen : str
        Raw text. Any non-string value (e.g. the NaN pandas produces for an
        empty CSV cell) is treated as an empty text.

    Returns
    -------
    str
        The cleaned text ("" for non-string input).
    """
    if not isinstance(sen, str):
        return ""
    sentence = re.sub('[^a-zA-Z]', ' ', sen) # Remove punctuation and numbers
    # Single character removal. The trailing whitespace is only looked at, not
    # consumed, so that it can serve as the leading whitespace of the next match:
    # consuming it would leave every second letter of a run such as "a b c" in place.
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence) # Removing multiple spaces, just in case
    return sentence

# Clean every training abstract; the result is index-aligned with `abstracts`.
abstracts = list(raw_data['abstracts'])
preprocessed_abstracts = [preprocess_text(sen) for sen in abstracts]

In [None]:
# Recovering the h indexes and the authors.
# Both are columns of raw_data, so position i refers to the same row as abstracts[i].
h_indexes = raw_data['hindex']
authors = raw_data['authorID']

In [None]:
# Getting the tokenizer object from the imports.
# tensorflow_hub is imported here because this cell runs before the section-2 import
# cell; without it a fresh kernel fails with NameError on `hub`.
import tensorflow_hub as hub

BertTokenizer = bert.bert_tokenization.FullTokenizer
# The BERT layer is only loaded to obtain its vocabulary file and casing flag.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case) # Creating the tokenizer!

In [None]:
# Tokenizing our abstracts
def tokenize_abstract(abstract):
    """Return the list of BERT vocabulary ids for the word pieces of `abstract`.

    Uses the module-level `tokenizer` created in the previous cell.
    """
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(abstract))

# Tokenize the pre-processed texts: tokenizing the raw `abstracts` would leave the
# cleaning step above without any effect.
tokenized_abstracts = [tokenize_abstract(abstract) for abstract in preprocessed_abstracts]

In [None]:
# Now the same but with the test set, for which we do not have the h indexes
raw_test = pd.read_csv(path+"Abstracts for test.csv")
authors_test = raw_test["authorID"]
abstracts_test = list(raw_test['abstracts'])
preprocessed_abstracts_test = [preprocess_text(sen) for sen in abstracts_test]
# Tokenize the pre-processed texts (not the raw ones) so that the test set goes
# through the same cleaning as intended for the training set.
tokenized_test_abstracts = [tokenize_abstract(abstract) for abstract in preprocessed_abstracts_test]
# Kept as the last expression so the check is actually displayed:
# True if at least one cell of the test dataset is missing (NaN).
raw_test.isnull().values.any()

In [None]:
# Optional: saving the tokenized versions to disk (two CSV files under `path`)
train_columns = ["authorID","tokenizedAbstract","hindex"]
train_rows = [[authors[k],tokenized_abstracts[k],h_indexes[k]] for k in range(len(h_indexes))]
df = pd.DataFrame(train_rows, columns = train_columns)
df[train_columns].to_csv(path+"tokenizedDataset.csv")

test_columns = ["authorID","tokenizedAbstract"]
test_rows = [[authors_test[k],tokenized_test_abstracts[k]] for k in range(len(authors_test))]
df_test = pd.DataFrame(test_rows, columns = test_columns)
df_test[test_columns].to_csv(path+"tokenizedTest.csv")

2. Process our data and use it to fit a neural network using TensorFlow

In [3]:
# Classic imports

# !pip install tensorflow
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers

In [None]:
# First, let's batch our dataset
BATCH_SIZE = 64

# One entry per author: [token ids, h index, number of tokens, author id]
abstracts_with_len = [[abstract, h_indexes[i], len(abstract), authors[i]] for i, abstract in enumerate(tokenized_abstracts)]
# Shuffle first, then sort: list.sort is stable, so texts of equal length end up in random order.
random.shuffle(abstracts_with_len)
abstracts_with_len.sort(key=lambda x: x[2]) # Sorting by length so each batch needs little padding
for_fitting = [(abstract_lab[0], abstract_lab[1]) for abstract_lab in abstracts_with_len] # Keeping what is necessary for the NN fitting
corresponding_authors = [abstract_lab[3] for abstract_lab in abstracts_with_len]
processed_dataset = tf.data.Dataset.from_generator(lambda: for_fitting, output_types=(tf.int32, tf.int32))
# Token sequences are padded to the longest one of their batch; the labels are scalars.
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
# Dataset.shuffle returns a NEW dataset; the result has to be kept, otherwise the batches
# are always fed in increasing-length order. The buffer holds every batch.
batched_dataset = batched_dataset.shuffle(math.ceil(len(for_fitting) / BATCH_SIZE))

In [None]:
# Then, let's design our neural network. Keep in mind that it is designed to return the h index of a tokenized concatenation of abstracts.

class TEXT_MODEL(tf.keras.Model):
    """Text regressor: embedding -> three parallel Conv1D branches -> dense head.

    The three convolutions (kernel sizes 2, 3 and 4) read the same embedded
    sequence; each is reduced by a global max-pooling over the sequence axis,
    the three results are concatenated and passed through a dense layer,
    dropout and a final single-unit ReLU layer.
    """

    def __init__(self,vocabulary_size,embedding_dimensions=128,cnn_filters=50,dnn_units=512,dropout_rate=0.1,training=False,name="text_model"):
        """Create the layers.

        vocabulary_size      -- number of rows of the embedding table
        embedding_dimensions -- size of each token embedding
        cnn_filters          -- number of filters of each convolution branch
        dnn_units            -- width of the hidden dense layer
        dropout_rate         -- rate given to the dropout layer
        training             -- accepted but not used by this constructor
        name                 -- Keras model name
        """
        super().__init__(name=name)
        self.embedding = layers.Embedding(vocabulary_size,embedding_dimensions)
        # Settings shared by the three convolution branches; only the kernel size differs.
        conv_options = dict(filters=cnn_filters, padding="valid", activation="relu")
        self.cnn_layer1 = layers.Conv1D(kernel_size=2, **conv_options)
        self.cnn_layer2 = layers.Conv1D(kernel_size=3, **conv_options)
        self.cnn_layer3 = layers.Conv1D(kernel_size=4, **conv_options)
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.last_dense = layers.Dense(units=1,activation="relu")

    def call(self, inputs, training):
        """Forward pass; `training` is forwarded to the dropout layer."""
        embedded = self.embedding(inputs)
        branches = (self.cnn_layer1, self.cnn_layer2, self.cnn_layer3)
        pooled = [self.pool(branch(embedded)) for branch in branches]
        features = tf.concat(pooled, axis=-1) # (batch_size, 3 * cnn_filters)
        hidden = self.dense_1(features)
        hidden = self.dropout(hidden, training)
        return self.last_dense(hidden)

# Hyper-parameters handed to TEXT_MODEL / fit in the next cell
# (they override the defaults declared in TEXT_MODEL.__init__).
VOCAB_LENGTH = len(tokenizer.vocab) # Size of the tokenizer vocabulary = number of embedding rows
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
DROPOUT_RATE = 0.2 # Between 0 and 1; the higher the value, the lower the risk of overfitting
NB_EPOCHS = 10 # Also used to build the name under which the network is saved / loaded

In [None]:
# Creating an instance of the neural network and fitting it on our dataset
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,embedding_dimensions=EMB_DIM,cnn_filters=CNN_FILTERS,dnn_units=DNN_UNITS,dropout_rate=DROPOUT_RATE)
# Regression set-up: the mean squared error is both the optimised loss and the reported metric.
text_model.compile(loss="mean_squared_error",optimizer="adam",metrics=["mean_squared_error"])
text_model.fit(batched_dataset, epochs=NB_EPOCHS)

In [None]:
# Optional: saving our fitted neural network, for future use.
# Saved under `path` (not the current working directory) so that the optional loading
# cell of section 3, which reads path+"NeuronalNetwork_"+str(NB_EPOCHS), finds it.
text_model.save(path+"NeuronalNetwork_"+str(NB_EPOCHS))

3. Testing our fitted neural network on the test set

In [None]:
# Optional: loading an already trained neural network.
# The directory name is rebuilt from NB_EPOCHS, so that constant must be defined
# (hyper-parameter cell of section 2) before running this cell.
NN_name = "NeuronalNetwork_"+str(NB_EPOCHS)
text_model = tf.keras.models.load_model(path+NN_name)

In [None]:
# We need to batch our test set
BATCH_SIZE = 64
# One entry per test author: [token ids, number of tokens, author id].
# The author ids live in `authors_test` (the name defined when the test set is loaded).
test_abstracts_with_len = [[abstract, len(abstract), authors_test[i]] for i, abstract in enumerate(tokenized_test_abstracts)]
random.shuffle(test_abstracts_with_len)
test_abstracts_with_len.sort(key=lambda x: x[1]) # Sorting by length so each batch needs little padding
for_testing = [abstract_lab[0] for abstract_lab in test_abstracts_with_len] # Keeping what is necessary for the NN testing
# Author of each element of for_testing, in the same (sorted) order as the predictions will come out.
corresponding_test_authors = [abstract_lab[2] for abstract_lab in test_abstracts_with_len]
# The generator yields a single component (the token ids, no label), hence a single output type.
processed_testset = tf.data.Dataset.from_generator(lambda: for_testing, output_types=tf.int32)
# Batch the TEST dataset (not the training `processed_dataset`); sequences are padded per batch.
batched_testset = processed_testset.padded_batch(BATCH_SIZE, padded_shapes=(None, ))

In [None]:
# Using the NN to predict our test set's h indexes.
# The saving cell below pairs res[i] with corresponding_test_authors[i], i.e. it relies on
# the predictions being returned in the order of batched_testset.
res = text_model.predict(batched_testset)

In [None]:
# Optional: saving the results for a later submission.
# Row i pairs corresponding_test_authors[i] with res[i][0] (the single output unit of the network).
# NOTE(review): NN_name is only assigned in the optional model-loading cell of this section;
# run that cell (or define NN_name) before this one.
df_res = pd.DataFrame([[corresponding_test_authors[i],res[i][0]] for i in range(len(corresponding_test_authors))],columns=["author","hindex"])
df_res.loc[:,["author","hindex"]].to_csv(path+"predictions_for_"+NN_name+".csv")

(Optional) 4. Loading a pre-tokenized training set and test set into memory

In [None]:
# Training set
import json  # local import: this optional cell may be run on its own

df_train = pd.read_csv(path+"tokenizedDataset.csv")
train_np = df_train.to_numpy()
# A list column written by to_csv comes back as text ("[101, 2023, ...]"); it has to be
# parsed back into a list of ints, otherwise len() and the batching would work on characters.
tokenized_abstracts = [json.loads(tokens) for tokens in df_train["tokenizedAbstract"]]
# Columns are addressed by name rather than by position, so the unnamed index column
# that to_csv adds cannot shift them.
authors = df_train["authorID"].tolist()
h_indexes = df_train["hindex"].tolist()

In [None]:
# Test set
import json  # local import: this optional cell may be run on its own

df_test = pd.read_csv(path+"tokenizedTest.csv")
test_np = df_test.to_numpy()
# A list column written by to_csv comes back as text ("[101, 2023, ...]"); parse it back
# into a list of ints before it is batched.
tokenized_test_abstracts = [json.loads(tokens) for tokens in df_test["tokenizedAbstract"]]
# Addressed by name so the unnamed index column added by to_csv cannot shift it.
authors_test = df_test["authorID"].tolist()

(Optional) 5. Generating the datasets from the given files

In [4]:
import json
from bisect import bisect_left
import csv
import numpy as np

# Percentage of the lines of abstracts.txt that interpret() parses (100 = all of them).
purcent = 100
# Directory holding the input files and receiving the generated CSV files; file names are
# appended with "+", so it must end with a "/".
path = "C:/Users/Marie/Organisation_Marie/X/3A/INF 554 - Machine Learning/Project/BERT/Clean/"

# functions

def interpret(abstracts,N,purcent=100):
    """Parse the JSON payload of the first `purcent` percent of the abstract lines.

    Each line is expected to look like "<paper id>----<JSON document>".

    Parameters
    ----------
    abstracts : sequence of str
        Raw lines of the abstracts file.
    N : int
        Number of lines in `abstracts`.
    purcent : int, optional
        Percentage of the lines to parse, counted from the first one (default 100).

    Returns
    -------
    numpy.ndarray
        Object array of length purcent*N//100 holding the decoded JSON documents,
        in the order of `abstracts`. Empty when there is nothing to parse.
    """
    count = purcent*N//100
    # Progress is refreshed every purcent*200 lines. The parentheses matter:
    # "i % purcent*200" is evaluated as "(i % purcent)*200".
    step = purcent*200
    content = np.empty(count, dtype=object)
    for i in range(count):
        if i % step == 0 :
            print(str(int(100*100*i//(purcent*N)))+" %",end="\r")
        # Everything after the first "----" separator is the JSON document. Splitting on the
        # separator avoids computing the width of the id (which needed the global paper_ids
        # and failed for an id equal to 0).
        content[i] = json.loads(abstracts[i].split("----", 1)[1])
    print("100 %")
    return content

def get_h_indexes(file_path=None):
    """Read the training CSV and map each author to its h index.

    Parameters
    ----------
    file_path : str, optional
        CSV file to read; defaults to the module-level `train_path`.

    Returns
    -------
    dict
        {author (first column): h index (second column)}, both kept as the
        strings read from the file. The first line (header) is skipped.
    """
    if file_path is None:
        file_path = train_path
    h_indexes = {}
    # `with` closes the file even if a row is malformed; newline='' is what the csv module expects.
    with open(file_path,'r',newline='') as file_h_index:
        reader = csv.reader(file_h_index)
        next(reader, None) # skip the header line
        for row in reader :
            if not row : # blank line
                continue
            h_indexes[row[0]]=row[1]
    return h_indexes

def get_test_authors(file_path=None):
    """Read the test CSV and return its authors as the keys of a dictionary.

    Parameters
    ----------
    file_path : str, optional
        CSV file to read; defaults to the module-level `test_path`.

    Returns
    -------
    dict
        {author (SECOND column of the file): 0}. The first line (header) is
        skipped; the 0 values are placeholders since the test set has no h index.
    """
    if file_path is None:
        file_path = test_path
    h_indexes = {}
    # `with` closes the file even if a row is malformed; newline='' is what the csv module expects.
    with open(file_path,'r',newline='') as file_h_index:
        reader = csv.reader(file_h_index)
        next(reader, None) # skip the header line
        for row in reader :
            if not row : # blank line
                continue
            h_indexes[row[1]]=0
    return h_indexes

def get_papers_authors(file_path=None):
    """Read the author -> papers file.

    Each line is expected to look like "<author>:<paper>-<paper>-...".

    Parameters
    ----------
    file_path : str, optional
        File to read (UTF-8); defaults to the module-level `author_papers_path`.

    Returns
    -------
    dict
        {author: list of paper ids}, everything kept as strings. Lines that do
        not contain exactly one ":" are reported with print("error"); lines
        without any ":" (e.g. a blank last line) are then skipped.
    """
    if file_path is None:
        file_path = author_papers_path
    papers = {}
    # `with` closes the file; iterating the handle reads one line (one author) at a time.
    with open(file_path,'r',encoding="utf-8") as file_author_papers:
        for row in file_author_papers :
            row = row.rstrip('\n')
            temp = row.split(":")
            if len(temp)!=2 :
                print("error")
                if len(temp) < 2 :
                    continue # no paper list at all: nothing to record
            author = temp[0]
            # Empty pieces (author without papers, doubled "-") are dropped so that
            # int() can later be applied to every id.
            papers_ = [paper for paper in temp[1].split("-") if paper]
            papers[author]=papers_
    return papers


# getting everything in memory

abstracts_path = path+"abstracts.txt"
train_path = path+"train.csv"
author_papers_path = path+"author_papers.txt"
test_path = path+"test.csv"

# `with` closes the (large) file as soon as its lines are in memory.
with open(abstracts_path,'r',encoding='utf-8') as file:
    abstracts = file.readlines() # list of all the "abstracts", one line per paper

N = len(abstracts) # number of papers

# The paper id is what precedes the "----" separator of each line.
paper_ids = np.array([int(line.split("----")[0]) for line in abstracts])
content = interpret(abstracts,N,purcent)
h_indexes = get_h_indexes()
test = get_test_authors()
papers = get_papers_authors()


# reformatting train and test
#
# For every author, the abstracts of his/her papers are rebuilt from their inverted index
# and concatenated into one string. The same helpers serve the training and the test set.
# NOTE(review): the binary search below assumes `paper_ids` is sorted in increasing order,
# i.e. that abstracts.txt is sorted by paper id -- TODO confirm.

def _rebuild_abstract(inverted_index):
    """Turn an inverted index {word: [positions]} back into running text ("" if it is empty)."""
    positions = [rank for ranks in inverted_index.values() for rank in ranks]
    if not positions:
        return ""
    list_of_words = ["" for _ in range(max(positions)+1)]
    for word, ranks in inverted_index.items():
        for rank in ranks:
            list_of_words[rank] = word+" "
    return "".join(list_of_words)


def _concatenate_author_abstracts(author_id):
    """Concatenate the rebuilt abstracts of every paper of `author_id` that has a parsed abstract."""
    concatenated_abstracts = ""
    for raw_paper_id in papers[str(author_id)]:
        paper_id = int(raw_paper_id)
        index = bisect_left(paper_ids, paper_id)
        # bisect_left returns an insertion point, not a match: without the equality check a
        # paper that has no abstract would silently receive the abstract of another paper.
        # `content` can be shorter than `paper_ids` when only a percentage of the file was parsed.
        if index >= paper_ids.size or index >= len(content) or paper_ids[index] != paper_id:
            continue
        abstract = _rebuild_abstract(content[index]["InvertedIndex"])
        concatenated_abstracts = concatenated_abstracts + " "+abstract
    return concatenated_abstracts


def _abstracts_for_authors(author_ids):
    """Return the concatenated abstracts of each author of `author_ids`, printing the progress."""
    texts = ["" for _ in range(len(author_ids))]
    for i, author_id in enumerate(author_ids):
        if i % 200 == 0 :
            print(str(round(i*100/len(author_ids),2))+" %", end = "\r")
        texts[i] = _concatenate_author_abstracts(author_id)
    print("100 %")
    return texts


# training set: column1 = concatenated abstracts, column2 = h indexes (as floats)

list_of_authors = list(h_indexes.keys())
list_of_authors.sort()

column1 = _abstracts_for_authors(list_of_authors)
column2 = np.zeros(len(list_of_authors))
for i in range(len(list_of_authors)):
    column2[i] = h_indexes[str(list_of_authors[i])]

# test set: column3 = concatenated abstracts, column4 = zeros (the h indexes are unknown)

list_of_authors_test = list(test.keys())
list_of_authors_test.sort()

column3 = _abstracts_for_authors(list_of_authors_test)
column4 = np.zeros(len(list_of_authors_test))

# saving everything on the hard drive
# These two file names are the ones read back by section 1 of this notebook.

results_path_train = path+"Abstracts Dataset.csv"
results_path_test = path+"Abstracts for test.csv"

def serialize(column0,column1,column2,file_path):
    """Write three parallel columns to `file_path` as a CSV file.

    column0 -- author ids, column1 -- abstracts, column2 -- h indexes.
    One row is written per element of `column1`; the file gets the header
    authorID, abstracts, hindex preceded by pandas' unnamed index column.
    """
    header = ["authorID","abstracts","hindex"]
    rows = []
    for k in range(len(column1)):
        rows.append([column0[k],column1[k],column2[k]])
    frame = pd.DataFrame(rows,columns=header)
    frame[header].to_csv(file_path)
    
# Training file: authors, their concatenated abstracts and their h indexes.
serialize(list_of_authors,column1,column2,results_path_train)
# Test file: same layout; column4 only holds zeros (np.zeros), so its "hindex" column is a placeholder.
serialize(list_of_authors_test,column3,column4,results_path_test)

100 %
100 % %
100 % %
