# NOTEBOOK : The Eternals Donkey of ML

To run this notebook read README.md

## 1. Preparation

### 1.1. Import packages

In [None]:
import numpy as np
import pandas as pd 
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.linear_model import Lasso
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from sklearn.preprocessing import QuantileTransformer
import ast
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn.model_selection import train_test_split

import os
# Print the path of every file found under the Kaggle input directory,
# as a quick check of which datasets are mounted.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

!pip install stellargraph[gpu]

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph

from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
 
# Shared lemmatizer instance used by embed() further down.
# NOTE(review): lemmatizing presumably needs the NLTK 'wordnet' corpus, while this cell
# only downloads 'stopwords' -- confirm the corpus is available in the runtime.
lemmatizer = WordNetLemmatizer()

### 1.2. Define constants

In [None]:
# DATA
# Fallback h-index: used for graph nodes whose h-index is unknown and for test
# predictions below 1. Presumably the median training h-index -- TODO confirm.
mediane = 6

# ABSTRACT EMBEDDINGS
# Number of columns of the per-author abstract feature matrix; also the TF-IDF
# `max_features` and part of the cache file names.
NB_FEATURES_ABSTRACTS = 512
METHOD = np.mean # to combine several abstract embeddings

# RANDOM WALKS
LEN_WALK = 25            # passed as `length` to the random walker
NB_WALK_PER_NODE = 20    # passed as `n` (walks per root node)
USESTD = True            # add the two standard-deviation features
USEMEAN = True           # add the two mean features
# Quantile levels evaluated on walk h-indexes and on neighbour h-indexes.
QUANTILES = [0.1,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.9]

# STORAGE
# Base names of the cache files; the save/load helpers append the extension.
FILE_FEATURES_GRAPH    = "features_graph"
FILE_FEATURES_ABSTRACT = "features_abstracts"
FILE_DOCUMENT          = "documents"
FILE_ABSTRACTS         = "abstracts_text"
FILE_EMBEDDINGS        = "embeddings"

# Network
USE_NORMALIZER = True           # standardise X and transform y before training
USE_LINEAR_NORMALIZER_Y = True  # True: linear y transform, False: quantile transform
# NOTE(review): the training cell in this file passes epochs=10 rather than NB_EPOCHS -- confirm.
NB_EPOCHS = 30
BATCH_SIZE = 64
LR = 0.0002

In [None]:
# Graph features: 3 base values (visit ratio, degree, core number), plus two values
# (random walk / neighbours) for the std, for the mean and for each quantile level.
NB_FEATURES_GRAPH = 3 + 2*USESTD + 2*USEMEAN + 2*len(QUANTILES)
# Graph features and abstract features together.
NB_FEATURES = NB_FEATURES_GRAPH + NB_FEATURES_ABSTRACTS

### 1.3. Read data

In [None]:
# read training data (author id and h-index target)
df_train = pd.read_csv('/kaggle/input/inf554-2021/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
n = df_train.shape[0]  # number of training rows
print("n =", n)

# read test data (authors whose h-index has to be predicted)
df_test = pd.read_csv('/kaggle/input/inf554-2021/test.csv', dtype={'author': np.int64})
n_test = df_test.shape[0]  # number of test rows
print("n_test =", n_test)

## 2. Features selection

### 2.1. Vectorization of abstracts

#### 2.1.1. Preprocessing of abstracts

This function converts the abstracts from the inverted index format to readable text

In [None]:
def rearrange_text(inverted):
    """Rebuild a plain-text abstract from its inverted-index representation.

    Parameters
    ----------
    inverted : dict
        Must provide 'IndexLength' (number of word positions) and
        'InvertedIndex' (mapping word -> list of positions of that word).

    Returns
    -------
    str
        The words placed at their positions and joined by single spaces.
        Positions that no word claims stay empty strings.
    """
    slots = [""] * inverted['IndexLength']
    for token, positions in inverted['InvertedIndex'].items():
        for position in positions:
            slots[position] = token
    return " ".join(slots)

In [None]:
def compute_abstracts(path='/kaggle/input/abstracts/abstracts.txt'):
    """Parse the abstracts file into a dict mapping paper id -> plain-text abstract.

    Each line is expected to look like ``<paper_id>----<inverted index literal>``.
    Lines that cannot be parsed are skipped (best effort).

    Parameters
    ----------
    path : str
        Location of the abstracts file; defaults to the Kaggle dataset path.

    Returns
    -------
    dict
        Paper id (int) -> abstract text rebuilt by ``rearrange_text``.
    """
    abstract = {}
    with open(path) as f:
        for k in tqdm(f.readlines()):
            # Split on the first separator only, so an abstract that itself
            # contains '----' is not truncated.
            part = k.split('----', 1)
            try:
                paper_id = int(part[0])
                text = rearrange_text(ast.literal_eval(part[1]))
            except Exception:
                # Malformed line (bad id, bad literal, inconsistent index): skip it.
                # Unlike a bare `except`, this still lets KeyboardInterrupt through.
                continue
            abstract[paper_id] = text
    # The caller assigns the result, so the mapping has to be returned.
    return abstract

In [None]:
def save_abstracts(abstracts):
    """Pickle `abstracts` to '<FILE_ABSTRACTS>.pkl' in the working directory."""
    target = FILE_ABSTRACTS + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(abstracts, handle)

In [None]:
def load_abstracts(name):
    """Load the pickled abstracts stored in '<name>.pkl'.

    Parameters
    ----------
    name : str
        Path of the pickle file without its '.pkl' extension.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open trusted files.
    with open(name + '.pkl', 'rb') as f:
        abstracts = pickle.load(f)
    return abstracts

In order to not compute every time :

In [None]:
# Get the abstracts from the cheapest available source:
# 1) the attached "saved-data" dataset, 2) a pickle written earlier in the working
# directory, 3) a fresh computation, which is then saved for later runs.
if os.path.isfile("/kaggle/input/saved-data/" + FILE_ABSTRACTS + '.pkl'):
    abstract = load_abstracts("/kaggle/input/saved-data/" + FILE_ABSTRACTS)
    print("Abstracts loaded from saved dataset")
elif os.path.isfile(FILE_ABSTRACTS + '.pkl'):
    abstract = load_abstracts(FILE_ABSTRACTS)
    print("Abstracts loaded from temporary save")
else:
    abstract = compute_abstracts()
    save_abstracts(abstract)

This creates a hashmap associating each author id with the list of ids of the papers that author wrote

In [None]:
# author id -> list of paper ids, parsed from lines shaped like "<author>:<p1>-<p2>-..."
authors = {}
with open('/kaggle/input/inf554-2021/author_papers.txt') as f:
    for line in tqdm(f.readlines()):
        fields = line.split(':')
        paper_ids = list(map(int, fields[1].split('-')))
        authors[int(fields[0])] = paper_ids

This creates a hashmap associating each paper id with its position in ```X_paper```

In [None]:
# Same file and dtypes as `df_train`, read again into `df`.
df = pd.read_csv('/kaggle/input/inf554-2021/train.csv', dtype={'author': np.int64, 'hindex': np.float32})

X_paper = []   # abstract texts, in insertion order
papers = {}    # paper id -> position of its abstract in X_paper

# NOTE(review): only the papers of authors listed in train.csv are collected here, so
# papers written exclusively by test authors never enter `papers` -- confirm intended.
j = 0
for i,row in tqdm(df.iterrows()):
    author = int(row["author"])
    if author in authors:
        for paper in authors[author]:
            if paper in abstract:
                # A paper listed for several authors is appended once per author;
                # `papers` keeps the position of its latest copy.
                X_paper.append(abstract[paper])
                papers[paper] = j
                j += 1

Cleans the abstracts from special characters :

In [None]:
documents = []

def compute_documents():
    """Normalise every abstract of the global `X_paper` into a cleaned document.

    Each text has non-word characters replaced by spaces, isolated single
    letters removed, whitespace collapsed, is lower-cased and lemmatized
    word by word.

    Returns
    -------
    list of str
        One cleaned document per entry of `X_paper`, in the same order.
    """
    documents = []
    stemmer = WordNetLemmatizer()

    for sen in tqdm(range(0, len(X_paper))):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(X_paper[sen]))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove a single character at the start of the text. The anchor must be an
        # unescaped '^': the previous pattern r'\^...' looked for a literal caret,
        # which can no longer be present after the \W substitution above.
        document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization, word by word
        words = [stemmer.lemmatize(word) for word in document.split()]
        documents.append(' '.join(words))

    return documents

In [None]:
def save_documents(documents):
    """Pickle `documents` to '<FILE_DOCUMENT>.pkl' in the working directory."""
    target = FILE_DOCUMENT + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(documents, handle)

In [None]:
def load_documents(name):
    """Return the object pickled in '<name>.pkl' (`name` is given without extension)."""
    with open(name + '.pkl', 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Get the cleaned documents from the cheapest available source:
# 1) the attached "saved-data" dataset, 2) a pickle written earlier in the working
# directory, 3) a fresh computation, which is then saved for later runs.
if os.path.isfile("/kaggle/input/saved-data/" + FILE_DOCUMENT + '.pkl'):
    documents = load_documents("/kaggle/input/saved-data/" + FILE_DOCUMENT)
    print("Documents loaded from saved dataset")
# Test for the documents pickle itself: checking the abstracts pickle here would
# try to load a documents file that may not exist yet.
elif os.path.isfile(FILE_DOCUMENT + '.pkl'):
    documents = load_documents(FILE_DOCUMENT)
    print("Documents loaded from temporary save")
else:
    documents = compute_documents()
    save_documents(documents)

#### 2.1.2. Word2Vec

Here we will use Word2Vec to transform a word into a vector of 300 components.

To get the embedding of an abstract, we simply compute the mean of the embeddings of the words in the abstract.

Then, for an author, we have to merge the embeddings of all of their abstracts. The merging method can be chosen with ```METHOD```. The best one seems to be the mean.

In [None]:
def embed_sentence(sentence, model):
    """Average the vectors of the words of `sentence` that `model` knows.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of the sentence.
    model : mapping
        Word -> vector lookup supporting `in` and `[]` (for example the loaded
        word-vector model, or a plain dict).

    Returns
    -------
    numpy.ndarray or float
        Mean vector of the known words; NaN when no word is known, so callers
        can detect and clean missing embeddings.
    """
    # Membership is tested on the model that was passed in, instead of a global
    # `vocab` that only exists once a later cell has run.
    vectors = [model[w] for w in sentence if w in model]
    if not vectors:
        # Same value np.mean would give for an empty list, without the warnings.
        return np.nan
    return np.mean(vectors, axis=0)

To improve the performance of word2vec, we remove stopwords (common words such as *is*, *the*, ...) and lemmatize the words.

In [None]:
# Source: stackoverflow
# method to clean the reviews, tokenize, remove stop words and lemmatize them.
def embed(list_sentence, model):
    """Embed each text of `list_sentence` as the mean vector of its cleaned words.

    Every text goes through HTML stripping, removal of non-alphabetic
    characters, tokenisation, stop-word removal and lemmatization before
    being handed to `embed_sentence`.

    Parameters
    ----------
    list_sentence : sequence of str
        Raw texts to embed.
    model : mapping
        Word -> vector lookup forwarded to `embed_sentence`.

    Returns
    -------
    numpy.ndarray
        One row per text. The width is `model.vector_size` when the model
        exposes it, otherwise 300.
    """
    X_embeded = np.zeros((len(list_sentence), getattr(model, "vector_size", 300)))

    # The stop-word set does not depend on the sentence: build it once.
    omit_words = set(stopwords.words('english'))

    for (index,sent) in enumerate(list_sentence):

        #remove html content
        review_text = BeautifulSoup(sent).get_text()

        #remove non-alphabetic characters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)

        #tokenize the sentences
        words = word_tokenize(review_text.lower())

        #stop words removal
        words = [x for x in words if x not in omit_words]

        #lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
        X_embeded[index] = embed_sentence(lemma_words, model)

    return X_embeded

In [None]:
def create_features_abstract(df, start=0):
    """Build one abstract-embedding row per author of `df` (Word2Vec variant).

    For each row, the abstracts of the author's known papers are embedded with
    `embed` using the global `model` and combined with `METHOD`.
    A later cell defines a TF-IDF function under the same name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'author' column.
    start : int
        Offset subtracted from the row label to get the output row.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(df), NB_FEATURES_ABSTRACTS).
    """
    X_emb = np.zeros((len(df.index), NB_FEATURES_ABSTRACTS))

    for label, row in tqdm(df.iterrows()):
        position = label - start
        paper_ids = authors[row['author']]
        known_positions = [papers[p] for p in paper_ids if p in papers]
        texts = [X_paper[idx] for idx in known_positions]
        per_abstract = embed(texts, model)
        X_emb[position] = METHOD(np.array(per_abstract), axis=0)

    return X_emb

Run this if you already have the folder of embeddings computed :

In [None]:
# Load previously computed Word2Vec author embeddings.
# NOTE(review): the train file is read from the saved-data input dataset while the test
# file is read from the working directory -- confirm both locations are intended.
X_emb_new = np.load("../input/saved-data/embeddings_new.npy")
X_test_emb_new = np.load("./embeddings_test_new.npy")

If you do not have the files, you need to load the model and then compute the embeddings (loading the model can take up to 30 minutes)

For Word2Vec, we used a pretrained model, as training on our abstracts took more than 100 hours.

The model used was trained on 5.8 billion tokens (LexVec).

It can be found here : https://github.com/alexandres/lexvec


In [None]:
import gensim
# Load the pretrained word vectors from their text (non-binary) word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format('../input/word2vec/vectors.txt', binary=False)
# Word -> index mapping of the model; used for vocabulary membership tests.
vocab = model.key_to_index 

This is just a test to try out Word2Vec :

In [None]:
from scipy import spatial
# Analogy check: cosine similarity between (king - man + woman) and queen.
a = model["king"] - model["man"] + model["woman"]
b = model["queen"]
sim = 1 - spatial.distance.cosine(a, b)  # cosine similarity = 1 - cosine distance
print(sim)

To compute the embeddings :

In [None]:
# create_features_abstract sizes its output with the global NB_FEATURES_ABSTRACTS, and
# the word vectors have 300 components. The width is switched to 300 only for the
# duration of this computation and then restored, so the TF-IDF section below keeps
# the width configured in the constants cell (the network input size relies on it).
_nb_features_abstracts_saved = NB_FEATURES_ABSTRACTS
NB_FEATURES_ABSTRACTS = 300
try:
    X_test_emb_new = create_features_abstract(df_test)
    # The train embeddings belong in X_emb_new (the array saved and concatenated
    # later), not in X_test_emb, which holds the TF-IDF test features.
    X_emb_new = create_features_abstract(df_train)
finally:
    NB_FEATURES_ABSTRACTS = _nb_features_abstracts_saved

And to save them :

In [None]:
# Persist the Word2Vec author embeddings (train, then test) in the working directory.
np.save("embeddings_new.npy", X_emb_new)
np.save("embeddings_test_new.npy", X_test_emb_new)

#### 2.1.3. TF-IDF

In [None]:
def compute_embeddings(documents):
    """Return the dense TF-IDF matrix of `documents`.

    The vocabulary is capped at NB_FEATURES_ABSTRACTS terms, English stop
    words are dropped, and terms must appear in at least 5 documents and in
    at most 70% of them.
    """
    vectorizer = TfidfVectorizer(
        max_features=NB_FEATURES_ABSTRACTS,
        min_df=5,
        max_df=0.7,
        stop_words=stopwords.words('english'),
    )
    return vectorizer.fit_transform(documents).toarray()

In [None]:
def save_embeddings(embeddings):
    """Save `embeddings` to '<FILE_EMBEDDINGS>_<NB_FEATURES_ABSTRACTS>.npy'."""
    target = FILE_EMBEDDINGS + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"
    np.save(target, embeddings)

In [None]:
def load_embeddings(name):
    """Load the array stored in '<name>_<NB_FEATURES_ABSTRACTS>.npy'."""
    source = name + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"
    return np.load(source)

In [None]:
# Get the per-paper TF-IDF matrix from the cheapest available source:
# 1) the attached "saved-data" dataset, 2) a file written earlier in the working
# directory, 3) a fresh computation, which is then saved for later runs.
# The file name embeds NB_FEATURES_ABSTRACTS, so each width has its own cache.
if os.path.isfile("/kaggle/input/saved-data/" + FILE_EMBEDDINGS + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"):
    X_paper_vect = load_embeddings("/kaggle/input/saved-data/" + FILE_EMBEDDINGS)
    print("Embeddings loaded from saved dataset")
elif os.path.isfile(FILE_EMBEDDINGS + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"):
    X_paper_vect = load_embeddings(FILE_EMBEDDINGS)
    print("Embeddings loaded from temporary save")
else:
    X_paper_vect = compute_embeddings(documents)
    save_embeddings(X_paper_vect)

This performs the operation of switching from features computed per abstracts to feature computed per author

i.e. Assign papers to authors and compute the mean

In [None]:
def create_features_abstract(df, start=0):
    """Build one abstract-feature row per author of `df` (TF-IDF variant).

    For each row, the rows of the global `X_paper_vect` belonging to the
    author's known papers are combined with `METHOD`. This definition
    replaces the earlier function of the same name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'author' column.
    start : int
        Offset subtracted from the row label to get the output row.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(df), NB_FEATURES_ABSTRACTS).
    """
    X_emb = np.zeros((len(df.index), NB_FEATURES_ABSTRACTS))

    for label, row in tqdm(df.iterrows()):
        position = label - start
        paper_ids = authors[row['author']]
        known_positions = [papers[p] for p in paper_ids if p in papers]
        per_paper = [X_paper_vect[idx] for idx in known_positions]
        X_emb[position] = METHOD(np.array(per_paper), axis=0)

    return X_emb

In [None]:
def save_features_abstract(name, dict_data):
    """Save `dict_data` to '<name>_<NB_FEATURES_ABSTRACTS>.npy'."""
    target = name + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"
    np.save(target, dict_data)

In [None]:
def load_features_abstract(name):
    """Load the object stored in '<name>_<NB_FEATURES_ABSTRACTS>.npy'.

    The file is read with pickling enabled and the contained object is
    unwrapped with `.item()`.
    """
    source = name + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"
    stored = np.load(source, allow_pickle=True)
    return stored.item()

In [None]:
# Get the per-author abstract features (train and test) from the cheapest available
# source: 1) the attached "saved-data" dataset, 2) a file written earlier in the
# working directory, 3) a fresh computation, which is then saved for later runs.
if os.path.isfile("/kaggle/input/saved-data/" + FILE_FEATURES_ABSTRACT + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"):
    data = load_features_abstract("/kaggle/input/saved-data/" + FILE_FEATURES_ABSTRACT)
    X_emb = data["X_emb"]
    X_test_emb  = data["X_test_emb"]
    print("Embeddings loaded from saved dataset")
elif os.path.isfile(FILE_FEATURES_ABSTRACT + "_" + str(NB_FEATURES_ABSTRACTS) + ".npy"):
    data = load_features_abstract(FILE_FEATURES_ABSTRACT)
    X_emb = data["X_emb"]
    X_test_emb  = data["X_test_emb"]
    print("Embeddings loaded from temporary save")
else:
    # X_emb: features of the training authors
    X_emb = create_features_abstract(df_train, start=0)
    
    # X_test_emb: features of the test authors
    X_test_emb = create_features_abstract(df_test, start=0)
    
    # Save data: both arrays go into one dict stored in a single file
    dict_data = {"X_emb": X_emb, "X_test_emb": X_test_emb}
    save_features_abstract(FILE_FEATURES_ABSTRACT, dict_data)

#### 2.1.4. GloVe

In [None]:
# source : stackoverflow

def compute_embeddings_glove(documents):
    """Prepare TF-IDF features, padded token sequences and a GloVe embedding matrix.

    Parameters
    ----------
    documents : list of str
        Cleaned documents.

    Returns
    -------
    tuple
        (X_paper_vect, train_data, embedding_matrix) where X_paper_vect is the
        dense TF-IDF matrix, train_data holds the token sequences padded to
        length 500, and embedding_matrix is a (50000, 200) array whose row i is
        the GloVe vector of the word with tokenizer index i (zeros if unknown).
    """
    tfidfconverter = TfidfVectorizer(max_features=NB_FEATURES_ABSTRACTS-200, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
    X_paper_vect = tfidfconverter.fit_transform(documents).toarray()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)

    train_sequences = tokenizer.texts_to_sequences(documents)
    train_data = pad_sequences(train_sequences, maxlen=500)

    EMBEDDING_FILE = '../input/glove6b/glove.6B.200d.txt'
    embeddings_index = {}
    # Read inside a context manager so the file handle is always closed, and
    # decode explicitly as UTF-8 instead of relying on the platform default.
    with open(EMBEDDING_FILE, encoding='utf8') as glove_file:
        for line in glove_file:
            word, *coefs = line.rstrip("\n").split(" ")
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    embedding_matrix = np.zeros((50000, 200))
    for word, index in tokenizer.word_index.items():
        # Only the first 50000 word indexes fit in the matrix.
        if index > 50000 - 1:
            break
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

    return X_paper_vect, train_data, embedding_matrix
def compute_embeddings_glove_idf(train_data, embedding_matrix):
    """Average the GloVe vectors of every padded token sequence.

    Parameters
    ----------
    train_data : array-like
        Padded token-index sequences, one row per document.
    embedding_matrix : numpy.ndarray
        (50000, 200) matrix of word vectors used as frozen embedding weights.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(train_data), 200): the mean embedding of each row.
    """
    batch_size = 128
    # Size everything from the data actually passed in, not from the global
    # `documents` list, so the slices always match the output array.
    total = len(train_data)

    # NOTE(review): token indexes of 50000 or more would fall outside this lookup
    # table -- confirm the tokenizer vocabulary is bounded accordingly.
    # The frozen layer is created once instead of once per batch.
    embedding_layer = Embedding(50000, 200, input_length=500, weights=[embedding_matrix], trainable=False)

    X_paper_vect = tf.Variable(tf.zeros([total, 200]))
    for begin in tqdm(range(0, total, batch_size)):
        end = min(begin + batch_size, total)
        batch_mean = tf.math.reduce_mean(embedding_layer(train_data[begin:end]), axis=1)
        X_paper_vect[begin:end].assign(batch_mean)

    return X_paper_vect.numpy()

### 2.2. Vectorization of the graph

#### 2.2.1. Node2Vec

The model was trained on the other notebook.

We load it

In [None]:
# Load the pre-trained node2vec model, stored as a gensim Word2Vec model
# (its `wv` vectors are looked up by author id converted to a string).
import gensim
model_n2v = gensim.models.Word2Vec.load("../input/saved-data/node2vec.model")

Then compute the embeddings for the train set :

In [None]:
# First column of the training frame, one entry per row. The frame is converted to
# an array once; converting it again for every row made this quadratic in n.
author_ids_train = list(df_train.to_numpy()[:n, 0])

In [None]:
# node2vec embedding of every training author, in the order of df_train.
# All ids are used: skipping the first one would give one row fewer than the target
# and the other feature matrices, and shift every remaining row by one author.
X_n2v = []
for author_id in tqdm(author_ids_train):
    X_n2v.append(model_n2v.wv[str(int(author_id))])
X_n2v = np.array(X_n2v)

And compute the embeddings for the test set

In [None]:
# Second column of the test frame, one entry per row. The frame is converted to an
# array once; converting it again for every row made this quadratic in n_test.
author_ids_test = list(df_test.to_numpy()[:n_test, 1])

In [None]:
# node2vec embedding of every test author, in the order of df_test.
X_test_n2v = np.array(
    [model_n2v.wv[str(int(author_id))] for author_id in tqdm(author_ids_test)]
)

As this is really fast to compute, we will not save it.

#### 2.2.2. Quantile and Variance with random walk

Here we invented a method :

For each author, we do ```NB_WALK_PER_NODE``` random walks of length ```LEN_WALK```.

We store the *h_index* of each visited node in an array. Then here are the features :
- ```nb_quantiles``` of quantiles of the array created in the random walk
- the number of unique authors visited in the random walk
- the variance of the h-index in the random walk
- the mean of the h-index in the random walk

And we also add features from the node and its neighbors:
- degree of the node
- core number of the node
- ```nb_quantiles``` of quantiles of the array of h-index of its neighbors
- the mean of the h-index of its neighbors

In the end this gave us 33 features.

Let's start by loading the graph and computing the core numbers :

In [None]:
# load the graph (co-authorship edge list, space separated, integer node ids)
G = nx.read_edgelist('/kaggle/input/inf554-2021/coauthorship.edgelist', delimiter=' ', nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges() 
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)


# computes structural features for each node: dict node -> core number
core_number = nx.core_number(G)

We create the object for the random walk

In [None]:
# Random-walk generator over the co-authorship graph, converted to a StellarGraph.
rw = BiasedRandomWalk(StellarGraph.from_networkx(G))

And we assign to each node in the graph (an author), its h-index

In [None]:
# Store the known h-index of each training author on its graph node, under "label".
# Nodes without a "label" attribute are the authors whose h-index is unknown.
for i,row in df_train.iterrows():
    node = row['author']
    G.nodes[node]["label"] = row['hindex']

This creates a Hashmap associating the id of an author to its h-index

In [None]:
# author id -> h-index, filled first with the known training values.
hindex_from_author = {}

for i,row in tqdm(df_train.iterrows()):
    hindex_from_author[row["author"]] = row["hindex"]

# Every graph node without a known h-index gets the fallback value `mediane`;
# those nodes are counted to report their share of the graph.
author_with_hindex_unknown = 0
for node in tqdm(G.nodes()):
    if not(node in hindex_from_author):
        hindex_from_author[node] = mediane
        author_with_hindex_unknown += 1
        
# Percentage of graph nodes without a known h-index, rounded to 2 decimals.
percentage_author_with_hindex_unknown = np.round(100 * author_with_hindex_unknown / n_nodes, 2)
print("Il y a", percentage_author_with_hindex_unknown, "% d'auteurs sans h_index")

This is what creates the features, computing the random walk

It will take several hours

In [None]:
def create_features_graph(df, build_y = True, start=0):
    """Compute the graph features of every author of `df`.

    For each author, biased random walks are run from its node and the
    h-indexes of the visited nodes are summarised; the same summary is
    computed on the h-indexes of its direct neighbours.

    Column layout of the returned matrix (q = number of quantile levels):
      0              ratio of unique visited nodes to LEN_WALK * NB_WALK_PER_NODE
      1              degree of the node
      2              core number of the node
      3 .. 3+q-1     quantiles of the walk h-indexes
      3+q .. 3+2q-1  quantiles of the neighbour h-indexes
      then, if USESTD:  std of walk h-indexes, std of neighbour h-indexes
      then, if USEMEAN: mean of walk h-indexes, mean of neighbour h-indexes

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'author' column (and 'hindex' when `build_y` is True).
    build_y : bool
        When True, also return the vector of 'hindex' values.
    start : int
        Offset subtracted from the row label to get the output row.

    Returns
    -------
    X or (X, y)
        X has shape (len(df), NB_FEATURES_GRAPH); y has shape (len(df),).
    """
    list_quantiles = QUANTILES
    total_rows = len(df.index)
    nb_quantiles = len(list_quantiles)
    use_std = USESTD
    use_mean = USEMEAN
    nb_features = NB_FEATURES_GRAPH

    print("=======================")
    print("LEN_WALK         =", LEN_WALK)
    print("NB_WALK_PER_NODE =", NB_WALK_PER_NODE)
    print("=======================")

    print("\n=====================")
    print("NB quantiles =", nb_quantiles)
    print("Use STD      =", use_std)
    print("Use Mean     =", use_mean)
    print("NB features  =", nb_features)
    print("=====================")

    print("\nN =", total_rows)


    X = np.zeros((total_rows, nb_features))
    if build_y: y = np.zeros(total_rows)

    for j,row in tqdm(df.iterrows()):
        i = j - start
        node = row['author']

        # ====================== RANDOM WALKS ====================== #
        walks = rw.run(
              nodes=[node], # root nodes
              length=LEN_WALK,  # maximum length of a random walk
              n=NB_WALK_PER_NODE,        # number of random walks per root node 
              p=0.5,       # Defines (unormalised) probability, 1/p, of returning to source node
              q=2.0        # Defines (unormalised) probability, 1/q, for moving away from source node
        )

        # Build list of h-index visited (the root node itself is ignored)
        visited = set()
        list_h_index = []
        for walk in walks:
            for node_visited in walk:
                if node != node_visited:
                    list_h_index.append(hindex_from_author[node_visited])
                    visited.add(node_visited)
        # A walk that never leaves the root gives no value; fall back to the
        # default h-index, as done for nodes without neighbours below, so the
        # statistics are always computed on a non-empty array.
        if len(list_h_index) == 0: list_h_index.append(mediane)
        list_h_index = np.array(list_h_index)

        # Metrics used
        quantiles = np.quantile(list_h_index, list_quantiles)
        std = np.std(list_h_index)
        mean = np.mean(list_h_index)
        count_visited = len(visited) / (LEN_WALK * NB_WALK_PER_NODE)
        # =========================================================== #


        # ====================== NEIGHBORS ====================== #
        # Build list of h-index Neighbors
        list_h_index = [G.nodes[n]["label"]  if "label" in G.nodes[n] else mediane for n in G.neighbors(node)]
        if len(list_h_index) == 0: list_h_index.append(mediane)
        list_h_index = np.array(list_h_index)

        # Metrics used
        quantiles_neigh = np.quantile(list_h_index, list_quantiles)
        std_neigh = np.std(list_h_index)
        mean_neigh = np.mean(list_h_index)
        # ======================================================= #


        # ====================== ADD FEATURES TO XTRAIN ====================== #
        X[i,0] = count_visited        # ratio of uniques node visited during random walk
        X[i,1] = G.degree(node)       # Nb of neighbors
        X[i,2] = core_number[node]    # Core number of the node
        # Slice assignment avoids an inner loop that reused the outer loop variable.
        X[i,3:3 + nb_quantiles] = quantiles
        X[i,3 + nb_quantiles:3 + 2*nb_quantiles] = quantiles_neigh
        if use_std:
            X[i,3 + 2*nb_quantiles] = std
            X[i,4 + 2*nb_quantiles] = std_neigh
        if use_mean:
            X[i,3 + 2*use_std + 2*nb_quantiles] = mean
            X[i,4 + 2*use_std + 2*nb_quantiles] = mean_neigh
        # ==================================================================== #


        # ====================== ADD YTRAIN ====================== #
        if build_y: y[i] = row['hindex']

    if build_y: return X,y
    return X

In [None]:
def save_features_graph(name, dict_data):
    """Save `dict_data` to '<name>.npy'."""
    target = name + ".npy"
    np.save(target, dict_data)

In [None]:
def load_features_graph(name):
    """Load the object stored in '<name>.npy'.

    The file is read with pickling enabled and the contained object is
    unwrapped with `.item()`.
    """
    stored = np.load(name + ".npy", allow_pickle=True)
    return stored.item()

This is a helper function to load the features if they were saved (from local files of our dataset).

Otherwise it performs the heavy computation

In [None]:
# Get the graph features from the cheapest available source:
# 1) the attached "saved-data" dataset, 2) a file written earlier in the working
# directory, 3) the heavy random-walk computation, which is then saved.
if os.path.isfile("/kaggle/input/saved-data/" + FILE_FEATURES_GRAPH + ".npy"):
    data = load_features_graph("/kaggle/input/saved-data/" + FILE_FEATURES_GRAPH)
    X = data["X"]
    y = data["y"]
    X_test  = data["X_test"]
    # This branch reads the saved dataset, so the message must say so.
    print("Graph features loaded from saved dataset")
elif os.path.isfile(FILE_FEATURES_GRAPH + ".npy"):
    data = load_features_graph(FILE_FEATURES_GRAPH)
    X = data["X"]
    y = data["y"]
    X_test  = data["X_test"]
    print("Graph features loaded from temporary save")
else:
    # X, y
    X, y = create_features_graph(df_train, build_y = True, start=0)
    
    # X_test
    X_test = create_features_graph(df_test, build_y = False, start=0)
    
    # Save data
    dict_data = {"X": X, "y": y, "X_test": X_test}
    save_features_graph(FILE_FEATURES_GRAPH, dict_data)

## 3. Clean Data

### 3.1. Cleaning Nans

In [None]:
# Clean embed nan: NaN entries of the Word2Vec author embeddings become 0 (in place).
X_emb_new[np.isnan(X_emb_new)] = 0
# Per-column mean computed over the non-zero entries only.
# NOTE(review): a column without any non-zero entry divides by zero here -- confirm
# this cannot happen with the real data.
mean = np.true_divide(X_emb_new.sum(0),(X_emb_new!=0).sum(0))

#X[np.isnan(X)] = 0
#for i in tqdm(range(len(X))):
#    if np.sum(np.abs(X[i])) < 1e-6:

In [None]:
# Inspect the shape of the per-column mean vector.
mean.shape

In [None]:
# Per-column mean of X_concat over its non-zero entries; rebinds `mean`.
# NOTE(review): within this file X_concat is first assigned in a later cell (the
# concatenation of all feature groups), so this line needs that cell to have run
# first -- confirm the intended cell order.
mean = np.true_divide(X_concat.sum(0),(X_concat!=0).sum(0))

In [None]:
# Replace NaNs by 0, then overwrite every all-zero training row (absolute sum below
# 1e-6) with the column-mean vector `mean`. X_concat is modified in place.
X_concat[np.isnan(X_concat)] = 0
for i in tqdm(range(len(X_concat))):
    if np.sum(np.abs(X_concat[i])) < 1e-6:
        X_concat[i] = mean
#X_test_emb_new[X_test_emb_new==0] = mean
# Display the first rows as a visual check.
X_concat[:5]

In [None]:
# Same cleaning for the test matrix: NaNs become 0 and all-zero rows are replaced by
# `mean`, the vector last computed from the training matrix in the cells above.
X_test_concat[np.isnan(X_test_concat)] = 0
for i in tqdm(range(len(X_test_concat))):
    if np.sum(np.abs(X_test_concat[i])) < 1e-6:
        X_test_concat[i] = mean
#X_test_emb_new[X_test_emb_new==0] = mean
# Display the first rows as a visual check.
X_test_concat[:5]

In [None]:
# Append the column-wise mean of the test matrix as one extra row at the end.
# NOTE(review): after this cell X_test_concat has one row more than before, and
# re-running the cell appends yet another row -- confirm the extra row is intended.
mmm = np.mean(X_test_concat, axis=0)
print(mmm.shape)
print(X_test_concat.shape)
X_test_concat = np.concatenate((X_test_concat, mmm.reshape(1,-1)), axis=0)

In [None]:
# Append the column-wise mean of the training matrix as one extra row at the end.
# NOTE(review): after this cell X_concat has one row more than before, and re-running
# the cell appends yet another row -- confirm the row count still matches the target
# vector used for the train/eval split.
mmm2 = np.mean(X_concat, axis=0)
print(mmm2.shape)
print(X_concat.shape)
X_concat = np.concatenate((X_concat, mmm2.reshape(1,-1)), axis=0)

In [None]:

# Shapes of the four test feature groups (node2vec, graph, Word2Vec, TF-IDF) ...
print("X_test_n2v", X_test_n2v.shape)
print("X_test", X_test.shape)
print("X_test_emb_new", X_test_emb_new.shape)
print("X_test_emb", X_test_emb.shape)

# ... and of the matching training groups; the row counts must agree within each set
# for the concatenation along axis 1 to work.
print("X_n2v", X_n2v.shape)
print("X", X.shape)
print("X_emb_new", X_emb_new.shape)
print("X_emb", X_emb.shape)

In [None]:
print("X node2vec shape:     ", X_n2v.shape)
print("X graph shape:     ", X.shape)
print("X embeddings shape:", X_emb_new.shape)
# Final design matrices: node2vec, graph, Word2Vec and TF-IDF features side by side,
# in the same column order for train and test.
X_concat = np.concatenate((X_n2v, X, X_emb_new, X_emb), axis=1)
X_test_concat = np.concatenate((X_test_n2v, X_test, X_test_emb_new, X_test_emb), axis=1)
print("X concat shape:    ", X_concat.shape)

In [None]:
# Display the first two rows of the concatenated training matrix.
X_concat[:2]

### 3.2. Checking there is no Nans

In [None]:
# Number of NaN entries left in the training matrix (expected to be 0).
np.count_nonzero(np.isnan(X_concat))

In [None]:
# Number of NaN entries left in the test matrix (expected to be 0).
np.count_nonzero(np.isnan(X_test_concat))

### 3.3. Splitting

In [None]:
# Target vector: the h-index of every training author, in row order, as float64.
# Reading the column directly replaces a row-by-row iterrows() loop.
y = df_train['hindex'].to_numpy(dtype=np.float64)

In [None]:
# Overrides the NB_FEATURES computed in the constants section.
# NOTE(review): presumably 300 Word2Vec + 33 graph + 128 node2vec columns; the
# network cell adds 512 on top of this for its input size -- confirm against
# X_concat.shape[1].
NB_FEATURES = 300 + 33 + 128

In [None]:
# Hold out 1% of the training rows for evaluation; the seed is fixed for reproducibility.
X_train, X_eval, y_train, y_eval = train_test_split(X_concat, y, test_size=0.01, random_state=0)

In [None]:
# Display the first training row.
X_train[0]

### 3.4. Normalize

In [None]:
# Per-feature statistics of the training split; the small epsilon keeps the division
# below defined for constant columns.
mean_train = np.mean(X_train, axis=0)
std_train = np.std(X_train, axis=0) + 1e-7

def transform(X):
    """Standardise `X` column-wise with the training mean and standard deviation."""
    return (X - mean_train) / std_train

In [None]:
# Centre and scale of the target on the training split.
# Note that, despite its name, `mean_train_Y` holds the median of y_train.
mean_train_Y = np.median(y_train, axis=0)
std_train_Y  = np.std(y_train, axis=0) + 1e-7

def transformYLinear(Y):
    """Shift `Y` by the training median and scale it by the training std."""
    return (Y - mean_train_Y) / std_train_Y

In [None]:
# Display the per-feature standard deviations used by transform().
std_train

In [None]:
def inverseTransformYLinear(Y):
    """Undo transformYLinear: scale by `std_train_Y`, then add `mean_train_Y`."""
    return Y * std_train_Y + mean_train_Y

In [None]:
# Quantile transformer for the target, fitted on the training targets only.
# `fit` is enough here: the transformed values were computed and then discarded.
qt = QuantileTransformer(n_quantiles=10, random_state=0)
qt.fit(y_train.reshape(-1,1))

def transformYQuantile(Y):
    """Map `Y` (a 2-D column of targets) through the fitted quantile transformer."""
    return qt.transform(Y)

In [None]:
def inverseTransformYQuantile(Y):
    """Undo transformYQuantile using the fitted quantile transformer `qt`."""
    return qt.inverse_transform(Y)

In [None]:
# Pick the target transform and its inverse according to the configuration flag.
if USE_LINEAR_NORMALIZER_Y:
    transformY, inverseTransformY = transformYLinear, inverseTransformYLinear
else:
    transformY, inverseTransformY = transformYQuantile, inverseTransformYQuantile

In [None]:
# Normalise the features of every split and transform the targets (reshaped to
# columns). The variables are overwritten, so running this cell a second time would
# apply the normalisation twice.
if USE_NORMALIZER:
    X_train = transform(X_train)
    X_eval  = transform(X_eval)
    X_test_concat  = transform(X_test_concat)
    y_train = transformY(y_train.reshape(-1,1))
    y_eval  = transformY(y_eval.reshape(-1,1))

## 4. Network

### 4.1. Definition

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Fully connected regressor: two tanh layers of 128 units, one selu layer of 64 units
# and a single linear output.
# NOTE: this rebinds the name `model`, which earlier cells use for the word-vector
# model; cells that embed text must not be run after this one.
model = Sequential()
model.add(Dense(128, activation='tanh', kernel_initializer="uniform", input_shape=(NB_FEATURES + 512,)))
model.add(Dense(128, activation='tanh', kernel_initializer="uniform"))
model.add(Dense(64, activation='selu'))
model.add(Dense(1, activation='linear'))

# `learning_rate` is the supported keyword; `lr` is its deprecated alias.
# Alternative optimizer tried: tf.keras.optimizers.Adam(learning_rate=LR)
opt = tf.keras.optimizers.SGD(learning_rate=5*LR, momentum=0.8)
model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error'])
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()

### 4.2. Train

In [None]:
# Change the optimizer's learning rate in place to half of LR before training.
opt.lr.assign(0.5*LR)

In [None]:
# Train on the training split and monitor the loss on the held-out split.
# NOTE(review): the number of epochs is hard-coded to 10 here, while the constants
# section defines NB_EPOCHS -- confirm which value is intended.
model.fit(X_train, y_train, validation_data=(X_eval, y_eval), epochs=10, batch_size=BATCH_SIZE)

### 4.3. Evaluate

In [None]:
# Predict on the held-out split and map predictions back to the h-index scale.
y_eval_pred = inverseTransformY(model(X_eval))
# Rounded predictions (first output column), with a lower bound of 1.
y1 = np.round(np.asarray(y_eval_pred)[:, 0])
y1[y1 <1] = 1
# Rounded ground truth, mapped back to the h-index scale as well.
y2 = np.round(np.asarray(inverseTransformY(y_eval))[:, 0])
# Mean squared error between rounded predictions and ground truth. `acc` is kept
# as an alias because the value used to be stored (and printed) under that name.
mse = np.mean((y1-y2)**2)
acc = mse

print("Predicted:", y1[:20])
print("Ground truth:", y2[:20])
# The metric is an error (lower is better), so it is labelled as such.
print("MSE:", mse)

### 4.4. Computing test

In [None]:
# PREDICTING TEST VALUES (for leaderboard)
y_test_pred = inverseTransformY(model(X_test_concat))
# Rounded predictions (first output column); values below 1 fall back to `mediane`.
y_pred = np.array([np.round(x[0]) for x in list(np.array(y_test_pred))])
y_pred[y_pred <1] = mediane
print(y_pred[:20])

# write the predictions to file
# np.round replaces np.round_, an alias that no longer exists in NumPy 2.0.
# The Series is aligned on df_test's index when it is assigned.
df_test['hindex'] = pd.Series(np.round(y_pred, decimals=3))
df_test.loc[:,["author","hindex"]].to_csv('submission.csv', index=False)

print("Y shape:", y_pred.shape)
print("Y pred:", y_pred[:20])