Our main idea here is to understand how the words used in abstracts are linked to the h-indexes of the authors who use them.
Ideally, we would like to find words that are mostly used by scientists with very high h-indexes, so that if an author of the test set uses them, we can infer that their h-index must be high as well. However, we also need such words to be used by enough scientists; otherwise the probability that a scientist in the test set uses them is very low.

1. Getting everything into memory and building a dictionary of words

In [1]:
import numpy
import json
import pandas as pd
import re
from bisect import bisect_left
import csv
import random as rd

In [2]:
# hyper parameters

purcent = 100 # percentage of the lines of abstracts.txt that are parsed and analysed
author_threshold = 3 # minimal number of authors having used the word (NOTE(review): not referenced in the visible code)
word_length_threshold = 3 # a cleaned word is kept only if it is strictly longer than this (see dictionnaire)

# Data directory; file names are appended by plain string concatenation, so it must end with "/".
path = "C:/Users/Marie/Organisation_Marie/X/3A/INF 554 - Machine Learning/Project/"

In [None]:
# getting into memory the different parts of the dataset

def interpret(abstracts,N,purcent=100):
    """Parse the JSON payload of the first ``purcent`` % of the abstract lines.

    Each line is expected to have the form ``<paper_id>----<json>``; everything
    after the first ``----`` is decoded with ``json.loads``.

    Parameters:
        abstracts: sequence of raw lines.
        N: total number of lines in ``abstracts``.
        purcent: percentage (0-100) of the lines to parse, counted from the start.

    Returns:
        A 1-D numpy array of dtype ``object`` holding ``purcent*N//100`` decoded
        payloads, in the same order as the input lines.

    Raises:
        json.JSONDecodeError: if a payload is not valid JSON (including a line
            that has no ``----`` separator, whose payload is empty).
    """
    count = purcent*N//100
    # Refresh the progress display every purcent*200 lines. The parentheses
    # matter: "i % purcent*200" would be evaluated as "(i % purcent)*200".
    step = purcent*200
    content = numpy.empty(count, dtype=object)
    for i in range(count):
        if i % step == 0:
            print(str(int(100*100*i//(purcent*N)))+" %",end="\r")
        # Split on the separator instead of computing the offset from the
        # paper id, so the function does not depend on any outside variable.
        _, _, payload = abstracts[i].partition("----")
        content[i] = json.loads(payload)
    return content

def get_content_and_paper_ids():
    """Load abstracts.txt and return the paper ids and the parsed abstracts.

    Returns:
        (paper_ids, content) where ``paper_ids`` is a numpy array with the
        integer id found before the first ``----`` of every line, and
        ``content`` is whatever ``interpret`` returns for the first
        ``purcent`` % of the lines. ``paper_ids`` therefore covers all lines
        while ``content`` may be shorter when ``purcent`` < 100.
    """
    # The context manager closes the file even if reading fails.
    with open(path+"abstracts.txt",'r',encoding='utf-8') as file_abstracts:
        abstracts = file_abstracts.readlines() # list of all the "abstracts"
    N = len(abstracts) # total number of abstracts (624,181 according to the original author)
    # The ids are reported by the original author to be already sorted in the
    # file; the binary searches done later rely on that.
    paper_ids = numpy.array([int(line.split("----", 1)[0]) for line in abstracts])
    content = interpret(abstracts,N,purcent) # the abstracts as dictionaries

    return paper_ids,content

def get_h_indexes():
    """Read train.csv and map the first column of each row to the second.

    The first row is treated as a header and skipped. Values are kept as the
    strings read from the file (no numeric conversion is done here).

    Returns:
        dict mapping ``row[0]`` to ``row[1]`` for every data row.
    """
    h_indexes = {}
    # newline='' is the documented way to open a file for the csv module;
    # the context manager guarantees the handle is closed.
    with open(path+"train.csv",'r',newline='') as file_h_index:
        reader = csv.reader(file_h_index)
        next(reader, None) # skip the header row (no-op on an empty file)
        for row in reader:
            h_indexes[row[0]] = row[1]
    return h_indexes

def get_papers_authors():
    """Read author_papers.txt and map each author to their list of paper ids.

    Each line is expected to look like ``<author>:<id>-<id>-...``. Paper ids
    are returned as strings, exactly as found in the file.

    Returns:
        dict mapping the text before ``:`` to the list obtained by splitting
        the text after it on ``-``.
    """
    papers = {}
    # Iterate the file lazily and let the context manager close it.
    with open(path+"author_papers.txt",'r',encoding="utf-8") as file_author_papers:
        for row in file_author_papers: # one line per author
            temp = row.rstrip('\n').split(":")
            if len(temp)!=2 :
                print("error")
                if len(temp) < 2 :
                    # No ":" at all: there is no paper list to read, so skip
                    # the line instead of failing on temp[1].
                    continue
            papers[temp[0]] = temp[1].split("-")
    return papers

def get_reds():
    """Read test.csv and return the second column of every data row.

    The first row is treated as a header and skipped.

    Returns:
        list of ``row[1]`` values (strings), in file order.
    """
    # newline='' is the documented way to open a file for the csv module;
    # the context manager guarantees the handle is closed.
    with open(path+"test.csv",'r',newline='') as file_reds:
        reader = csv.reader(file_reds)
        next(reader, None) # skip the header row (no-op on an empty file)
        return [row[1] for row in reader]

# Load every part of the dataset once; the cells below use these module-level names.
paper_ids,content = get_content_and_paper_ids() # ids of all abstracts, parsed abstracts
h_indexes = get_h_indexes() # first column of train.csv -> second column (both strings)
papers = get_papers_authors() # author -> list of paper ids (strings)
reds = get_reds() # second column of test.csv, one entry per data row

In [None]:
# Now, let's make our dictionary of words, as a sorted Python list.

def dictionnaire(content,N,purcent=100,min_length=None):
    """Build the sorted list of distinct cleaned words found in the abstracts.

    A word is cleaned by removing every character that is not an ASCII letter
    and lower-casing the rest; it is kept only if the cleaned form is strictly
    longer than ``min_length``.

    Parameters:
        content: indexable collection of dicts with an ``"InvertedIndex"``
            mapping whose keys are the raw words.
        N: number of entries of ``content`` to consider before applying ``purcent``.
        purcent: percentage of those ``N`` entries to scan, from the start.
        min_length: length threshold; defaults to the module-level
            ``word_length_threshold`` when omitted.

    Returns:
        A sorted list of unique words. The empty string is always its first
        element, as in the original implementation, which seeded the list with it.
    """
    if min_length is None:
        min_length = word_length_threshold
    count = purcent*N//100
    # Parenthesised on purpose: "i % purcent*200" means "(i % purcent)*200".
    step = purcent*200
    # Collect into a set and sort once at the end; inserting into a sorted
    # list shifts elements on every new word and gives the same result.
    vocabulary = {""}
    for i in range(count):
        if i % step == 0 :
            print(str(int(100*100*i//(purcent*N)))+" %",end="\r")
        for raw_word in content[i]["InvertedIndex"]:
            word = re.sub(r"[^a-zA-Z]", "", raw_word).lower() # preprocessing the word
            if len(word) > min_length :
                vocabulary.add(word)
    return sorted(vocabulary)

# Build the sorted vocabulary from the loaded abstracts.
# NOTE(review): `content` already holds only purcent % of the abstracts (see interpret), and
# dictionnaire applies purcent again to len(content); with purcent < 100 only part of the
# loaded abstracts is scanned. Confirm this is intended.
words = dictionnaire(content,len(content),purcent)

2. Computing which authors (+ corresponding h-index if we know it) used which word

In [None]:
def _record_author_words(super_object, words, paper_ids, content, author, author_papers, h):
    """Append ``[author, paper_id, h]`` to the bucket of every dictionary word used in the author's abstracts."""
    for raw_id in author_papers:
        if not raw_id:
            # An author line with an empty paper list yields [""]; nothing to look up.
            continue
        paper_id = int(raw_id)
        index_in_content = bisect_left(paper_ids, paper_id)
        # bisect_left only gives an insertion point: make sure the paper is
        # really there, otherwise the abstract of the *next* paper would be used.
        if index_in_content >= len(content) or index_in_content >= len(paper_ids):
            continue
        if paper_ids[index_in_content] != paper_id:
            continue
        # Only the keys are read: how many times each word occurs is not used.
        for raw_word in content[index_in_content]["InvertedIndex"]:
            word = re.sub(r"[^a-zA-Z]", "", raw_word).lower() # preprocessing the word
            index = bisect_left(words, word)
            if index < len(words) and words[index]==word :
                super_object[index].append([author,paper_id,h])


def get_correspondance(words,paper_ids,content,h_indexes,papers,test_authors=None):
    """Record, for every dictionary word, which authors used it and in which paper.

    Parameters:
        words: sorted list of dictionary words (binary-searched).
        paper_ids: sorted sequence of paper ids; ``paper_ids[k]`` is the id of
            ``content[k]``.
        content: parsed abstracts, each a dict with an ``"InvertedIndex"`` mapping.
        h_indexes: mapping author -> h-index (anything ``float()`` accepts).
        papers: mapping author -> iterable of paper ids (strings or ints).
        test_authors: authors whose h-index is unknown; defaults to the
            module-level ``reds`` when omitted.

    Returns:
        A list parallel to ``words``; element ``i`` is a list of
        ``[author, paper_id, h]`` entries, where ``h`` is the integer h-index
        for authors of ``h_indexes`` and ``None`` for ``test_authors``.

    Raises:
        KeyError: if an author is missing from ``papers``.
    """
    if test_authors is None:
        test_authors = reds

    super_object = [[] for _ in words]

    J = len(h_indexes)
    # Progress is refreshed about 1000 times; max() avoids a modulo by zero
    # when there are fewer than 1000 authors.
    step = max(1, J//1000)
    for j, (author, h) in enumerate(h_indexes.items()):
        if j % step == 0 :
            print(round(100*j/J,1),end="\r")
        _record_author_words(super_object, words, paper_ids, content,
                             author, papers[author], int(float(h)))

    for red in test_authors :
        _record_author_words(super_object, words, paper_ids, content,
                             red, papers[red], None)
    return super_object

# correspondance[i] lists the [author, paper_id, h] entries recorded for words[i].
correspondance = get_correspondance(words,paper_ids,content,h_indexes,papers)

In [None]:
# 2.b) let's see what correspondance looks like

def create_eff_means(correspondance):
    """Compute, for every word, its number of entries and its mean known h-index.

    Parameters:
        correspondance: list of buckets; each bucket is a list of
            ``[author, paper_id, h]`` entries where ``h`` is a number or ``None``.

    Returns:
        (eff, means): two float numpy arrays of the same length as
        ``correspondance``. ``eff[i]`` is the number of entries in bucket ``i``
        (entries with ``h is None`` included); ``means[i]`` is the average of
        the non-``None`` h values of the bucket, or ``-1`` when it has none.
    """
    W = len(correspondance)
    means = numpy.full(W, -1.0)
    eff = numpy.zeros(W)
    for i, matches in enumerate(correspondance):
        eff[i] = len(matches)
        # Each entry is a list, so the h-index has to be picked out (third
        # field); entries of authors with unknown h-index carry None.
        known_h = [match[2] for match in matches if match[2] is not None]
        if known_h:
            means[i] = sum(known_h)/len(known_h)
    return eff, means

effectifs,means = create_eff_means(correspondance)
# most used word ?
print(words[numpy.argmax(effectifs)])
# proportion of words with fewer than 5 recorded uses ?
# (a boolean mask selects the matching entries; .size is an attribute, not a method)
print(round(100*effectifs[effectifs < 5].size/len(words),1)," %")
# how many words have more recorded uses than 10% of the vocabulary size ?
# NOTE(review): the original comment spoke of "10% of the papers" while the
# threshold is len(words)/10 - confirm which reference was intended.
print(effectifs[effectifs > len(words)/10].size)
# what word has the highest "mean h index" ?
print(words[numpy.argmax(means)])

# We could even draw some diagrams !

In [None]:
# Optional : saving our results
# One row per word: the word, its entries, its mean h-index and its number of entries.
# (the counts live in `effectifs`; `eff` only exists inside create_eff_means)
results = [[words[i],correspondance[i],means[i],effectifs[i]] for i in range(len(words))]
dataframe = pd.DataFrame(results,columns=["word","correspondance","means","effectifs"])
dataframe.to_csv(path+"AnalysisOfAbstracts_results_"+str(purcent)+".csv")

3. Making a predictor based on our discoveries

In [3]:
def predict(authorID,correspondance):
    """Predict an author's h-index from the authors who share words with them.

    For every bucket of ``correspondance`` that contains at least one entry
    whose first field equals ``authorID``, the non-``None`` h values (third
    field) of all entries of that bucket are collected; the collected values
    are then passed to ``func`` and its result is returned.
    """
    neighbour_h = []
    for matches in correspondance:
        # Skip the words this author never used.
        if not any(entry[0]==authorID for entry in matches):
            continue
        neighbour_h.extend(entry[2] for entry in matches if entry[2] is not None)
    return func(neighbour_h)
    
def func(neighbour_h):
    """Aggregate the collected h-indexes into one prediction: their arithmetic mean."""
    # Many aggregation functions are possible here (one could even be learned);
    # the plain mean is the first, classic choice.
    values = numpy.array(neighbour_h)
    return numpy.mean(values)

In [None]:
# Let's try our predictor !
# predict needs the word/author table as its second argument.
predictions = [[authorID,predict(authorID,correspondance)] for authorID in reds]
df_pred = pd.DataFrame(predictions,columns=["author","hindex"])
df_pred.to_csv(path+"for_submission.csv")