In [1]:
import itertools
import random
import json
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path 
import sys
pd.options.mode.chained_assignment = None  # default='warn'
import os
import logging
import threading
import time

In [2]:
# Extractive all authors from the "neighbors" and "collaborations" files
PATH = '/home/leonardo/Desktop/PFE/'

collaborative_authors1 = set()
collaborative_authors2 = set()

for year in range(1990,2019):

    collaborators1 = json.load(open(Path(PATH + 'Data/neighbours/neighbours_'+str(year)+'.json')))
    collaborators2 = json.load(open(Path(PATH + 'Data/collaborations/collaborations_'+str(year)+'.json')))
    
    for coll in collaborators1.keys():
        collaborative_authors1.add(coll)
    
    for coll in collaborators2.keys():
        collaborative_authors2.add(coll)

print("num of authors in 'neighbours jsons' are:", len(collaborative_authors1))
print("num of authors in 'collaborations jsons' are:", len(collaborative_authors2))

num of authors in 'neighbours jsons' are: 258145
num of authors in 'collaborations jsons' are: 258145


In [3]:
# Authors IDs taken from neighbors/collaborations json 
authors = pd.DataFrame(data=collaborative_authors1, columns= ["ID"])
authors

Unnamed: 0,ID
0,55489688400
1,24512586400
2,7003998251
3,57197746565
4,24773497300
...,...
258140,55486695200
258141,6603493676
258142,55479743900
258143,23092317800


In [5]:
# function that extract the collaboration row for the previous dataframe of authors IDs
 # runned by threading because of the big amount of data
def df_builder(thread_number, authors):
    i = 0
    for author in authors["ID"]:

        # Store one occurence of each collaborator's ID 
        all_collaborators = set()
        all_collaborators.add(int(author))

        row = authors.loc[authors['ID'] == author]

        for year in range(1990,2019):

            collaborators = json.load(open(Path(PATH + 'Data/neighbours/neighbours_'+str(year)+'.json')))

            if(author in collaborators.keys()):

                for coll in collaborators[author]:
                    if(coll not in all_collaborators):
                        all_collaborators.add(coll)

            row[year] = len(all_collaborators)-1    

        i+=1
        if(i%1000 == 0):
            print ("thread", thread_number, "hase done " + str(i) + " authors on " + str(len(authors)))

        row.to_csv('myDATA/00-collaboration_df.csv', mode='a', header=False)
    print("thread", thread_number, "has finished")

In [6]:
## write to 'myDATA/collaboration_df.csv' the dataset of cumulative collaboration over years for each author

YEARS = [str(year) for year in range(1990,2019)]
YEARS.insert(0,"ID")

### Uncomment to create the dataset from beginning... 
#if os.path.exists(PATH + 'myDATA/collaboration_df.csv'):
#    os.remove(PATH + 'myDATA/collaboration_df.csv')
#df = pd.DataFrame(columns=YEARS)
#df.to_csv('myDATA/collaboration_df.csv', mode='a', header=True, index=False)

### ...or uncomment this to add missing authors to the collaboration dataset
collaboration_df = pd.read_csv('myDATA/00-collaboration_df.csv')
done_IDs = collaboration_df["ID"].map(str)
authors = authors[~authors.ID.isin(done_IDs)]

# Generate around 20 thread to build the collaboration csv
n=1
thread_n = 1
while(n<len(authors)):
    print("starting thread", thread_n)
    thread_n +=1
    threading.Thread(target=df_builder, args=(thread_n , authors.iloc[n:len(authors)//20+n])).start()
    n += len(authors)//20

starting thread 1
starting thread 2
starting thread 3
starting thread 4
starting thread 5
starting thread 6
starting thread 7
starting thread 8
starting thread 9
starting thread 10
starting thread 11
starting thread 12
starting thread 13
starting thread 14
starting thread 15
starting thread 16
starting thread 17
starting thread 18
starting thread 19
starting thread 20
starting thread 21
thread 22 has finished
thread 11 has finished
thread 13 has finished
thread 5 has finished
thread 19 has finished
thread 9 has finished
thread 7 has finished
thread 10 has finished
thread 21 has finished
thread 8 has finished
thread 3 has finished
thread 17 has finished
thread 18 has finished
thread 20 has finished
thread 6 has finished
thread 12 has finished
thread 15 has finished
thread 14 has finished
thread 16 has finished
thread 2 has finished
thread 4 has finished


In [9]:
## Computer science authors in France
collaboration_df = pd.read_csv('myDATA/00-collaboration_df.csv')
collaboration_df

Unnamed: 0,ID,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
12908,8958327900,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6508297663,0,0,0,0,0,0,0,0,0,...,4,7,7,8,8,8,8,8,8,8
103257,34571759000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,5,5,5
25815,7004267341,0,0,0,0,0,0,0,0,0,...,10,10,10,16,16,16,16,16,16,16
64536,8642393600,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221372,26421828900,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
13806,25121758300,0,0,0,0,0,0,0,0,0,...,0,38,38,38,38,38,38,38,38,38
246610,7004602302,0,0,0,0,0,0,0,0,0,...,3,3,3,3,3,3,3,3,3,3
182933,55496081000,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
