In [1]:
import itertools
import random
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path 
import os
import sys
from IPython.display import clear_output
import logging
import threading
import time

In [2]:
YEARS = [str(year) for year in range(1990,2019)]

## Computer science authors in France
collaboration_df = pd.read_csv('myDATA/00-collaboration_df.csv')
collaboration_df

Unnamed: 0,ID,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
12908,8958327900,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6508297663,0,0,0,0,0,0,0,0,0,...,4,7,7,8,8,8,8,8,8,8
103257,34571759000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,5,5,5
25815,7004267341,0,0,0,0,0,0,0,0,0,...,10,10,10,16,16,16,16,16,16,16
64536,8642393600,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12222,6507630481,0,0,0,0,0,0,0,0,0,...,18,18,18,18,18,29,29,29,29,29
193310,24577815500,0,0,0,0,0,0,0,0,0,...,4,4,4,4,6,13,16,16,16,70
36974,57195243976,0,0,0,0,0,0,0,0,0,...,0,3,3,3,3,3,3,3,8,8
12243,35328962100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,2,2,2,3


In [3]:
## retrieve the pubblication data(not collaborations)
file = '/home/leonardo/Desktop/PFE/Data/requested_data_on_authors.json'
d = json.load(open(Path(file)))

In [4]:
# Build a pandas dataframe from it
publications_df = pd.DataFrame.from_dict(d, orient='index')

In [5]:
# format it as callaboration_df
publications_df = publications_df.reset_index()
publications_df = publications_df.rename(columns={"index": "ID"})
publications_df['ID'] = publications_df['ID'].astype(int)

# keep only authors present in the collaborations dataframe
collab_IDs = collaboration_df["ID"]
publications_df = publications_df[publications_df.ID.isin(collab_IDs)]
publications_df

Unnamed: 0,ID,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
5,7003355588,2,2,2,1,4,0,5,5,0,...,7,4,4,15,11,7,11,9,8,6
29,56522848500,3,0,1,0,2,0,6,1,3,...,3,5,6,1,0,0,1,1,1,4
49,7004165433,5,1,1,2,10,5,6,2,6,...,4,3,11,7,6,10,6,3,3,4
79,6603870889,1,0,2,0,1,2,6,4,2,...,8,10,7,20,16,12,9,10,15,16
96,7005944861,10,10,3,7,8,8,4,15,9,...,9,8,12,10,20,19,17,12,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2207020,57200496797,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2207039,15137130100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2207045,57196721826,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2207048,57196401698,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
## Find the starting collaboration year for each author
def get_start_Y(df, id):
    aut = df.loc[df["ID"] == id]
    start_year = 0
    for year in aut.columns[1:]:
        n_colls = aut[year].values[0]
        if(n_colls != 0 and start_year==0):
            start_year = year
    return start_year

In [7]:
# Filter inactive authors: no holes of given lenght in publication history
def filter_inactives(df, hole_lenght): 
    
    i=0
    tot=len(df["ID"])
    
    for id in df["ID"]:
        aut = df.loc[df["ID"] == id]
        ID = aut["ID"].values[0]
        start_y = int(get_start_Y(df, id))
        active = True

        if(start_y == 0):
            active = False
        
        # slide the hole window 
        while(start_y + hole_lenght-1 < 2018):            
            c = 0
            for j in range(0, hole_lenght):
                if(aut[str(start_y + j)].values[0] == 0):
                    c += 1
            if(c>=hole_lenght-1):
                active = False
                
            start_y += 1
        
        if(active == False):
            df = df[df['ID'] != ID]

        clear_output(wait=True)
        
        print(i, "/", tot, "for hole_lenght", hole_lenght)
        i += 1       
        

    for i in df.index:
        df.loc[i].to_json()

    return df

In [8]:
hole_lenght = 28
min_hole_l = 0

In [9]:
## filter out all inactive authors from the collaboration datasset, for different hole lenght
tmp_pub_df = publications_df
while(hole_lenght>min_hole_l):
          
    tmp_pub_df = filter_inactives(tmp_pub_df, hole_lenght)
    active_IDs = tmp_pub_df["ID"]
    
    df = collaboration_df[collaboration_df.ID.isin(active_IDs)]
    df.to_csv('myDATA/05-filtered_by_hole_size/filtered_by_hole_size_'+str(hole_lenght-1)+'.csv', index=False)
    
    hole_lenght-=1

179788 / 179789 for hole_lenght 10


In [13]:
# Read active authors by hole lenght 

for size in range(0, 29):
    path = 'myDATA/05-filtered_by_hole_size/filtered_by_hole_size_'+str(size)+'.csv'
    if os.path.exists(path):
        df = pd.read_csv(path)
        print("for hole os size", size, "the number of authors is", len(df))

for hole os size 0 the number of authors is 11468
for hole os size 1 the number of authors is 36066
for hole os size 2 the number of authors is 62787
for hole os size 3 the number of authors is 86594
for hole os size 4 the number of authors is 106386
for hole os size 5 the number of authors is 123080
for hole os size 6 the number of authors is 137531
for hole os size 7 the number of authors is 149911
for hole os size 8 the number of authors is 160849
for hole os size 9 the number of authors is 170717
for hole os size 10 the number of authors is 179789
for hole os size 11 the number of authors is 187486
for hole os size 12 the number of authors is 194381
for hole os size 13 the number of authors is 200058
for hole os size 14 the number of authors is 204601
for hole os size 15 the number of authors is 208677
for hole os size 16 the number of authors is 212218
for hole os size 17 the number of authors is 215271
for hole os size 18 the number of authors is 218084
for hole os size 19 the nu